diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 4bc53f86fc08..95e897820fec 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -19,10 +19,12 @@ We do encourage everyone to work anything they are interested in. - [Yizhi Liu](https://github.com/yzhliu) (PMC): @yzhliu - jvm, topi, relay - [Masahiro Masuda](https://github.com/masahi): @masahi - topi, relay - [Thierry Moreau](https://github.com/tmoreau89) (PMC): @tmoreau89 - vta +- [Jared Roesch](https://github.com/jroesch): @jroesch - relay - [Siva](https://github.com/srkreddy1238): @srkreddy1238 - frontends, golang - [Haichen Shen](https://github.com/icemelon9) (PMC): @icemelon9 - relay, topi - [Zhixun Tan](https://github.com/phisiart): @phisiart - opengl, web - [Leyuan Wang](https://github.com/Laurawly): @Laurawly: - topi +- [Yao Wang](https://github.com/kevinthesun): @kevinthesun - topi, vision - [Eddie Yan](https://github.com/eqy): @eqy - runtime, autotvm, rpc, topi - [Lianmin Zheng](https://github.com/merrymercy) (PMC): @merrymercy - autotvm, topi, relay @@ -32,6 +34,7 @@ We do encourage everyone to work anything they are interested in. - [Tianqi Chen](https://github.com/tqchen): @tqchen - [Liangfu Chen](https://github.com/liangfu): @liangfu - [Zhi Chen](https://github.com/zhiics): @zhiics +- [Sergei Grechanik](https://github.com/sgrechanik-h): @sgrechanik-h - [Nick Hynes](https://github.com/nhynes): @nhynes - [Yuwei Hu](https://github.com/Huyuwei): @Huyuwei - [Yizhi Liu](https://github.com/yzhliu) : @yzhliu diff --git a/apps/extension/Makefile b/apps/extension/Makefile index 3a1f8a2160ee..41e9bf621cb6 100644 --- a/apps/extension/Makefile +++ b/apps/extension/Makefile @@ -6,7 +6,7 @@ PKG_CFLAGS = -std=c++11 -O2 -fPIC\ -I${TVM_ROOT}/3rdparty/dlpack/include\ -I${TVM_ROOT}/3rdparty/HalideIR/src -PKG_LDFLAGS =-L${TVM_ROOT}/lib +PKG_LDFLAGS =-L${TVM_ROOT}/build UNAME_S := $(shell uname -s) ifeq ($(UNAME_S), Darwin) diff --git a/apps/extension/python/tvm_ext/__init__.py b/apps/extension/python/tvm_ext/__init__.py index 25286f67b4f5..78b407ae9aa1 100644 --- a/apps/extension/python/tvm_ext/__init__.py +++ b/apps/extension/python/tvm_ext/__init__.py @@ -31,7 +31,7 @@ def __init__(self, handle): def __del__(self): # You can also call your own customized # deleter if you can free it via your own FFI. - tvm.nd.free_extension_handle(self.handle, 17) + tvm.nd.free_extension_handle(self.handle, self.__class__._tvm_tcode) @property def _tvm_handle(self): @@ -42,3 +42,30 @@ def __getitem__(self, idx): # Register IntVec extension on python side. tvm.register_extension(IntVec, IntVec) + + +nd_create = tvm.get_global_func("tvm_ext.nd_create") +nd_add_two = tvm.get_global_func("tvm_ext.nd_add_two") +nd_get_additional_info = tvm.get_global_func("tvm_ext.nd_get_additional_info") + +class NDSubClass(tvm.nd.NDArrayBase): + """Example for subclassing TVM's NDArray infrastructure. + + By inheriting TVM's NDArray, external libraries can + leverage TVM's FFI without any modification.
+ """ + # Should be consistent with the type-trait set in the backend + _array_type_code = 1 + + @staticmethod + def create(addtional_info): + return nd_create(addtional_info) + + @property + def addtional_info(self): + return nd_get_addtional_info(self) + + def __add__(self, other): + return nd_add_two(self, other) + +tvm.register_extension(NDSubClass, NDSubClass) diff --git a/apps/extension/src/tvm_ext.cc b/apps/extension/src/tvm_ext.cc index 362ac62dea3d..97e0ada25a2e 100644 --- a/apps/extension/src/tvm_ext.cc +++ b/apps/extension/src/tvm_ext.cc @@ -7,24 +7,87 @@ #include #include #include +#include #include +#include namespace tvm_ext { using IntVector = std::vector; +class NDSubClass; } // namespace tvm_ext namespace tvm { namespace runtime { template<> -struct extension_class_info { +struct extension_type_info { static const int code = 17; }; +template<> +struct array_type_info { + static const int code = 1; +}; } // namespace tvm } // namespace runtime using namespace tvm; using namespace tvm::runtime; +namespace tvm_ext { +/*! + * \brief A subclass of TVM's NDArray. + * + * To use this extension, an external library should + * + * 1) Inherit TVM's NDArray and NDArray container, + * and define the trait `array_type_info` for this class. + * + * 2) Define a constructor in the inherited class that accepts + * a pointer to TVM's Container, which is nullable. + * + * 3) On Python frontend, inherit `tvm.nd.NDArrayBase`, + * define the class attribute `_array_type_code` consistent to + * the C++ type trait, and register the subclass using `tvm.register_extension`. + */ +class NDSubClass : public tvm::runtime::NDArray { + public: + class SubContainer : public NDArray::Container { + public: + SubContainer(int addtional_info) : + addtional_info_(addtional_info) { + array_type_code_ = array_type_info::code; + } + static bool Is(NDArray::Container *container) { + SubContainer *c = static_cast(container); + return c->array_type_code_ == array_type_info::code; + } + int addtional_info_{0}; + }; + NDSubClass(NDArray::Container *container) { + if (container == nullptr) { + data_ = nullptr; + return; + } + CHECK(SubContainer::Is(container)); + container->IncRef(); + data_ = container; + } + ~NDSubClass() { + this->reset(); + } + NDSubClass AddWith(const NDSubClass &other) const { + SubContainer *a = static_cast(data_); + SubContainer *b = static_cast(other.data_); + CHECK(a != nullptr && b != nullptr); + return NDSubClass(new SubContainer(a->addtional_info_ + b->addtional_info_)); + } + int get_additional_info() const { + SubContainer *self = static_cast(data_); + CHECK(self != nullptr); + return self->addtional_info_; + } +}; +} // namespace tvm_ext + namespace tvm_ext { TVM_REGISTER_EXT_TYPE(IntVector); @@ -64,6 +127,26 @@ TVM_REGISTER_GLOBAL("device_api.ext_dev") .set_body([](TVMArgs args, TVMRetValue *rv) { *rv = (*tvm::runtime::Registry::Get("device_api.cpu"))(); }); + +TVM_REGISTER_GLOBAL("tvm_ext.nd_create") +.set_body([](TVMArgs args, TVMRetValue *rv) { + int addtional_info = args[0]; + *rv = NDSubClass(new NDSubClass::SubContainer(addtional_info)); +}); + +TVM_REGISTER_GLOBAL("tvm_ext.nd_add_two") +.set_body([](TVMArgs args, TVMRetValue *rv) { + NDSubClass a = args[0]; + NDSubClass b = args[1]; + *rv = a.AddWith(b); +}); + +TVM_REGISTER_GLOBAL("tvm_ext.nd_get_addtional_info") +.set_body([](TVMArgs args, TVMRetValue *rv) { + NDSubClass a = args[0]; + *rv = a.get_additional_info(); +}); + } // namespace tvm_ext // External function exposed to runtime. 
diff --git a/apps/extension/tests/test_ext.py b/apps/extension/tests/test_ext.py index def30803135e..a6246d6be2e1 100644 --- a/apps/extension/tests/test_ext.py +++ b/apps/extension/tests/test_ext.py @@ -32,6 +32,7 @@ def test_sym_add(): c = tvm_ext.sym_add(a, b) assert c.a == a and c.b == b + def test_ext_vec(): ivec = tvm_ext.ivec_create(1, 2, 3) assert(isinstance(ivec, tvm_ext.IntVec)) @@ -44,6 +45,7 @@ def ivec_cb(v2): tvm.convert(ivec_cb)(ivec) + def test_extract_ext(): fdict = tvm.extract_ext_funcs(tvm_ext._LIB.TVMExtDeclare) assert fdict["mul"](3, 4) == 12 @@ -68,7 +70,21 @@ def check_llvm(): check_llvm() +def test_nd_subclass(): + a = tvm_ext.NDSubClass.create(additional_info=3) + b = tvm_ext.NDSubClass.create(additional_info=5) + c = a + b + d = a + a + e = b + b + assert(a.additional_info == 3) + assert(b.additional_info == 5) + assert(c.additional_info == 8) + assert(d.additional_info == 6) + assert(e.additional_info == 10) + + if __name__ == "__main__": + test_nd_subclass() test_extern_call() test_ext_dev() test_ext_vec() diff --git a/cmake/util/FindLLVM.cmake b/cmake/util/FindLLVM.cmake index 8497761a7116..f2ee945207b6 100644 --- a/cmake/util/FindLLVM.cmake +++ b/cmake/util/FindLLVM.cmake @@ -37,8 +37,9 @@ macro(find_llvm use_llvm) execute_process(COMMAND ${LLVM_CONFIG} --cxxflags OUTPUT_VARIABLE __llvm_cxxflags) execute_process(COMMAND ${LLVM_CONFIG} --version - COMMAND cut -b 1,3 - OUTPUT_VARIABLE TVM_LLVM_VERSION) + OUTPUT_VARIABLE __llvm_version) + # llvm version: keep the major and minor digits, e.g. "6.0.1" -> "60" + string(REGEX REPLACE "^([^.]+)\.([^.]+)\.[^.]+.*$" "\\1\\2" TVM_LLVM_VERSION ${__llvm_version}) # definitions string(REGEX MATCHALL "(^| )-D[A-Za-z0-9_]*" LLVM_DEFINITIONS ${__llvm_cxxflags}) # include dir diff --git a/conda/cross-linux.cmake b/conda/cross-linux.cmake new file mode 100644 index 000000000000..bb837eea5ba7 --- /dev/null +++ b/conda/cross-linux.cmake @@ -0,0 +1,20 @@ +# this one is important +set(CMAKE_SYSTEM_NAME Linux) +set(CMAKE_PLATFORM Linux) +# this one not so much +set(CMAKE_SYSTEM_VERSION 1) + +# specify the cross compiler +set(CMAKE_C_COMPILER $ENV{CC}) + +# where is the target environment +set(CMAKE_FIND_ROOT_PATH $ENV{PREFIX} $ENV{BUILD_PREFIX}/$ENV{HOST}/sysroot) + +# search for programs in the build host directories +set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) +# for libraries and headers in the target directories +set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) +set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) + +# hack: CMake cannot run the char-signedness test program when cross-compiling, so preset its result: +set(__CHAR_UNSIGNED___EXITCODE 1) diff --git a/conda/nnvm/meta.yaml b/conda/nnvm/meta.yaml index 7162fdc8391f..bae06740fd0b 100644 --- a/conda/nnvm/meta.yaml +++ b/conda/nnvm/meta.yaml @@ -1,4 +1,4 @@ -{% set version = "0.5.dev" %} +{% set version = "0.6.dev" %} package: name: nnvm @@ -8,7 +8,7 @@ source: path: ../.. build: - number: 1 + number: 0 skip: True # [win] requirements: diff --git a/conda/topi/meta.yaml b/conda/topi/meta.yaml index 22a1f3579868..f13c95ac4032 100644 --- a/conda/topi/meta.yaml +++ b/conda/topi/meta.yaml @@ -1,4 +1,4 @@ -{% set version = "0.5.dev" %} +{% set version = "0.6.dev" %} package: name: topi @@ -8,7 +8,7 @@ source: path: ../..
build: - number: 1 + number: 0 requirements: host: diff --git a/conda/tvm-libs/build.sh b/conda/tvm-libs/build.sh index 1ea99fb3dbc6..772838e63ac6 100644 --- a/conda/tvm-libs/build.sh +++ b/conda/tvm-libs/build.sh @@ -1,5 +1,9 @@ #!/bin/bash +# Fix for OSX build to hide the clang LLVM +rm -f ${BUILD_PREFIX}/bin/llvm-config +rm -rf ${BUILD_PREFIX}/lib/cmake + set -e if [ -z "$PREFIX" ]; then @@ -9,13 +13,29 @@ fi if [ -z "$cuda" ] || [ "$cuda" == "False" ]; then CUDA_OPT="" else - CUDA_OPT="-DUSE_CUDA=ON" + CUDA_OPT="-DUSE_CUDA=ON -DUSE_CUBLAS=ON" +fi + +if [ "$target_platform" == "osx-64" ]; then + # macOS 64 bits + METAL_OPT="" # Conda can only target 10.9 for now + TOOLCHAIN_OPT="" +else + METAL_OPT="" + if [ "$target_platform" == "linux-64" ]; then + # Linux 64 bits + TOOLCHAIN_OPT="-DCMAKE_TOOLCHAIN_FILE=${RECIPE_DIR}/../cross-linux.cmake" + else + # Windows (or 32 bits, which we don't support) + METAL_OPT="" + TOOLCHAIN_OPT="" + fi fi rm -rf build || true mkdir -p build cd build -cmake $CUDA_OPT -DUSE_LLVM=ON -DINSTALL_DEV=ON -DCMAKE_INSTALL_PREFIX="$PREFIX" .. -make -j4 VERBOSE=1 +cmake $METAL_OPT $CUDA_OPT -DUSE_LLVM=ON -DINSTALL_DEV=ON -DCMAKE_INSTALL_PREFIX="$PREFIX" $TOOLCHAIN_OPT .. +make -j${CPU_COUNT} VERBOSE=1 make install cd .. diff --git a/conda/tvm-libs/meta.yaml b/conda/tvm-libs/meta.yaml index 6a2f0ff75f38..fcb8f22cad25 100644 --- a/conda/tvm-libs/meta.yaml +++ b/conda/tvm-libs/meta.yaml @@ -1,4 +1,4 @@ -{% set version = "0.5.dev" %} +{% set version = "0.6.dev" %} package: name: tvm-libs @@ -8,21 +8,17 @@ source: path: ../.. build: - number: 1 + number: 0 string: cuda{{ cuda_version }}_{{ PKG_BUILDNUM }} # [cuda] requirements: build: - - {{ compiler('cxx') }} # [linux] - - llvmdev ==6.0.0 # [osx] - host: # The OS X build will require some manual setup or it will break - # See https://conda.io/docs/user-guide/tasks/build-packages/compiler-tools.html#macos-sdk - # It is also ass-backward because of llvm brokeness when mixed with the - # conda OS X compiler - - {{ compiler('cxx') }} # [osx] + # See https://docs.conda.io/projects/conda-build/en/latest/source/resources/compiler-tools.html#macos-sdk + - {{ compiler('cxx') }} + host: - cmake - - llvmdev ==6.0.0 # [linux] + - llvmdev ==6.0.0 - zlib # [linux] run: - {{ pin_compatible('cudatoolkit', lower_bound=cuda_version, max_pin='x.x') }} # [cuda] diff --git a/conda/tvm/meta.yaml b/conda/tvm/meta.yaml index b4b93471821a..37adf5b4fe2e 100644 --- a/conda/tvm/meta.yaml +++ b/conda/tvm/meta.yaml @@ -1,4 +1,4 @@ -{% set version = "0.5.dev" %} +{% set version = "0.6.dev" %} package: name: tvm @@ -8,7 +8,7 @@ source: path: ../.. 
build: - number: 1 + number: 0 requirements: build: diff --git a/docker/Dockerfile.ci_gpu b/docker/Dockerfile.ci_gpu index 6a599b1e3917..a83d7000d0fe 100644 --- a/docker/Dockerfile.ci_gpu +++ b/docker/Dockerfile.ci_gpu @@ -24,7 +24,7 @@ COPY install/ubuntu_install_sphinx.sh /install/ubuntu_install_sphinx.sh RUN bash /install/ubuntu_install_sphinx.sh # Fix recommonmark to latest version -RUN git clone https://github.com/rtfd/recommonmark +RUN git clone --depth=1 https://github.com/rtfd/recommonmark RUN cd recommonmark; python3 setup.py install # Enable doxygen for c++ doc build diff --git a/docker/Dockerfile.ci_lint b/docker/Dockerfile.ci_lint index 132e8ebb7df9..0d7b4a410033 100644 --- a/docker/Dockerfile.ci_lint +++ b/docker/Dockerfile.ci_lint @@ -6,4 +6,4 @@ RUN apt-get update && apt-get install -y sudo wget COPY install/ubuntu_install_python.sh /install/ubuntu_install_python.sh RUN bash /install/ubuntu_install_python.sh RUN apt-get install -y doxygen graphviz -RUN pip3 install cpplint pylint mypy +RUN pip3 install cpplint pylint==1.9.4 mypy diff --git a/docker/Dockerfile.demo_opencl b/docker/Dockerfile.demo_opencl index 460b901bf08f..2d0b45983902 100644 --- a/docker/Dockerfile.demo_opencl +++ b/docker/Dockerfile.demo_opencl @@ -45,7 +45,7 @@ RUN echo "Cloning TVM source & submodules" ENV TVM_PAR_DIR="/usr" RUN mkdir -p TVM_PAR_DIR && \ cd ${TVM_PAR_DIR} && \ - git clone https://github.com/dmlc/tvm --recursive + git clone --depth=1 https://github.com/dmlc/tvm --recursive #RUN git submodule update --init --recursive diff --git a/docker/install/install_tvm_cpu.sh b/docker/install/install_tvm_cpu.sh old mode 100644 new mode 100755 index 461ad244d37c..04153559d27e --- a/docker/install/install_tvm_cpu.sh +++ b/docker/install/install_tvm_cpu.sh @@ -1,5 +1,11 @@ +#!/bin/bash + +set -e +set -u +set -o pipefail + cd /usr -git clone https://github.com/dmlc/tvm --recursive +git clone --depth=1 https://github.com/dmlc/tvm --recursive cd /usr/tvm echo set\(USE_LLVM llvm-config-6.0\) >> config.cmake echo set\(USE_RPC ON\) >> config.cmake diff --git a/docker/install/install_tvm_gpu.sh b/docker/install/install_tvm_gpu.sh old mode 100644 new mode 100755 index 8a1324646fd5..d31e10ce9ab9 --- a/docker/install/install_tvm_gpu.sh +++ b/docker/install/install_tvm_gpu.sh @@ -1,5 +1,11 @@ +#!/bin/bash + +set -e +set -u +set -o pipefail + cd /usr -git clone https://github.com/dmlc/tvm --recursive +git clone --depth=1 https://github.com/dmlc/tvm --recursive cd /usr/tvm echo set\(USE_LLVM llvm-config-6.0\) >> config.cmake echo set\(USE_CUDA ON\) >> config.cmake diff --git a/docker/install/ubuntu_install_androidsdk.sh b/docker/install/ubuntu_install_androidsdk.sh old mode 100644 new mode 100755 index a5c02e573b43..96fdbe168d6d --- a/docker/install/ubuntu_install_androidsdk.sh +++ b/docker/install/ubuntu_install_androidsdk.sh @@ -1,13 +1,16 @@ +#!/bin/bash + . 
/etc/profile set -o errexit -o nounset +set -o pipefail ANDROID_HOME=/opt/android-sdk-linux ASDKTOOLS_HOME=/opt/android-sdk-tools ASDKTOOLS_VERSION=3859397 ASDKTOOLS_SHA256=444e22ce8ca0f67353bda4b85175ed3731cae3ffa695ca18119cbacef1c1bea0 -wget http://dl.google.com/android/repository/sdk-tools-linux-${ASDKTOOLS_VERSION}.zip -O sdk-tools-linux.zip +wget -q http://dl.google.com/android/repository/sdk-tools-linux-${ASDKTOOLS_VERSION}.zip -O sdk-tools-linux.zip echo "${ASDKTOOLS_SHA256} *sdk-tools-linux.zip" | sha256sum --check - unzip sdk-tools-linux.zip rm sdk-tools-linux.zip @@ -58,7 +61,7 @@ EOF mkdir /root/.android 2>/dev/null || true touch /root/.android/repositories.cfg -yes | sdkmanager --licenses --sdk_root="$ANDROID_HOME" +(yes || true) | sdkmanager --licenses --sdk_root="$ANDROID_HOME" sdkmanager --verbose --package_file=/install/package-list-minimal.txt --sdk_root="$ANDROID_HOME" test -d "${ANDROID_HOME}/build-tools/27.0.3" test -d "${ANDROID_HOME}/ndk-bundle" diff --git a/docker/install/ubuntu_install_antlr.sh b/docker/install/ubuntu_install_antlr.sh old mode 100644 new mode 100755 index d2f2d6a8c48f..6dae3ae12d56 --- a/docker/install/ubuntu_install_antlr.sh +++ b/docker/install/ubuntu_install_antlr.sh @@ -1,3 +1,9 @@ +#!/bin/bash + +set -e +set -u +set -o pipefail + cd /usr/local/lib -wget https://www.antlr.org/download/antlr-4.7.1-complete.jar +wget -q https://www.antlr.org/download/antlr-4.7.1-complete.jar cd - diff --git a/docker/install/ubuntu_install_caffe2.sh b/docker/install/ubuntu_install_caffe2.sh old mode 100644 new mode 100755 index 5fe827927e87..bb9322704918 --- a/docker/install/ubuntu_install_caffe2.sh +++ b/docker/install/ubuntu_install_caffe2.sh @@ -1,3 +1,9 @@ +#!/bin/bash + +set -e +set -u +set -o pipefail + python3 -m caffe2.python.models.download -i -f squeezenet python3 -m caffe2.python.models.download -i -f resnet50 python3 -m caffe2.python.models.download -i -f vgg19 diff --git a/docker/install/ubuntu_install_core.sh b/docker/install/ubuntu_install_core.sh old mode 100644 new mode 100755 index efc69c946b97..c7e2918971fd --- a/docker/install/ubuntu_install_core.sh +++ b/docker/install/ubuntu_install_core.sh @@ -1,5 +1,11 @@ +#!/bin/bash + +set -e +set -u +set -o pipefail + # install libraries for building c++ core on ubuntu -apt-get update && apt-get install -y --no-install-recommends --force-yes \ +apt-get update && apt-get install -y --no-install-recommends \ git make libgtest-dev cmake wget unzip libtinfo-dev libz-dev\ libcurl4-openssl-dev libopenblas-dev g++ sudo diff --git a/docker/install/ubuntu_install_coreml.sh b/docker/install/ubuntu_install_coreml.sh old mode 100644 new mode 100755 index 4b0fd126c61d..51afc1423961 --- a/docker/install/ubuntu_install_coreml.sh +++ b/docker/install/ubuntu_install_coreml.sh @@ -1 +1,7 @@ +#!/bin/bash + +set -e +set -u +set -o pipefail + pip3 install coremltools diff --git a/docker/install/ubuntu_install_darknet.sh b/docker/install/ubuntu_install_darknet.sh old mode 100644 new mode 100755 index f5e0c2791d80..5c350b848bf7 --- a/docker/install/ubuntu_install_darknet.sh +++ b/docker/install/ubuntu_install_darknet.sh @@ -1,4 +1,10 @@ +#!/bin/bash + +set -e +set -u +set -o pipefail + #install the necessary dependancies, cffi, opencv -wget 'https://github.com/siju-samuel/darknet/blob/master/lib/libdarknet.so?raw=true' -O libdarknet.so +wget -q 'https://github.com/siju-samuel/darknet/blob/master/lib/libdarknet.so?raw=true' -O libdarknet.so pip2 install opencv-python cffi pip3 install opencv-python cffi diff --git 
a/docker/install/ubuntu_install_emscripten.sh b/docker/install/ubuntu_install_emscripten.sh old mode 100644 new mode 100755 index 31470bb69de9..4671c898438a --- a/docker/install/ubuntu_install_emscripten.sh +++ b/docker/install/ubuntu_install_emscripten.sh @@ -1,11 +1,17 @@ +#!/bin/bash + +set -e +set -u +set -o pipefail + alias make="make -j4" # Get latest cmake -wget https://cmake.org/files/v3.8/cmake-3.8.2-Linux-x86_64.tar.gz +wget -q https://cmake.org/files/v3.8/cmake-3.8.2-Linux-x86_64.tar.gz tar xf cmake-3.8.2-Linux-x86_64.tar.gz export PATH=/cmake-3.8.2-Linux-x86_64/bin/:${PATH} -wget https://s3.amazonaws.com/mozilla-games/emscripten/releases/emsdk-portable.tar.gz +wget -q https://s3.amazonaws.com/mozilla-games/emscripten/releases/emsdk-portable.tar.gz tar xf emsdk-portable.tar.gz cd emsdk-portable ./emsdk update diff --git a/docker/install/ubuntu_install_gluoncv.sh b/docker/install/ubuntu_install_gluoncv.sh old mode 100644 new mode 100755 index 0ca1a34cbc24..adfbdce7c7b1 --- a/docker/install/ubuntu_install_gluoncv.sh +++ b/docker/install/ubuntu_install_gluoncv.sh @@ -1 +1,7 @@ +#!/bin/bash + +set -e +set -u +set -o pipefail + pip3 install gluoncv diff --git a/docker/install/ubuntu_install_golang.sh b/docker/install/ubuntu_install_golang.sh old mode 100644 new mode 100755 index 2361ccfbd2e4..c29e764cbb3a --- a/docker/install/ubuntu_install_golang.sh +++ b/docker/install/ubuntu_install_golang.sh @@ -1,4 +1,11 @@ +#!/bin/bash + +set -e +set -u +set -o pipefail + #install the necessary dependancies for golang build -apt-get update && apt-get install -y golang-1.10-go -apt-get update && apt-get install -y golang-1.10-doc -apt-get update && apt-get install -y golint +apt-get update +apt-get install -y golang-1.10-go +apt-get install -y golang-1.10-doc +apt-get install -y golint diff --git a/docker/install/ubuntu_install_gradle.sh b/docker/install/ubuntu_install_gradle.sh old mode 100644 new mode 100755 index b1535c98cabb..7f62406ca710 --- a/docker/install/ubuntu_install_gradle.sh +++ b/docker/install/ubuntu_install_gradle.sh @@ -1,13 +1,16 @@ +#!/bin/bash + . 
/etc/profile set -o errexit -o nounset +set -o pipefail GRADLE_HOME=/opt/gradle GRADLE_VERSION=4.10-rc-2 GRADLE_SHA256=e90d3c32910e259814bcca82b3911172ecca1ff1ab5ed69b4de3c1df8b378b40 echo "Downloading Gradle" -wget --output-document=gradle.zip "https://services.gradle.org/distributions/gradle-${GRADLE_VERSION}-bin.zip" +wget -q --output-document=gradle.zip "https://services.gradle.org/distributions/gradle-${GRADLE_VERSION}-bin.zip" echo "Checking Gradle hash" echo "${GRADLE_SHA256} *gradle.zip" | sha256sum --check - echo "Installing Gradle" diff --git a/docker/install/ubuntu_install_iverilog.sh b/docker/install/ubuntu_install_iverilog.sh old mode 100644 new mode 100755 index bf7a0001dc70..2304f697affd --- a/docker/install/ubuntu_install_iverilog.sh +++ b/docker/install/ubuntu_install_iverilog.sh @@ -1,5 +1,11 @@ -apt-get install -y --no-install-recommends --force-yes make bison flex -wget ftp://icarus.com/pub/eda/verilog/v10/verilog-10.1.tar.gz +#!/bin/bash + +set -e +set -u +set -o pipefail + +apt-get install -y --no-install-recommends make bison flex +wget -q ftp://icarus.com/pub/eda/verilog/v10/verilog-10.1.tar.gz tar xf verilog-10.1.tar.gz cd verilog-10.1 ./configure --prefix=/usr diff --git a/docker/install/ubuntu_install_java.sh b/docker/install/ubuntu_install_java.sh old mode 100644 new mode 100755 index 462edc491627..e1f431bee845 --- a/docker/install/ubuntu_install_java.sh +++ b/docker/install/ubuntu_install_java.sh @@ -1,4 +1,8 @@ +#!/bin/bash + set -o errexit -o nounset +set -o pipefail + apt-get update && apt-get install -y openjdk-8-jdk maven test -d "/usr/lib/jvm/java-8-openjdk-amd64/jre" echo "export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/jre" >> /etc/profile diff --git a/docker/install/ubuntu_install_keras.sh b/docker/install/ubuntu_install_keras.sh old mode 100644 new mode 100755 index 33bc38c80972..b689949d0dff --- a/docker/install/ubuntu_install_keras.sh +++ b/docker/install/ubuntu_install_keras.sh @@ -1,2 +1,8 @@ +#!/bin/bash + +set -e +set -u +set -o pipefail + pip2 install keras tensorflow h5py pip3 install keras tensorflow h5py diff --git a/docker/install/ubuntu_install_llvm.sh b/docker/install/ubuntu_install_llvm.sh old mode 100644 new mode 100755 index 16d0fe150b7e..a562c3258628 --- a/docker/install/ubuntu_install_llvm.sh +++ b/docker/install/ubuntu_install_llvm.sh @@ -1,3 +1,9 @@ +#!/bin/bash + +set -e +set -u +set -o pipefail + echo deb http://apt.llvm.org/xenial/ llvm-toolchain-xenial-4.0 main\ >> /etc/apt/sources.list.d/llvm.list echo deb-src http://apt.llvm.org/xenial/ llvm-toolchain-xenial-4.0 main\ @@ -18,5 +24,5 @@ echo deb http://apt.llvm.org/xenial/ llvm-toolchain-xenial main\ echo deb-src http://apt.llvm.org/xenial/ llvm-toolchain-xenial main\ >> /etc/apt/sources.list.d/llvm.list -wget -O - http://apt.llvm.org/llvm-snapshot.gpg.key|sudo apt-key add - -apt-get update && apt-get install -y --force-yes llvm-4.0 llvm-5.0 llvm-6.0 clang-6.0 +wget -q -O - http://apt.llvm.org/llvm-snapshot.gpg.key|sudo apt-key add - +apt-get update && apt-get install -y llvm-4.0 llvm-5.0 llvm-6.0 clang-6.0 diff --git a/docker/install/ubuntu_install_mxnet.sh b/docker/install/ubuntu_install_mxnet.sh old mode 100644 new mode 100755 index 0e7e9e3939a8..a15dca7def07 --- a/docker/install/ubuntu_install_mxnet.sh +++ b/docker/install/ubuntu_install_mxnet.sh @@ -1 +1,7 @@ +#!/bin/bash + +set -e +set -u +set -o pipefail + pip3 install mxnet diff --git a/docker/install/ubuntu_install_nnpack.sh b/docker/install/ubuntu_install_nnpack.sh old mode 100644 new mode 100755 index 
83225d4aa820..1cf044a9b257 --- a/docker/install/ubuntu_install_nnpack.sh +++ b/docker/install/ubuntu_install_nnpack.sh @@ -1,11 +1,15 @@ -apt-get update && apt-get install -y --no-install-recommends --force-yes git cmake +#!/bin/bash +set -e +set -u +set -o pipefail + +apt-get update && apt-get install -y --no-install-recommends git cmake -git clone https://github.com/Maratyszcza/NNPACK NNPACK -cd NNPACK # TODO: specific tag? -git checkout 1e005b0c2 -cd - +# git clone --branch only accepts branch or tag names, not commit SHAs, so pin the commit via checkout +git clone https://github.com/Maratyszcza/NNPACK NNPACK +git -C NNPACK checkout 1e005b0c2 mkdir -p NNPACK/build cd NNPACK/build diff --git a/docker/install/ubuntu_install_nodejs.sh b/docker/install/ubuntu_install_nodejs.sh old mode 100644 new mode 100755 index 75d367dfa98f..dfdd0432e4db --- a/docker/install/ubuntu_install_nodejs.sh +++ b/docker/install/ubuntu_install_nodejs.sh @@ -1,4 +1,16 @@ -apt-get update && apt-get install -y curl -curl -sL https://deb.nodesource.com/setup_6.x | bash - -apt-get update && apt-get install -y nodejs +#!/bin/bash + +set -e +set -u +set -o pipefail + +apt-get update +apt-get install -y curl + +# The node install script fetched and executed here will update the +# apt source list, hence the second apt-get update is necessary. +curl -s -S -L https://deb.nodesource.com/setup_6.x | bash - +apt-get update +apt-get install -y nodejs + npm install eslint jsdoc ws diff --git a/docker/install/ubuntu_install_onnx.sh b/docker/install/ubuntu_install_onnx.sh old mode 100644 new mode 100755 index 517ea77ab81e..2778a2489667 --- a/docker/install/ubuntu_install_onnx.sh +++ b/docker/install/ubuntu_install_onnx.sh @@ -1,3 +1,9 @@ +#!/bin/bash + +set -e +set -u +set -o pipefail + # fix to certain version for now pip2 install onnx>=1.1.0 pip3 install onnx>=1.1.0 diff --git a/docker/install/ubuntu_install_opencl.sh b/docker/install/ubuntu_install_opencl.sh old mode 100644 new mode 100755 index ca4d1d04fd5c..f16de615c4b1 --- a/docker/install/ubuntu_install_opencl.sh +++ b/docker/install/ubuntu_install_opencl.sh @@ -1,5 +1,11 @@ +#!/bin/bash + +set -e +set -u +set -o pipefail + # Install OpenCL runtime in nvidia docker.
-apt-get update && apt-get install -y --no-install-recommends --force-yes \ +apt-get update && apt-get install -y --no-install-recommends \ ocl-icd-opencl-dev \ clinfo && \ rm -rf /var/lib/apt/lists/* diff --git a/docker/install/ubuntu_install_opengl.sh b/docker/install/ubuntu_install_opengl.sh old mode 100644 new mode 100755 index f8be6e351581..82050c14f307 --- a/docker/install/ubuntu_install_opengl.sh +++ b/docker/install/ubuntu_install_opengl.sh @@ -1,4 +1,10 @@ +#!/bin/bash + +set -e +set -u +set -o pipefail + apt-get update --fix-missing -apt-get install -y --no-install-recommends --force-yes \ - libgl1-mesa-dev libglfw3-dev \ No newline at end of file +apt-get install -y --no-install-recommends \ + libgl1-mesa-dev libglfw3-dev diff --git a/docker/install/ubuntu_install_python.sh b/docker/install/ubuntu_install_python.sh old mode 100644 new mode 100755 index a34019e1003e..43c27b1b2def --- a/docker/install/ubuntu_install_python.sh +++ b/docker/install/ubuntu_install_python.sh @@ -1,12 +1,21 @@ +#!/bin/bash + +set -e +set -u +set -o pipefail + # install python and pip, don't modify this, modify install_python_package.sh -apt-get update && apt-get install -y python-dev +apt-get update +apt-get install -y python-dev # python 3.6 -apt-get update && yes | apt-get install software-properties-common -add-apt-repository ppa:jonathonf/python-3.6 &&\ - apt-get update && apt-get install -y python-pip python-dev python3.6 python3.6-dev +apt-get install -y software-properties-common + +add-apt-repository ppa:jonathonf/python-3.6 +apt-get update +apt-get install -y python-pip python-dev python3.6 python3.6-dev rm -f /usr/bin/python3 && ln -s /usr/bin/python3.6 /usr/bin/python3 # Install pip -cd /tmp && wget https://bootstrap.pypa.io/get-pip.py && python2 get-pip.py && python3.6 get-pip.py +cd /tmp && wget -q https://bootstrap.pypa.io/get-pip.py && python2 get-pip.py && python3.6 get-pip.py diff --git a/docker/install/ubuntu_install_python_package.sh b/docker/install/ubuntu_install_python_package.sh old mode 100644 new mode 100755 index da8ade668619..200fe6e47781 --- a/docker/install/ubuntu_install_python_package.sh +++ b/docker/install/ubuntu_install_python_package.sh @@ -1,3 +1,9 @@ +#!/bin/bash + +set -e +set -u +set -o pipefail + # install libraries for python package on ubuntu -pip2 install nose pylint six numpy nose-timer cython decorator scipy tornado typing antlr4-python2-runtime attrs -pip3 install nose pylint six numpy nose-timer cython decorator scipy tornado typed_ast pytest mypy orderedset antlr4-python3-runtime attrs +pip2 install nose pylint==1.9.4 six numpy nose-timer cython decorator scipy tornado typing antlr4-python2-runtime attrs +pip3 install nose pylint==1.9.4 six numpy nose-timer cython decorator scipy tornado typed_ast pytest mypy orderedset antlr4-python3-runtime attrs diff --git a/docker/install/ubuntu_install_redis.sh b/docker/install/ubuntu_install_redis.sh old mode 100644 new mode 100755 index dfc9a3c381b6..d079170b0536 --- a/docker/install/ubuntu_install_redis.sh +++ b/docker/install/ubuntu_install_redis.sh @@ -1,3 +1,9 @@ +#!/bin/bash + +set -e +set -u +set -o pipefail + apt-get update && apt-get install -y redis-server pip2 install xgboost psutil pip3 install xgboost psutil diff --git a/docker/install/ubuntu_install_rocm.sh b/docker/install/ubuntu_install_rocm.sh old mode 100644 new mode 100755 index d050c20078b8..be7f2364bf63 --- a/docker/install/ubuntu_install_rocm.sh +++ b/docker/install/ubuntu_install_rocm.sh @@ -1,4 +1,10 @@ +#!/bin/bash + +set -e +set -u +set -o 
pipefail + # Install ROCm cross compilation toolchain. wget -qO - http://repo.radeon.com/rocm/apt/debian/rocm.gpg.key | sudo apt-key add - echo deb [arch=amd64] http://repo.radeon.com/rocm/apt/debian/ xenial main > /etc/apt/sources.list.d/rocm.list -apt-get update && apt-get install -y --force-yes rocm-dev +apt-get update && apt-get install -y rocm-dev diff --git a/docker/install/ubuntu_install_rust.sh b/docker/install/ubuntu_install_rust.sh old mode 100644 new mode 100755 index fed63d58a27b..67bcd15cbc84 --- a/docker/install/ubuntu_install_rust.sh +++ b/docker/install/ubuntu_install_rust.sh @@ -1,9 +1,15 @@ -apt-get update && apt-get install -y --no-install-recommends --force-yes curl +#!/bin/bash + +set -e +set -u +set -o pipefail + +apt-get update && apt-get install -y --no-install-recommends curl export RUSTUP_HOME=/opt/rust export CARGO_HOME=/opt/rust # this rustc is one supported by the installed version of rust-sgx-sdk -curl https://sh.rustup.rs -sSf | sh -s -- -y --no-modify-path --default-toolchain nightly-2019-01-28 +curl -s -S -f -L https://sh.rustup.rs | sh -s -- -y --no-modify-path --default-toolchain nightly-2019-01-28 . $CARGO_HOME/env rustup toolchain add nightly rustup component add rust-src diff --git a/docker/install/ubuntu_install_sgx.sh b/docker/install/ubuntu_install_sgx.sh old mode 100644 new mode 100755 index ca5f517849d8..d2958e5d0893 --- a/docker/install/ubuntu_install_sgx.sh +++ b/docker/install/ubuntu_install_sgx.sh @@ -1,21 +1,28 @@ -apt-get update && apt-get install -y --no-install-recommends --force-yes \ +#!/bin/bash + +set -e +set -u +set -o pipefail + +apt-get update && apt-get install -y --no-install-recommends \ build-essential git cmake \ wget python pkg-config software-properties-common \ autoconf automake libtool ocaml \ protobuf-compiler libprotobuf-dev \ libssl-dev libcurl4-openssl-dev curl -git clone https://github.com/intel/linux-sgx.git +git clone --branch=sgx_2.2 --depth=1 https://github.com/intel/linux-sgx.git cd linux-sgx -git checkout sgx_2.2 -curl 'https://gist.githubusercontent.com/nhynes/c770b0e91610f8c020a8d1a803a1e7cb/raw/8f5372d9cb88929b3cc49a384943bb363bc06827/intel-sgx.patch' | git apply +curl -s -S -L 'https://gist.githubusercontent.com/nhynes/c770b0e91610f8c020a8d1a803a1e7cb/raw/8f5372d9cb88929b3cc49a384943bb363bc06827/intel-sgx.patch' | git apply ./download_prebuilt.sh make -j4 sdk && make -j4 sdk_install_pkg ./linux/installer/bin/sgx_linux_x64_sdk*.bin --prefix /opt cd - -git clone https://github.com/baidu/rust-sgx-sdk.git /opt/rust-sgx-sdk +tag=6098af # v1.0.5 +# the pinned revision is a commit SHA, which git clone --branch cannot fetch, so check it out explicitly +git clone https://github.com/baidu/rust-sgx-sdk.git /opt/rust-sgx-sdk cd /opt/rust-sgx-sdk -git checkout 6098af # v1.0.5 +git checkout $tag -curl 'https://gist.githubusercontent.com/nhynes/37164039c5d3f33aa4f123e4ba720036/raw/b0de575fe937231799930764e76c664b92975163/rust-sgx-sdk.diff' | git apply +curl -s -S -L 'https://gist.githubusercontent.com/nhynes/37164039c5d3f33aa4f123e4ba720036/raw/b0de575fe937231799930764e76c664b92975163/rust-sgx-sdk.diff' | git apply cd - diff --git a/docker/install/ubuntu_install_sphinx.sh b/docker/install/ubuntu_install_sphinx.sh old mode 100644 new mode 100755 index ba04c2e25e6f..50e1e92796c3 --- a/docker/install/ubuntu_install_sphinx.sh +++ b/docker/install/ubuntu_install_sphinx.sh @@ -1 +1,7 @@ +#!/bin/bash + +set -e +set -u +set -o pipefail + pip3 install sphinx sphinx-gallery sphinx_rtd_theme sphinx_autodoc_annotation matplotlib Image commonmark>=0.7.3 docutils>=0.11 diff --git
a/docker/install/ubuntu_install_tensorflow.sh b/docker/install/ubuntu_install_tensorflow.sh old mode 100644 new mode 100755 index b773fcfb027b..4fdf9c0d46ab --- a/docker/install/ubuntu_install_tensorflow.sh +++ b/docker/install/ubuntu_install_tensorflow.sh @@ -1 +1,7 @@ +#!/bin/bash + +set -e +set -u +set -o pipefail + pip3 install tensorflow diff --git a/docker/install/ubuntu_install_tflite.sh b/docker/install/ubuntu_install_tflite.sh old mode 100644 new mode 100755 index 97235c4644f5..ed8ea1deff3f --- a/docker/install/ubuntu_install_tflite.sh +++ b/docker/install/ubuntu_install_tflite.sh @@ -1,5 +1,11 @@ +#!/bin/bash + +set -e +set -u +set -o pipefail + # Download, build and install flatbuffers -git clone --recursive https://github.com/google/flatbuffers.git +git clone --depth=1 --recursive https://github.com/google/flatbuffers.git cd flatbuffers cmake -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=Release make install -j8 @@ -13,7 +19,7 @@ pip2 install flatbuffers # Setup tflite from schema mkdir tflite cd tflite -wget https://raw.githubusercontent.com/tensorflow/tensorflow/r1.12/tensorflow/contrib/lite/schema/schema.fbs +wget -q https://raw.githubusercontent.com/tensorflow/tensorflow/r1.12/tensorflow/contrib/lite/schema/schema.fbs flatc --python schema.fbs cat <<EOM >setup.py diff --git a/docker/install/ubuntu_install_vulkan.sh b/docker/install/ubuntu_install_vulkan.sh old mode 100644 new mode 100755 index a4155da49651..6772b029cc90 --- a/docker/install/ubuntu_install_vulkan.sh +++ b/docker/install/ubuntu_install_vulkan.sh @@ -1,6 +1,10 @@ -#/bin/bash +#!/bin/bash -wget https://sdk.lunarg.com/sdk/download/1.0.65.0/linux/vulkansdk-linux-x86_64-1.0.65.0.run +set -e +set -u +set -o pipefail + +wget -q https://sdk.lunarg.com/sdk/download/1.0.65.0/linux/vulkansdk-linux-x86_64-1.0.65.0.run bash vulkansdk-linux-x86_64-1.0.65.0.run mv VulkanSDK /usr/local/VulkanSDK diff --git a/docs/api/python/relay/backend.rst b/docs/api/python/relay/backend.rst index 5cbc250b55ba..a6085c3232ef 100644 --- a/docs/api/python/relay/backend.rst +++ b/docs/api/python/relay/backend.rst @@ -3,9 +3,6 @@ tvm.relay.backend .. automodule:: tvm.relay.backend -Interpreter ------------ - .. automodule:: tvm.relay.backend.interpreter :members: diff --git a/docs/api/python/relay/base.rst b/docs/api/python/relay/base.rst index 72315dca0193..f2d0db409100 100644 --- a/docs/api/python/relay/base.rst +++ b/docs/api/python/relay/base.rst @@ -12,5 +12,8 @@ tvm.relay.base .. autoclass:: tvm.relay.base.Span :members: +.. autoclass:: tvm.relay.base.SourceName + :members: + .. autoclass:: tvm.relay.base.Id :members: diff --git a/docs/api/python/relay/build_module.rst b/docs/api/python/relay/build_module.rst index a278940f0fd5..b33f1870d5a5 100644 --- a/docs/api/python/relay/build_module.rst +++ b/docs/api/python/relay/build_module.rst @@ -5,6 +5,8 @@ tvm.relay.build_module .. autofunction:: tvm.relay.build_module.build +.. autofunction:: tvm.relay.build_module.build_config + .. autofunction:: tvm.relay.build_module.optimize .. autofunction:: tvm.relay.build_module.create_executor diff --git a/docs/api/python/relay/expr.rst b/docs/api/python/relay/expr.rst index 540d6bfbab65..c21e583f042b 100644 --- a/docs/api/python/relay/expr.rst +++ b/docs/api/python/relay/expr.rst @@ -39,15 +39,21 @@ tvm.relay.expr .. autoclass:: tvm.relay.expr.TupleGetItem :members: -.. autoclass:: tvm.relay.expr.TempExpr +.. autoclass:: tvm.relay.expr.RefCreate + :members: + +.. autoclass:: tvm.relay.expr.RefRead + :members: + +..
autoclass:: tvm.relay.expr.RefWrite :members: -.. autoclass:: tvm.relay.expr.ExprFunctor +.. autoclass:: tvm.relay.expr.TupleGetItem :members: -.. autoclass:: tvm.relay.expr.ExprMutator +.. autoclass:: tvm.relay.expr.TempExpr :members: .. autoclass:: tvm.relay.expr.TupleWrapper - :members + :members: diff --git a/docs/api/python/relay/frontend.rst b/docs/api/python/relay/frontend.rst index 054d3cecc1c5..2a22982a1cdf 100644 --- a/docs/api/python/relay/frontend.rst +++ b/docs/api/python/relay/frontend.rst @@ -9,3 +9,11 @@ tvm.relay.frontend .. autofunction:: tvm.relay.frontend.from_keras .. autofunction:: tvm.relay.frontend.from_onnx + +.. autofunction:: tvm.relay.frontend.from_tflite + +.. autofunction:: tvm.relay.frontend.from_coreml + +.. autofunction:: tvm.relay.frontend.from_caffe2 + +.. autofunction:: tvm.relay.frontend.from_tensorflow diff --git a/docs/api/python/relay/image.rst b/docs/api/python/relay/image.rst index 223213eca8e3..862dcbbd1fc7 100644 --- a/docs/api/python/relay/image.rst +++ b/docs/api/python/relay/image.rst @@ -5,5 +5,5 @@ tvm.relay.image .. automodule:: tvm.relay.image :members: -.. automodule:: tvm.relay.op.image.image +.. automodule:: tvm.relay.op.image :members: diff --git a/docs/api/python/relay/op.rst b/docs/api/python/relay/op.rst index 7413a818f73f..36a7aa00d7b7 100644 --- a/docs/api/python/relay/op.rst +++ b/docs/api/python/relay/op.rst @@ -3,8 +3,27 @@ tvm.relay.op .. automodule:: tvm.relay.op :members: -.. automodule:: tvm.relay.op.op - :members: +.. autofunction:: tvm.relay.op.Op + +.. autofunction:: tvm.relay.op.OpPattern + +.. autofunction:: tvm.relay.op.get + +.. autofunction:: tvm.relay.op.register + +.. autofunction:: tvm.relay.op.register_schedule + +.. autofunction:: tvm.relay.op.register_pattern + +.. autofunction:: tvm.relay.op.register_compute + +.. autofunction:: tvm.relay.op.register_gradient + +.. autofunction:: tvm.relay.op.register_alter_op_layout + +.. autofunction:: tvm.relay.op.schedule_injective + +.. autofunction:: tvm.relay.op.debug .. automodule:: tvm.relay.op.reduce :members: @@ -15,11 +34,10 @@ tvm.relay.op .. automodule:: tvm.relay.op.transform :members: -.. automodule:: tvm.relay.op.nn.nn +.. automodule:: tvm.relay.op.nn :members: .. automodule:: tvm.relay.op.vision.multibox :members: -.. automodule:: tvm.relay.op.vision.nms - :members: +.. autofunction:: tvm.relay.vision.nms diff --git a/docs/api/python/topi.rst b/docs/api/python/topi.rst index 856bad198e88..06f4f0d61f34 100644 --- a/docs/api/python/topi.rst +++ b/docs/api/python/topi.rst @@ -41,6 +41,7 @@ List of operators topi.nn.upsampling topi.nn.softmax topi.nn.dense + topi.nn.batch_matmul topi.nn.log_softmax topi.nn.conv2d_nchw topi.nn.conv2d_hwcn @@ -67,6 +68,14 @@ List of operators topi.not_equal topi.greater_equal topi.less_equal + topi.logical_and + topi.logical_or + topi.logical_not + topi.arange + topi.stack + topi.repeat + topi.tile + topi.layout_transform topi.image.resize @@ -123,6 +132,11 @@ topi .. autofunction:: topi.power .. autofunction:: topi.greater .. autofunction:: topi.less +.. autofunction:: topi.arange +.. autofunction:: topi.stack +.. autofunction:: topi.repeat +.. autofunction:: topi.tile +.. autofunction:: topi.layout_transform topi.nn ~~~~~~~ @@ -134,6 +148,7 @@ topi.nn .. autofunction:: topi.nn.upsampling .. autofunction:: topi.nn.softmax .. autofunction:: topi.nn.dense +.. autofunction:: topi.nn.batch_matmul .. autofunction:: topi.nn.log_softmax .. autofunction:: topi.nn.conv2d_nchw .. 
autofunction:: topi.nn.conv2d_hwcn diff --git a/docs/contribute/community.rst b/docs/contribute/community.rst index 971d2d9a1cfb..52c9149d453f 100644 --- a/docs/contribute/community.rst +++ b/docs/contribute/community.rst @@ -25,7 +25,7 @@ Committers are individuals who are granted the write access to the project. A co - Quality of contributions: High-quality, readable code contributions indicated by pull requests that can be merged without a substantial code review. History of creating clean, maintainable code and including good test cases. Informative code reviews to help other contributors that adhere to a good standard. - Community involvement: active participation in the discussion forum, promote the projects via tutorials, talks and outreach. We encourage committers to collaborate broadly, e.g. do code reviews and discuss designs with community members that they do not interact physically. -The Project Management Committee(PMC) consists group of active committers that moderate the discussion, manage the project release, and proposes new committer/PMC members. Potential candidates are usually proposed via an internal discussion among PMCs, followed by a consensus approval, i.e. least 3 +1 votes, and no vetoes. Any veto must be accompanied by reasoning. PMCs should serve the community by upholding the community practices and guidelines TVM a better community for everyone. PMCs should strive to identify new candidates outside of their own organization. +The Project Management Committee (PMC) consists of a group of active committers that moderate the discussion, manage the project release, and propose new committer/PMC members. Potential candidates are usually proposed via an internal discussion among PMCs, followed by a consensus approval, i.e. at least 3 +1 votes, and no vetoes. Any veto must be accompanied by reasoning. PMCs should serve the community by upholding the community practices and guidelines to make TVM a better community for everyone. PMCs should strive to only nominate new candidates outside of their own organization. Reviewers diff --git a/docs/contribute/pull_request.rst b/docs/contribute/pull_request.rst index 039ef65c7b13..ec693dc260e5 100644 --- a/docs/contribute/pull_request.rst +++ b/docs/contribute/pull_request.rst @@ -52,6 +52,12 @@ C++ Python ^^^^^^ +Necessary dependencies: + +.. code:: bash + + pip install --user nose Cython + If you want to run all tests: .. code:: bash @@ -72,4 +78,4 @@ If you want to run a single test: export PYTHONPATH=python:topi/python rm -rf python/tvm/*.pyc python/tvm/*/*.pyc python/tvm/*/*/*.pyc - TVM_FFI=ctypes python -m nose -v tests/python/unittest/test_pass_storage_rewrite.py \ No newline at end of file + TVM_FFI=ctypes python -m nose -v tests/python/unittest/test_pass_storage_rewrite.py diff --git a/docs/install/from_source.rst b/docs/install/from_source.rst index 81d06f1dc27f..5c828957cc79 100644 --- a/docs/install/from_source.rst +++ b/docs/install/from_source.rst @@ -5,7 +5,7 @@ Install from Source This page gives instructions on how to build and install the tvm package from scratch on various systems. It consists of two steps: -1. First build the shared library from the C++ codes (`libtvm.so` for linux/osx and `libtvm.dll` for windows). +1. First build the shared library from the C++ code (`libtvm.so` for Linux, `libtvm.dylib` for macOS and `libtvm.dll` for Windows). 2. Setup for the language packages (e.g. Python Package). To get started, clone tvm repo from github. It is important to clone the submodules along, with ``--recursive`` option.
@@ -28,7 +28,7 @@ Build the Shared Library Our goal is to build the shared libraries: - On Linux the target library are `libtvm.so, libtvm_topi.so` -- On OSX the target library are `libtvm.dylib, libtvm_topi.dylib` +- On macOS the target library are `libtvm.dylib, libtvm_topi.dylib` - On Windows the target library are `libtvm.dll, libtvm_topi.dll` @@ -60,7 +60,7 @@ The configuration of tvm can be modified by `config.cmake`. - Edit ``build/config.cmake`` to customize the compilation options - - On macOS, for some versions of XCode, you need to add ``-lc++abi`` in the LDFLAGS or you'll get link errors. + - On macOS, for some versions of Xcode, you need to add ``-lc++abi`` in the LDFLAGS or you'll get link errors. - Change ``set(USE_CUDA OFF)`` to ``set(USE_CUDA ON)`` to enable CUDA backend. So do other backends and libraries (OpenCL, RCOM, METAL, VULKAN, ...). @@ -162,7 +162,7 @@ Python dependencies .. code:: bash - pip install --user numpy decorator + pip install --user numpy decorator attrs * If you want to use RPC Tracker diff --git a/docs/langref/relay_op.rst b/docs/langref/relay_op.rst index e1f38c61eb1f..f20c443e8404 100644 --- a/docs/langref/relay_op.rst +++ b/docs/langref/relay_op.rst @@ -95,6 +95,10 @@ This level enables additional math and transform operators. tvm.relay.full_like tvm.relay.cast tvm.relay.split + tvm.relay.arange + tvm.relay.stack + tvm.relay.repeat + tvm.relay.tile **Level 4: Broadcast and Reductions** @@ -110,6 +114,9 @@ This level enables additional math and transform operators. tvm.relay.greater_equal tvm.relay.less tvm.relay.less_equal + tvm.relay.logical_and + tvm.relay.logical_or + tvm.relay.logical_not tvm.relay.maximum tvm.relay.minimum tvm.relay.power @@ -134,6 +141,7 @@ This level enables additional math and transform operators. tvm.relay.vision.multibox_prior tvm.relay.vision.multibox_transform_loc tvm.relay.vision.nms + tvm.relay.vision.yolo_reorg **Level 10: Temporary Operators** @@ -150,6 +158,7 @@ This level support backpropagation of broadcast operators. It is temporary. tvm.relay.device_copy tvm.relay.annotation.on_device tvm.relay.reverse_reshape + tvm.relay.nn.batch_matmul Level 1 Definitions @@ -216,6 +225,10 @@ Level 3 Definitions .. autofunction:: tvm.relay.full_like .. autofunction:: tvm.relay.cast .. autofunction:: tvm.relay.split +.. autofunction:: tvm.relay.arange +.. autofunction:: tvm.relay.stack +.. autofunction:: tvm.relay.repeat +.. autofunction:: tvm.relay.tile Level 4 Definitions @@ -228,6 +241,9 @@ Level 4 Definitions .. autofunction:: tvm.relay.greater_equal .. autofunction:: tvm.relay.less .. autofunction:: tvm.relay.less_equal +.. autofunction:: tvm.relay.logical_and +.. autofunction:: tvm.relay.logical_or +.. autofunction:: tvm.relay.logical_not .. autofunction:: tvm.relay.maximum .. autofunction:: tvm.relay.minimum .. autofunction:: tvm.relay.power @@ -249,6 +265,7 @@ Level 5 Definitions .. autofunction:: tvm.relay.vision.multibox_prior .. autofunction:: tvm.relay.vision.multibox_transform_loc .. autofunction:: tvm.relay.vision.nms +.. autofunction:: tvm.relay.vision.yolo_reorg Level 10 Definitions @@ -260,3 +277,4 @@ Level 10 Definitions .. autofunction:: tvm.relay.device_copy .. autofunction:: tvm.relay.annotation.on_device .. autofunction:: tvm.relay.reverse_reshape +.. autofunction:: tvm.relay.nn.batch_matmul diff --git a/docs/nnvm_top.rst b/docs/nnvm_top.rst index 717ce985e002..f05eed3308b3 100644 --- a/docs/nnvm_top.rst +++ b/docs/nnvm_top.rst @@ -35,6 +35,9 @@ This level enables fully connected multi-layer perceptron. 
nnvm.symbol.exp nnvm.symbol.log nnvm.symbol.sqrt + nnvm.symbol.logical_and + nnvm.symbol.logical_or + nnvm.symbol.logical_not nnvm.symbol.elemwise_add nnvm.symbol.elemwise_sub nnvm.symbol.elemwise_mul @@ -172,6 +175,9 @@ Detailed Definitions .. autofunction:: nnvm.symbol.exp .. autofunction:: nnvm.symbol.log .. autofunction:: nnvm.symbol.sqrt +.. autofunction:: nnvm.symbol.logical_and +.. autofunction:: nnvm.symbol.logical_or +.. autofunction:: nnvm.symbol.logical_not .. autofunction:: nnvm.symbol.elemwise_add .. autofunction:: nnvm.symbol.elemwise_sub .. autofunction:: nnvm.symbol.elemwise_mul diff --git a/include/tvm/arithmetic.h b/include/tvm/arithmetic.h index cc9e5374b888..d023f8f1cf7e 100644 --- a/include/tvm/arithmetic.h +++ b/include/tvm/arithmetic.h @@ -9,14 +9,317 @@ #include <vector> #include <unordered_map> #include <memory> +#include <limits> #include "expr.h" namespace tvm { - +// forward declare Tensor class Tensor; - /*! \brief namespace of arithmetic */ namespace arith { +//------------------------------------------------------- +// Base integer analysis API. +// +// We have multiple types of analyzers to do relaxed +// integer set analysis (bound analysis, modulo) and +// equivalence checking and simplification. +// +// Importantly, each analyzer may need results from +// another analyzer. +//------------------------------------------------------- + +// Forward declare Analyzer +class Analyzer; +/*! + * \brief reference class to ConstIntBoundNode + * \sa ConstIntBoundNode + */ +class ConstIntBound; +/*! + * \brief Constant integer upper and lower bound (inclusive). + * Useful for value bound analysis. + * + * set = [min_value, max_value] + */ +class ConstIntBoundNode : public Node { + public: + int64_t min_value; + int64_t max_value; + + void VisitAttrs(tvm::AttrVisitor* v) final { + v->Visit("min_value", &min_value); + v->Visit("max_value", &max_value); + } + + TVM_DLL static ConstIntBound make(int64_t min_value, int64_t max_value); + + /*! \brief Number to represent +inf */ + static const constexpr int64_t kPosInf = std::numeric_limits<int64_t>::max(); + /*! + * \brief Number to represent -inf + * \note We can make use of the fact that -kPosInf == kNegInf in the project. + */ + static const constexpr int64_t kNegInf = -kPosInf; + + static constexpr const char* _type_key = "arith.ConstIntBound"; + TVM_DECLARE_NODE_TYPE_INFO(ConstIntBoundNode, Node); +}; + +TVM_DEFINE_NODE_REF(ConstIntBound, ConstIntBoundNode); + +/*! + * \brief Analyzer to get constant integer bound over expression. + */ +class ConstIntBoundAnalyzer { + public: + /*! + * \brief analyze the expr + * \param expr The expression of interest. + * \return the result of the analysis. + */ + ConstIntBound operator()(const Expr& expr); + + /*! + * \brief Update constant int bound information of var. + * + * \param var The variable of interest. + * \param info The bound information. + * \param override Whether to allow override of existing information. + */ + void Update(const Var& var, + const ConstIntBound& info, + bool override = false); + /*! + * \brief Bind variable to a range. + * + * \param var The variable. + * \param range The range we bind to. + */ + void Bind(const Var& var, const Range& range); + + private: + friend class Analyzer; + friend class ConstraintContext; + explicit ConstIntBoundAnalyzer(Analyzer* parent); + ~ConstIntBoundAnalyzer(); + /*! + * \brief Update the internal state to enter constraint. + * \param constraint A constraint expression. + * + * \return an exit function that must be called to clean up the constraint; can be nullptr.
+ */ + std::function<void()> EnterConstraint(const Expr& constraint); + struct Entry; + class Impl; + /*! \brief Internal impl */ + Impl* impl_; +}; + +/*! + * \brief reference of ModularSetNode + * \sa ModularSetNode + */ +class ModularSet; +/*! + * \brief Range of a linear integer function. + * Used to specify the possible index values. + * + * set = { coeff * x + base | x in Z } + * + * When coeff != 0, it can also be written as + * set = { n | n % coeff == base } + * + * This is useful to decide if the index is divisible by a certain value. + * For example, if index = 0 + 4 x, then we know it can be divided by 4. + */ +class ModularSetNode : public Node { + public: + /*! \brief linear coefficient */ + int64_t coeff; + /*! \brief The base */ + int64_t base; + + void VisitAttrs(tvm::AttrVisitor* v) final { + v->Visit("coeff", &coeff); + v->Visit("base", &base); + } + + TVM_DLL static ModularSet make(int64_t coeff, int64_t base); + + static constexpr const char* _type_key = "arith.ModularSet"; + TVM_DECLARE_NODE_TYPE_INFO(ModularSetNode, Node); +}; + +TVM_DEFINE_NODE_REF(ModularSet, ModularSetNode); + +/*! + * \brief Analyzer to get modular information over expression. + */ +class ModularSetAnalyzer { + public: + /*! + * \brief analyze the expr + * \param expr The expression of interest. + * \return the result of the analysis. + */ + ModularSet operator()(const Expr& expr); + /*! + * \brief Update constant int bound information of var. + * + * \param var The variable of interest. + * \param info The bound information. + * \param override Whether to allow override of existing information. + */ + void Update(const Var& var, + const ModularSet& info, + bool override = false); + + private: + friend class Analyzer; + friend class ConstraintContext; + explicit ModularSetAnalyzer(Analyzer* parent); + ~ModularSetAnalyzer(); + /*! + * \brief Update the internal state to enter constraint. + * \param constraint A constraint expression. + * + * \return an exit function that must be called to clean up the constraint; can be nullptr. + */ + std::function<void()> EnterConstraint(const Expr& constraint); + struct Entry; + class Impl; + /*! \brief Internal impl */ + Impl* impl_; +}; + +/*! + * \brief Rewrite-rule based simplifier. + */ +class RewriteSimplifier { + public: + /*! + * \brief analyze the expr + * \param expr The expression of interest. + * \return the result of the analysis. + */ + Expr operator()(const Expr& expr); + + /*! + * \brief Update binding of var to a new expression. + * + * \param var The variable of interest. + * \param new_expr The new expression to bind to var. + * \param override Whether to allow override of existing information. + */ + void Update(const Var& var, + const Expr& new_expr, + bool override = false); + + private: + friend class Analyzer; + friend class ConstraintContext; + explicit RewriteSimplifier(Analyzer* parent); + ~RewriteSimplifier(); + class Impl; + /*! \brief Internal impl */ + Impl* impl_; +}; + +/*! + * \brief A RAII constraint context. + * + * \code + * + * Var x("x"); + * arith::Analyzer analyzer; + * { + * arith::ConstraintContext cctx(&analyzer, x % 3 == 0); + * CHECK_EQ(analyzer.modular_set(x)->coeff, 3); + * } + * // constraint no longer in effect. + * CHECK_NE(analyzer.modular_set(x)->coeff, 3); + * + * \endcode + */ +class ConstraintContext { + public: + /*! + * \brief Construct a constraint context. + * \param analyzer The analyzer. + * \param constraint The constraint to be applied. + */ + ConstraintContext(Analyzer* analyzer, const Expr& constraint) DMLC_THROW_EXCEPTION; + /*!
\brief destructor */ + ~ConstraintContext() DMLC_THROW_EXCEPTION { + exit_(); + } + + private: + /*! \brief function to be called in recovery */ + std::function<void()> exit_; +}; + +/*! + * \brief Analyzer that contains a bunch of sub-analyzers. + * + * Each sub-analyzer can make use of another sub-analyzer + * through a weak reference to this. + * + * NOTE for sub-analyzer developers: + * If the analyzer uses memoization, we need to clear the internal + * cache when information about a Var has been overridden. + */ +class Analyzer { + public: + /*! \brief sub-analyzer: const integer bound */ + ConstIntBoundAnalyzer const_int_bound; + /*! \brief sub-analyzer: modular set */ + ModularSetAnalyzer modular_set; + /*! \brief sub-analyzer: rewrite simplify */ + RewriteSimplifier rewrite_simplify; + /*! \brief constructor */ + Analyzer(); + /*! + * \brief Notify all the sub-analyzers that var + * is created and bound to expr. + * + * Each var can only be bound once. + * + * \param var The variable. + * \param expr The expression we bind to. + */ + void Bind(const VarExpr& var, const Expr& expr); + /*! + * \brief Notify all the sub-analyzers that var + * is created and bound to a range. + * + * Each var can only be bound once. + * + * \param var The variable. + * \param range The range we bind to. + */ + void Bind(const VarExpr& var, const Range& range); + /*! + * \brief Whether we can prove expr >= lower_bound. + + * Non-negative proofs are very useful in integer analysis + * to lower divisions and mods given the difference in trunc and ceil mode. + * + * \param expr The expression. + * \param lower_bound The lower bound. + * \return Whether we can prove it. + * + * \note Analyzer will call into sub-analyzers to get the result. + */ + bool CanProveGreaterEqual(const Expr& expr, int64_t lower_bound); +}; + +//----------------------------------------------- +// Integer set abstraction API. +// +// This is an API built on top of the base +// integer analysis API to provide set analysis. +//------------------------------------------------ /*! * \brief Sign of an expression or set. */ @@ -118,42 +421,6 @@ class IntSet : public NodeRef { static IntSet interval(Expr min, Expr max); }; -/*! - * \brief Range of a linear integer function. - * Use to do specify the possible index values. - * - * set = { coeff * x + base | x in Z } - * - * When coeff != 0, it can also be written as - * set = { n | n % coeff == base } - * - * This is useful to decide if the index is dividable by certain value. - * For example, if index = 0 + 4 x, then we know it can be divided by 4. - */ -struct ModularEntry { - /*! \brief linear co-efficient */ - int coeff{1}; - /*! \brief The base */ - int base{0}; - - /*! \return entry represent everything */ - static ModularEntry everything() { - // always safe to set 0 + x, so it can be everything. - ModularEntry e; - e.coeff = 1; - e.base = 0; - return e; - } - /*! - * \brief Add two modular entries together to get a new modular entry. - * \param a The left operand. - * \param b The right operand. - * \return The combined modular entry. - */ - static ModularEntry Add(const ModularEntry& a, - const ModularEntry& b); -}; - /*! * \brief Base class of all IntSet containers. */ @@ -300,24 +567,6 @@ IntSet DeduceBound(Expr v, Expr cond, */ Domain DomainTouched(Stmt body, const Tensor &tensor, bool consider_calls, bool consider_provides); -/*! - * \brief Evaluate the expression with modular analysis - * \param e The expression to be evaluated. - * \param mod_map Map of modular statistics of known variables.
- * \return The ModularEntry covering all possible value of e. - */ -ModularEntry EvalModular( - const Expr& e, - const std::unordered_map& mod_map); - -/*! - * \brief Same as EvalModular, used by front-end. - * \param e The expression to be evaluated. - * \param mod_map Map of modular statistics of known variables. - * \return A ModularSet covering all possible value of e. - */ -IntSet EvalModular(const Expr& e, - const Map& mod_map); // implementation inline const IntSetNode* IntSet::operator->() const { return static_cast<const IntSetNode*>(node_.get()); diff --git a/include/tvm/buffer.h b/include/tvm/buffer.h index 2c72db169a2d..d95332c245b7 100644 --- a/include/tvm/buffer.h +++ b/include/tvm/buffer.h @@ -10,7 +10,7 @@ #include "base.h" #include "expr.h" -#include "ir_operator.h" +#include "expr_operator.h" #include "tvm/node/container.h" namespace tvm { diff --git a/include/tvm/data_layout.h b/include/tvm/data_layout.h new file mode 100644 index 000000000000..3f5cb9a29546 --- /dev/null +++ b/include/tvm/data_layout.h @@ -0,0 +1,335 @@ +/*! + * Copyright (c) 2019 by Contributors + * \file tvm/data_layout.h + * \brief Layout expression to describe the data organization of a tensor, + * and BijectiveLayout to map between two data layouts. + */ +#ifndef TVM_DATA_LAYOUT_H_ +#define TVM_DATA_LAYOUT_H_ + +#include +#include + +#include +#include +#include +#include +#include + +#include "expr_operator.h" + +namespace tvm { + +class LayoutAxis { + public: + static const LayoutAxis& Get(const char name); + + // Get the singleton LayoutAxis using itvar->var->name_hint + static const LayoutAxis& Get(const IterVar& itvar); + + // Get the singleton LayoutAxis using name[0] (size of name must be 1). + static const LayoutAxis& make(const std::string& name); + + inline bool IsPrimal() const { return name_ >= 'A' && name_ <= 'Z'; } + inline std::string name() const { return std::string(1, name_); } + + // if the current axis is primal, switch the axis to its subordinate one, + // else switch to the primal. + inline const LayoutAxis& ToDual() const { + if (name_ >= 'A' && name_ <= 'Z') { + return LayoutAxis::Get(name_ - 'A' + 'a'); + } else { + return LayoutAxis::Get(name_ - 'a' + 'A'); + } + } + + // return the primal axis. If it is already primal, return itself. + const LayoutAxis& ToPrimal() const { + return IsPrimal() ? *this : ToDual(); + } + + // return the subordinate axis. If it is already subordinate, return itself. + const LayoutAxis& ToSubordinate() const { + return IsPrimal() ? ToDual() : *this; + } + + inline bool operator==(const LayoutAxis& rhs) const { + return name_ == rhs.name_; + } + + friend std::ostream& operator<<(std::ostream& os, const LayoutAxis& l) { + os << l.name(); + return os; + } + + private: + static const LayoutAxis UPPER_CASE[]; + static const LayoutAxis LOWER_CASE[]; + LayoutAxis(const LayoutAxis&); + LayoutAxis& operator=(const LayoutAxis&); + explicit LayoutAxis(const char name) : name_(name) {} + + const char name_; +}; + +class Layout; +// Internal node container of Layout +class LayoutNode : public Node { + public: + /*! \brief string representation of layout */ + std::string name; + /*! \brief specify each axis of the layout, + * in which the variable name is the name of the axis. + * The IterVar's extent indicates the size of the axis; + * it is a variable for a primal axis, but a constant for a subordinate axis.
+ */ + Array<IterVar> axes; + + void VisitAttrs(AttrVisitor* v) final { + v->Visit("name", &name); + v->Visit("axes", &axes); + } + + TVM_DLL static Layout make(const std::string& layout); + + static constexpr const char* _type_key = "Layout"; + TVM_DECLARE_NODE_TYPE_INFO(LayoutNode, Node); +}; + +/*! + * \brief Layout describes how data is organized within an N-dimensional tensor. + * It is composed of upper-case letters, lower-case letters and numbers, + * where an upper-case letter indicates a primal axis and + * the corresponding lower-case letter with its factor size indicates the subordinate axis. + * For example, NCHW16c can describe a 5-D tensor of + * [batch_size, channel, height, width, channel_block]. + * Here the subordinate axis channel_block=16 is the factor size of the primal axis C (channel). + */ +class Layout : public NodeRef { + public: + explicit Layout(NodePtr<Node> n) : NodeRef(n) {} + + /*! \brief default constructor */ + Layout() = default; + + explicit Layout(const Array<IterVar>& axes); + + /*! \brief construct from a string */ + Layout(const char* name) : Layout(std::string(name)) {} // NOLINT(*) + + /*! + * \brief construct from a string. + * \param name input in layout convention: + * an upper-case letter indicates a dimension and + * the corresponding lower-case letter with its factor size + * indicates the split dimension. + * Returns an undefined layout if "__undef__" is passed. + */ + Layout(const std::string& name); // NOLINT(*) + + /*! + * \brief access the internal node container + * \return the pointer to the internal node container + */ + const LayoutNode* operator->() const { + return static_cast<const LayoutNode*>(node_.get()); + } + + /*! + * \brief access the internal node container + * \return the pointer to the internal node container + */ + LayoutNode* operator->() { + return static_cast<LayoutNode*>(node_.get()); + } + + /*! + * \brief Return an undefined layout. + * \return a (global) undefined layout. + */ + static const Layout& Undef() { + static Layout undef; + return undef; + } + + /*! + * \brief Returns a sub-layout which is the portion of the object + * that starts at dimension \p pos and spans \p len dimensions + * (or until the end of the layout, whichever comes first). + * \param pos The start position. + * \param len The length of the sub-layout. + * \return A newly constructed Layout object. + */ + Layout SubLayout(size_t pos, size_t len) const; + + /*! + * \brief Split \p axis by \p factor and put the new subordinate axis at position \p target_pos. + * \param axis The source axis to be split. It must be a primal axis. + * \param target_pos The target position of the newly split subordinate axis. + * \param factor The size of the sub-dimension. + * \return A newly constructed Layout object. + */ + Layout Split(const LayoutAxis &axis, size_t target_pos, int32_t factor) const; + + + /*! \return number of dimensions */ + inline size_t ndim() const { + if (!defined()) return 0; + return operator->()->axes.size(); + } + + /*! \return number of primal dimensions */ + inline size_t ndim_primal() const { + if (!defined()) return 0; + size_t ct = 0; + for (auto x : operator->()->axes) { + if (LayoutAxis::Get(x).IsPrimal()) { + ct++; + } + } + return ct; + } + + /*! + * \brief return the index of the input axis. + * If it is not found in the layout or the layout is undefined, + * return -1. + * \param axis the input axis. + * \return the index or -1 if not found.
+ */ + inline int32_t IndexOf(const LayoutAxis& axis) const { + if (!this->defined()) return -1; + const auto axes = operator->()->axes; + for (size_t i = 0; i < axes.size(); ++i) { + if (axes[i]->var.get()->name_hint == axis.name()) return static_cast<int32_t>(i); + } + return -1; + } + + /*! + * \brief Get the factor size of the subordinate axis. + * \param axis the input primal-axis or subordinate-axis. + * \return the size of the subordinate-axis of \p axis (if \p axis is a primal-axis), + * or the size of \p axis itself (if \p axis is a subordinate-axis). + * Return -1 if \p axis is not in the layout or the layout is undefined. + */ + int32_t FactorOf(const LayoutAxis& axis) const; + + /*! + * \brief Whether the layout contains an axis. + * \param axis axis to be checked. + * \return Whether the layout contains the axis. + */ + bool Contains(const LayoutAxis& axis) const { + if (!defined()) return false; + for (const IterVar var : operator->()->axes) { + if (var->var.get()->name_hint == axis.name()) { + return true; + } + } + return false; + } + + const LayoutAxis& operator[](int32_t i) const { + CHECK(defined()) << "Try to access axis from an undefined layout."; + int32_t index = i < 0 ? static_cast<int32_t>(ndim() + i) : i; + CHECK(index >= 0 && static_cast<size_t>(index) < ndim()) << "Invalid index " << i; + const IterVar axis = operator->()->axes[index]; + return LayoutAxis::Get(axis); + } + + /*! \return the string description of the layout */ + inline std::string name() const { + if (!defined()) return "__undef__"; + return operator->()->name; + } + + /*! + * \brief Whether the two layouts are equal. + * \param rhs Another layout. + * \return whether the two layouts are equal. + */ + inline bool Equals(const Layout &rhs) const { + return name() == rhs.name(); + } + + /*! + * \brief allow output string of layout to ostream + * \param os the output stream + * \param l the layout + * \return the ostream + */ + friend std::ostream& operator<<(std::ostream& os, const Layout& l) { + os << l.name(); + return os; + } + + using ContainerType = LayoutNode; +}; + +class BijectiveLayout; +// Internal node container BijectiveLayout +class BijectiveLayoutNode : public Node { + public: + /*! \brief Describes how source axes can be mapped to the destination axes, + * e.g., [i0 / 16, i1, i0 % 16] can describe NC -> NC16n + */ + Array<Expr> forward_rule; + /*! \brief Describes how destination axes can be mapped to the source axes */ + Array<Expr> backward_rule; + + /*! \brief The source layout */ + Layout src_layout; + /*! \brief The destination layout */ + Layout dst_layout; + + void VisitAttrs(AttrVisitor* v) final { + v->Visit("src_layout", &src_layout); + v->Visit("dst_layout", &dst_layout); + v->Visit("forward_rule", &forward_rule); + v->Visit("backward_rule", &backward_rule); + } + + static constexpr const char* _type_key = "BijectiveLayout"; + TVM_DECLARE_NODE_TYPE_INFO(BijectiveLayoutNode, Node); + + TVM_DLL static BijectiveLayout make(const Layout& src_layout, + const Layout& dst_layout); +}; + +/*! \brief Bijective function mapping for data layout transformation. + * Given two Layouts, BijectiveLayout builds and stores the mapping rules, + * and provides an API to transform an N-dimensional tensor from the source indices (i0, i1, …, im) + * to the destination indices (j0, j1, … jm). + */ +class BijectiveLayout : public NodeRef { + public: + BijectiveLayout() = default; + explicit BijectiveLayout(NodePtr<Node> n) : NodeRef(n) {} +
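As a quick illustration of the Layout query API above, a hypothetical caller could inspect a layout like this (a sketch only; it assumes tvm/data_layout.h and dmlc's CHECK macros):

#include <tvm/data_layout.h>

void LayoutExample() {
  tvm::Layout layout("NCHW16c");     // four primal axes plus one subordinate axis
  CHECK_EQ(layout.ndim(), 5);
  CHECK_EQ(layout.ndim_primal(), 4);
  const tvm::LayoutAxis& c = tvm::LayoutAxis::Get('C');
  CHECK(layout.Contains(c));
  CHECK_EQ(layout.IndexOf(c), 1);    // C is the second axis
  CHECK_EQ(layout.FactorOf(c), 16);  // its subordinate axis 16c has factor 16
}

+ + // Given the source shape, infer the destination shape.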
+ TVM_DLL Array<Expr> ForwardShape(const Array<Expr>& shape) const; + // Given the destination shape, recover the source shape. + TVM_DLL Array<Expr> BackwardShape(const Array<Expr>& dst_shape) const; + // Given the source indices, infer the destination indices. + TVM_DLL Array<Expr> ForwardIndex(const Array<Expr>& index) const; + // Given the destination indices, recover the source indices. + TVM_DLL Array<Expr> BackwardIndex(const Array<Expr>& dst_index) const; + + /*! + * \brief access the internal node container + * \return the pointer to the internal node container + */ + inline const BijectiveLayoutNode* operator->() const; + + /*! \brief specify container node */ + using ContainerType = BijectiveLayoutNode; +}; + +inline const BijectiveLayoutNode* BijectiveLayout::operator->() const { + return static_cast<const BijectiveLayoutNode*>(node_.get()); +} + +} // namespace tvm + +#endif // TVM_DATA_LAYOUT_H_ diff --git a/include/tvm/ir_operator.h b/include/tvm/expr_operator.h similarity index 99% rename from include/tvm/ir_operator.h rename to include/tvm/expr_operator.h index c2cdc5e7a923..c4d2d555f3a3 100644 --- a/include/tvm/ir_operator.h +++ b/include/tvm/expr_operator.h @@ -1,13 +1,13 @@ /*! * Copyright (c) 2018 by Contributors - * \file tvm/ir_operator.h + * \file tvm/expr_operator.h * \brief Common operators defined for Expr. * * \note Most of the operators defined here perform simple constant folding * when the type is int32 or int64 for simplifying the index expressions. */ -#ifndef TVM_IR_OPERATOR_H_ -#define TVM_IR_OPERATOR_H_ +#ifndef TVM_EXPR_OPERATOR_H_ +#define TVM_EXPR_OPERATOR_H_ #include #include @@ -617,4 +617,4 @@ TVM_DEFINE_LOGICAL_OP_CONST_VAL_OVERLOAD(operator&&); TVM_DEFINE_LOGICAL_OP_CONST_VAL_OVERLOAD(operator||); } // namespace tvm -#endif // TVM_IR_OPERATOR_H_ +#endif // TVM_EXPR_OPERATOR_H_ diff --git a/include/tvm/ir.h b/include/tvm/ir.h index 3ef955e834d0..0f05c98e0722 100644 --- a/include/tvm/ir.h +++ b/include/tvm/ir.h @@ -17,6 +17,7 @@ namespace tvm { namespace ir { +using HalideIR::Internal::BaseExprNode; using HalideIR::Internal::ExprNode; using HalideIR::Internal::StmtNode; using HalideIR::Internal::IRNodeType; diff --git a/include/tvm/operation.h b/include/tvm/operation.h index 3509b133cfc3..eafce72375cf 100644 --- a/include/tvm/operation.h +++ b/include/tvm/operation.h @@ -10,7 +10,7 @@ #include #include #include "expr.h" -#include "ir_operator.h" +#include "expr_operator.h" #include "tensor.h" #include "schedule.h" #include "arithmetic.h" @@ -184,22 +184,45 @@ class PlaceholderOpNode : public OperationNode { /*! * \brief A Compute op that computes a tensor on a certain domain. + * This is the base class for ComputeOp (operating on a scalar at a time) and + * TensorComputeOp (operating on a TensorSlice at a time) */ -class TVM_DLL ComputeOpNode : public OperationNode { +class TVM_DLL BaseComputeOpNode : public OperationNode { public: /*! \brief IterVar on each axis */ Array<IterVar> axis; /*! \brief IterVar on each reduction axis, if the body is a Reduce */ Array<IterVar> reduce_axis; + // override functions + Array<IterVar> root_iter_vars() const final; + Array<Expr> output_shape(size_t idx) const final; + void GatherBound( + const Operation& self, + const std::unordered_map<Tensor, TensorDom>& tensor_dom, + std::unordered_map<IterVar, Range>* out_dom_map) const final; + Stmt BuildRealize( + const Stage& stage, + const std::unordered_map<IterVar, Range>& realize_map, + const Stmt& body) const final; + virtual size_t num_schedulable_dims() const = 0; + + static constexpr const char* _type_key = "BaseComputeOp"; + TVM_DECLARE_BASE_NODE_INFO(BaseComputeOpNode, OperationNode); +}; + +
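Stepping back to the BijectiveLayout interface declared in data_layout.h above, here is a usage sketch (illustrative only, not part of the patch; the commented results assume the returned shape expressions are constant-folded):

#include <tvm/data_layout.h>

void BijectiveLayoutExample() {
  // Build the mapping rules between NCHW and NCHW16c once, then reuse them.
  tvm::BijectiveLayout bij = tvm::BijectiveLayoutNode::make(tvm::Layout("NCHW"),
                                                            tvm::Layout("NCHW16c"));
  tvm::Array<tvm::Expr> src_shape{1, 32, 8, 8};
  // Forward: the C axis (32) is split by factor 16 into 2 outer x 16 inner.
  tvm::Array<tvm::Expr> dst_shape = bij.ForwardShape(src_shape);  // (1, 2, 8, 8, 16)
  // Backward recovers the original shape: (1, 32, 8, 8).
  tvm::Array<tvm::Expr> src_again = bij.BackwardShape(dst_shape);
  (void)src_again;
}

+/*!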
+ * \brief A Compute op that computes a tensor on a certain domain. + */ +class TVM_DLL ComputeOpNode : public BaseComputeOpNode { + public: /*! \brief the compute expression */ Array<Expr> body; /*! \brief constructor */ ComputeOpNode() {} // override functions int num_outputs() const final; - Array<IterVar> root_iter_vars() const final; Type output_dtype(size_t i) const final; - Array<Expr> output_shape(size_t i) const final; Array<Tensor> InputTensors() const final; Operation ReplaceInputs( const Operation& self, @@ -208,18 +231,11 @@ class TVM_DLL ComputeOpNode : public OperationNode { const Operation& self, const std::unordered_map<const Variable*, IntSet>& dom_map, std::unordered_map<Tensor, TensorDom>* out_dom_map) const final; - void GatherBound( - const Operation& self, - const std::unordered_map<Tensor, TensorDom>& tensor_dom, - std::unordered_map<IterVar, Range>* out_dom_map) const final; - Stmt BuildRealize( - const Stage& stage, - const std::unordered_map<IterVar, Range>& realize_map, - const Stmt& body) const final; Stmt BuildProvide( const Stage& stage, const std::unordered_map<IterVar, Range>& dom_map, bool debug_keep_trivial_loop) const final; + size_t num_schedulable_dims() const final; void VisitAttrs(AttrVisitor* v) final { v->Visit("name", &name); @@ -236,18 +252,14 @@ class TVM_DLL ComputeOpNode : public OperationNode { Array<Expr> body); static constexpr const char* _type_key = "ComputeOp"; - TVM_DECLARE_NODE_TYPE_INFO(ComputeOpNode, OperationNode); + TVM_DECLARE_NODE_TYPE_INFO(ComputeOpNode, BaseComputeOpNode); }; /*! * \brief A TensorCompute op that computes a tensor with a tensor intrinsic. */ -class TensorComputeOpNode : public OperationNode { +class TensorComputeOpNode : public BaseComputeOpNode { public: - /*! \brief IterVar on each axis */ - Array<IterVar> axis; - /*! \brief IterVar on each reduction axis, if the intrin will use the reduce axis */ - Array<IterVar> reduce_axis; /*! \brief number of axes that can be scheduled */ int schedulable_ndim; /*! \brief TensorIntrin used to compute */ @@ -260,9 +272,7 @@ TensorComputeOpNode() {} // override functions int num_outputs() const final; - Array<IterVar> root_iter_vars() const final; Type output_dtype(size_t i) const final; - Array<Expr> output_shape(size_t i) const final; Array<Tensor> InputTensors() const final; Operation ReplaceInputs( const Operation& self, @@ -271,18 +281,11 @@ class TensorComputeOpNode : public OperationNode { const Operation& self, const std::unordered_map<const Variable*, IntSet>& dom_map, std::unordered_map<Tensor, TensorDom>* out_dom_map) const final; - void GatherBound( - const Operation& self, - const std::unordered_map<Tensor, TensorDom>& tensor_dom, - std::unordered_map<IterVar, Range>* out_dom_map) const final; - Stmt BuildRealize( - const Stage& stage, - const std::unordered_map<IterVar, Range>& realize_map, - const Stmt& body) const final; Stmt BuildProvide( const Stage& stage, const std::unordered_map<IterVar, Range>& dom_map, bool debug_keep_trivial_loop) const final; + size_t num_schedulable_dims() const final; void VisitAttrs(AttrVisitor* v) final { v->Visit("name", &name); @@ -304,7 +307,7 @@ class TensorComputeOpNode : public OperationNode { Array<Region> regions); static constexpr const char* _type_key = "TensorComputeOp"; - TVM_DECLARE_NODE_TYPE_INFO(TensorComputeOpNode, OperationNode); + TVM_DECLARE_NODE_TYPE_INFO(TensorComputeOpNode, BaseComputeOpNode); }; /*! diff --git a/include/tvm/relay/attrs/transform.h b/include/tvm/relay/attrs/transform.h index fa27a4d437d2..5382017d8c1c 100644 --- a/include/tvm/relay/attrs/transform.h +++ b/include/tvm/relay/attrs/transform.h @@ -96,6 +96,56 @@ struct InitOpAttrs : public tvm::AttrsNode<InitOpAttrs> { } }; // struct InitOpAttrs
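The relay attribute structs added below all follow TVM's AttrsNode pattern. As a minimal hypothetical sketch of that pattern (the struct and field names here are invented for illustration and are not part of this patch):

#include <tvm/attrs.h>

namespace tvm {
namespace relay {

// Hypothetical attrs struct: a field gets a default, a docstring, and is
// registered under a unique type key via TVM_DECLARE_ATTRS.
struct ExampleScaleAttrs : public tvm::AttrsNode<ExampleScaleAttrs> {
  double scale;  // hypothetical field
  TVM_DECLARE_ATTRS(ExampleScaleAttrs, "relay.attrs.ExampleScaleAttrs") {
    TVM_ATTR_FIELD(scale).set_default(1.0)
      .describe("Hypothetical scaling factor; fields without set_default are required.");
  }
};

}  // namespace relay
}  // namespace tvm

+/*!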
\brief Attributes used in arange operators */ +struct ArangeAttrs : public tvm::AttrsNode<ArangeAttrs> { + tvm::Expr start; + tvm::Expr stop; + tvm::Expr step; + DataType dtype; + + TVM_DECLARE_ATTRS(ArangeAttrs, "relay.attrs.ArangeAttrs") { + TVM_ATTR_FIELD(start).set_default(make_const(Float(32), 0)) + .describe("Start of interval. The interval includes this value."); + TVM_ATTR_FIELD(stop) + .describe("Stop of interval. The interval does not include this value."); + TVM_ATTR_FIELD(step).set_default(make_const(Float(32), 1)) + .describe("Spacing between values."); + TVM_ATTR_FIELD(dtype).set_default(NullValue<DataType>()) + .describe("Target data type."); + } +}; // struct ArangeAttrs + +/*! \brief Attributes used in stack operators */ +struct StackAttrs : public tvm::AttrsNode<StackAttrs> { + Integer axis; + TVM_DECLARE_ATTRS(StackAttrs, "relay.attrs.StackAttrs") { + TVM_ATTR_FIELD(axis).set_default(0) + .describe("The axis in the result array along which the input arrays are stacked."); + } +}; // struct StackAttrs + +/*! \brief Attributes used in repeat operators */ +struct RepeatAttrs : public tvm::AttrsNode<RepeatAttrs> { + Integer repeats; + Integer axis; + TVM_DECLARE_ATTRS(RepeatAttrs, "relay.attrs.RepeatAttrs") { + TVM_ATTR_FIELD(repeats) + .describe("The number of repetitions for each element."); + TVM_ATTR_FIELD(axis).set_default(NullValue<Integer>()) + .describe("The axis along which to repeat values."); + } +}; // struct RepeatAttrs + +/*! \brief Attributes used in tile operators */ +struct TileAttrs : public tvm::AttrsNode<TileAttrs> { + Array<Integer> reps; + TVM_DECLARE_ATTRS(TileAttrs, "relay.attrs.TileAttrs") { + TVM_ATTR_FIELD(reps) + .describe("The number of times to repeat the tensor a. " + "Each dim size of reps must be a positive integer."); + } +}; // struct TileAttrs + /*! \brief Attributes used in squeeze operators */ struct SqueezeAttrs : public tvm::AttrsNode<SqueezeAttrs> { // use axis to make the name numpy compatible. diff --git a/include/tvm/relay/attrs/vision.h b/include/tvm/relay/attrs/vision.h index d1a5ea41bc69..20b80f33a2a3 100644 --- a/include/tvm/relay/attrs/vision.h +++ b/include/tvm/relay/attrs/vision.h @@ -58,19 +58,42 @@ struct MultiBoxTransformLocAttrs } }; -/*! \brief Attributes used in non_maximum_suppression operators */ -struct NMSAttrs : public tvm::AttrsNode<NMSAttrs>{ - double overlap_threshold; +/*! \brief Attributes used in get_valid_counts operator */ +struct GetValidCountsAttrs : public tvm::AttrsNode<GetValidCountsAttrs> { + double score_threshold; + + TVM_DECLARE_ATTRS(GetValidCountsAttrs, "relay.attrs.GetValidCountsAttrs") { + TVM_ATTR_FIELD(score_threshold).set_default(0.0) + .describe("Lower limit of score for valid bounding boxes."); + } +}; + +/*! \brief Attributes used in non_maximum_suppression operator */ +struct NonMaximumSuppressionAttrs : public tvm::AttrsNode<NonMaximumSuppressionAttrs> { + int max_output_size; + double iou_threshold; bool force_suppress; - int topk; - - TVM_DECLARE_ATTRS(NMSAttrs, "relay.attrs.NMSAttrs") { - TVM_ATTR_FIELD(overlap_threshold).set_default(0.5) - .describe("Non-maximum suppression threshold."); - TVM_ATTR_FIELD(force_suppress).set_default(false) - .describe("Suppress all detections regardless of class_id."); - TVM_ATTR_FIELD(topk).set_default(-1) - .describe("Keep maximum top k detections before nms, -1 for no limit."); + int top_k; + int id_index; + bool return_indices; + bool invalid_to_bottom; + + TVM_DECLARE_ATTRS(NonMaximumSuppressionAttrs, "relay.attrs.NonMaximumSuppressionAttrs") { + TVM_ATTR_FIELD(max_output_size).set_default(-1) + .describe("Max number of output valid boxes for each instance. " + "By default all valid boxes are returned."); + TVM_ATTR_FIELD(iou_threshold).set_default(0.5) + .describe("Non-maximum suppression threshold."); + TVM_ATTR_FIELD(force_suppress).set_default(false) + .describe("Suppress all detections regardless of class_id."); + TVM_ATTR_FIELD(top_k).set_default(-1) + .describe("Keep maximum top k detections before nms, -1 for no limit."); + TVM_ATTR_FIELD(id_index).set_default(0) + .describe("Axis index of id."); + TVM_ATTR_FIELD(return_indices).set_default(true) + .describe("Whether to return box indices in input data."); + TVM_ATTR_FIELD(invalid_to_bottom).set_default(false) + .describe("Whether to move all invalid bounding boxes to the bottom."); } }; @@ -98,6 +121,55 @@ struct ROIAlignAttrs : public tvm::AttrsNode<ROIAlignAttrs> { } }; +/*! \brief Attributes used in yolo reorg operators */ +struct YoloReorgAttrs : public tvm::AttrsNode<YoloReorgAttrs> { + Integer stride; + + TVM_DECLARE_ATTRS(YoloReorgAttrs, "relay.attrs.YoloReorgAttrs") { + TVM_ATTR_FIELD(stride) + .set_default(1) + .describe("Stride value for yolo reorg"); + } +}; + +/*! \brief Attributes used in proposal operators */ +struct ProposalAttrs : public tvm::AttrsNode<ProposalAttrs> { + Array<IndexExpr> scales; + Array<IndexExpr> ratios; + int feature_stride; + double threshold; + int rpn_pre_nms_top_n; + int rpn_post_nms_top_n; + int rpn_min_size; + bool iou_loss; + + TVM_DECLARE_ATTRS(ProposalAttrs, "relay.attrs.ProposalAttrs") { + TVM_ATTR_FIELD(scales) + .set_default(Array<IndexExpr>({4.0f, 8.0f, 16.0f, 32.0f})) + .describe("Used to generate anchor windows by enumerating scales"); + TVM_ATTR_FIELD(ratios) + .set_default(Array<IndexExpr>({0.5f, 1.0f, 2.0f})) + .describe("Used to generate anchor windows by enumerating ratios"); + TVM_ATTR_FIELD(feature_stride) + .set_default(16) + .describe( + "The size of the receptive field of each unit in the convolution layer of the rpn, " + "for example the product of all strides prior to this layer."); + TVM_ATTR_FIELD(threshold) + .set_default(0.7) + .describe( + "IoU threshold of non-maximum suppression (suppress boxes with IoU >= this threshold)"); + TVM_ATTR_FIELD(rpn_pre_nms_top_n) + .set_default(6000) + .describe("Number of top scoring boxes to apply NMS. -1 to use all boxes"); + TVM_ATTR_FIELD(rpn_post_nms_top_n) + .set_default(300) + .describe("Number of top scoring boxes to keep after applying NMS to RPN proposals"); + TVM_ATTR_FIELD(rpn_min_size).set_default(16).describe("Minimum height or width in proposal"); + TVM_ATTR_FIELD(iou_loss).set_default(false).describe("Usage of IoU Loss"); + } +}; + } // namespace relay } // namespace tvm #endif // TVM_RELAY_ATTRS_VISION_H_
@@ -235,8 +230,8 @@ class FunctionNode : public ExprNode { v->Visit("body", &body); v->Visit("ret_type", &ret_type); v->Visit("type_params", &type_params); - v->Visit("span", &span); v->Visit("attrs", &attrs); + v->Visit("span", &span); v->Visit("_checked_type_", &checked_type_); } @@ -527,6 +522,14 @@ class TempExprNode : public ExprNode { RELAY_DEFINE_NODE_REF(TempExpr, TempExprNode, Expr); // implementations +inline const Type& ExprNode::checked_type() const { + CHECK(checked_type_.defined()) << "internal error: the type checker has " + "not populated the checked_type " + "field for " + << GetRef<Expr>(this); + return this->checked_type_; +} + template<typename TTypeNode> inline const TTypeNode* ExprNode::type_as() const { static_assert(std::is_base_of<TypeNode, TTypeNode>::value, diff --git a/include/tvm/relay/pass.h b/include/tvm/relay/pass.h index b87f9319a3d3..75bfe92ec21c 100644 --- a/include/tvm/relay/pass.h +++ b/include/tvm/relay/pass.h @@ -320,7 +320,18 @@ struct StructuralHash { * * \return expression in A-Normal Form */ -Expr ToANF(const Expr& e, const Module& mod); +Expr ToANormalForm(const Expr& e, const Module& mod); + +/*! \brief Remove let bindings and directly share via pointers instead. + * + * It will remove all let bindings, + * and turn all of the variables bound by let into direct pointer references. + * + * \param e the expression. + * + * \return the expression in graph normal form. + */ +Expr ToGraphNormalForm(const Expr& e); } // namespace relay } // namespace tvm diff --git a/include/tvm/runtime/c_runtime_api.h b/include/tvm/runtime/c_runtime_api.h index b493cf6dc8da..1a1a8da67aed 100644 --- a/include/tvm/runtime/c_runtime_api.h +++ b/include/tvm/runtime/c_runtime_api.h @@ -43,7 +43,7 @@ #endif // TVM version -#define TVM_VERSION "0.5.dev" +#define TVM_VERSION "0.6.dev" // TVM Runtime is DLPack compatible. diff --git a/include/tvm/runtime/ndarray.h b/include/tvm/runtime/ndarray.h index e2a447e4235c..2b9674301607 100644 --- a/include/tvm/runtime/ndarray.h +++ b/include/tvm/runtime/ndarray.h @@ -178,10 +178,30 @@ class NDArray { Container* data_{nullptr}; // enable internal functions friend struct Internal; + friend class TVMPODValue_; + friend class TVMArgValue; friend class TVMRetValue; friend class TVMArgsSetter; }; + +/*! + * \brief The type trait that indicates a subclass of TVM's NDArray. + * For irrelevant classes, code = -1. + * For TVM NDArray itself, code = 0. + * All subclasses of NDArray should override code > 0. + */ +template<typename T> +struct array_type_info { + /*! \brief the value of the traits */ + static const int code = -1; +}; + +// Overrides the type trait for tvm's NDArray. +template<> +struct array_type_info<NDArray> { + static const int code = 0; +}; + /*! * \brief Save a DLTensor to stream * \param strm The output stream @@ -196,7 +216,7 @@ inline bool SaveDLTensor(dmlc::Stream* strm, const DLTensor* tensor); * the pointer to the NDArrayContainer can be directly * interpreted as a DLTensor* * - * \note: do not use this function directly, use NDArray. + * \note do not use this function directly, use NDArray. */ class NDArray::Container { public: @@ -228,16 +248,19 @@ class NDArray::Container { protected: friend class NDArray; + friend class TVMPODValue_; + friend class TVMArgValue; + friend class TVMRetValue; friend class RPCWrappedFunc; /*! * \brief Type flag used to indicate subclass. * Default value 0 means normal NDArray::Container. * * We can extend a more specialized NDArray::Container - * and use the array_type_index_ to indicate + * and use the array_type_code_ to indicate * the specific array subclass.
*/ - uint32_t array_type_index_{0}; + int32_t array_type_code_{0}; /*! \brief The internal reference counter */ std::atomic ref_counter_{0}; /*! diff --git a/include/tvm/runtime/packed_func.h b/include/tvm/runtime/packed_func.h index a3b4a1696bf0..1398da0d748b 100644 --- a/include/tvm/runtime/packed_func.h +++ b/include/tvm/runtime/packed_func.h @@ -362,7 +362,7 @@ inline std::string TVMType2String(TVMType t); * \tparam T the typename */ template -struct extension_class_info { +struct extension_type_info { static const int code = 0; }; @@ -455,6 +455,15 @@ class TVMPODValue_ { TVM_CHECK_TYPE_CODE(type_code_, kTVMContext); return value_.v_ctx; } + template::value>::type> + TNDArray AsNDArray() const { + if (type_code_ == kNull) return TNDArray(nullptr); + auto *container = static_cast(value_.v_handle); + CHECK_EQ(container->array_type_code_, array_type_info::code); + return TNDArray(container); + } template const TExtension& AsExtension() const { CHECK_LT(type_code_, kExtEnd); @@ -561,7 +570,7 @@ class TVMArgValue : public TVMPODValue_ { inline TNodeRef AsNodeRef() const; template::value>::type> + std::is_class::value>::type> inline operator T() const; template::code != 0>::type> + extension_type_info::code != 0>::type> TVMRetValue& operator=(const T& other) { this->SwitchToClass( - extension_class_info::code, other); + extension_type_info::code, other); return *this; } /*! @@ -1094,7 +1103,7 @@ class TVMArgsSetter { // extension template::code != 0>::type> + extension_type_info::code != 0>::type> inline void operator()(size_t i, const T& value) const; // NodeRef related extenstions: in tvm/packed_func_ext.h inline void operator()(size_t i, const NodeRef& other) const; // NOLINT(*) @@ -1212,40 +1221,53 @@ inline R TypedPackedFunc::operator()(Args... 
args) const { // extension and node type handling namespace detail { -template<typename T, typename TSrc, bool is_ext> +template<typename T, typename TSrc, bool is_ext, bool is_nd> struct TVMValueCast { static T Apply(const TSrc* self) { + static_assert(!is_ext && !is_nd, "The default case accepts only non-extensions"); return self->template AsNodeRef<T>(); } }; template<typename T, typename TSrc> -struct TVMValueCast<T, TSrc, true> { +struct TVMValueCast<T, TSrc, true, false> { static T Apply(const TSrc* self) { return self->template AsExtension<T>(); } }; + +template<typename T, typename TSrc> +struct TVMValueCast<T, TSrc, false, true> { + static T Apply(const TSrc* self) { + return self->template AsNDArray<T>(); + } +}; + } // namespace detail template<typename T, typename> inline TVMArgValue::operator T() const { return detail:: - TVMValueCast<T, TVMArgValue, extension_class_info<T>::code != 0> + TVMValueCast<T, TVMArgValue, + (extension_type_info<T>::code != 0), + (array_type_info<T>::code > 0)> ::Apply(this); } template<typename T, typename> inline TVMRetValue::operator T() const { return detail:: - TVMValueCast<T, TVMRetValue, extension_class_info<T>::code != 0> + TVMValueCast<T, TVMRetValue, + (extension_type_info<T>::code != 0), + (array_type_info<T>::code > 0)> ::Apply(this); } template<typename T, typename> inline void TVMArgsSetter::operator()(size_t i, const T& value) const { - static_assert(extension_class_info<T>::code != 0, + static_assert(extension_type_info<T>::code != 0, "Need to have extension code"); - type_codes_[i] = extension_class_info<T>::code; + type_codes_[i] = extension_type_info<T>::code; values_[i].v_handle = const_cast<T*>(&value); } @@ -1262,9 +1284,9 @@ struct ExtTypeInfo { template<typename T> inline ExtTypeVTable* ExtTypeVTable::Register_() { - const int code = extension_class_info<T>::code; + const int code = extension_type_info<T>::code; static_assert(code != 0, - "require extension_class_info traits to be declared with non-zero code"); + "require extension_type_info traits to be declared with non-zero code"); ExtTypeVTable vt; vt.clone = ExtTypeInfo<T>::clone; vt.destroy = ExtTypeInfo<T>::destroy;
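The three-way TVMValueCast dispatch above picks a conversion path at compile time from two boolean traits. A standalone sketch of the same partial-specialization pattern (plain C++, independent of TVM, illustrative only):

#include <iostream>

// Primary template: neither an extension type nor an NDArray subclass.
template <bool is_ext, bool is_nd>
struct Cast {
  static const char* Apply() { return "AsNodeRef"; }
};
// Extension types take the AsExtension path.
template <>
struct Cast<true, false> {
  static const char* Apply() { return "AsExtension"; }
};
// NDArray subclasses take the AsNDArray path.
template <>
struct Cast<false, true> {
  static const char* Apply() { return "AsNDArray"; }
};

int main() {
  std::cout << Cast<false, false>::Apply() << "\n";  // AsNodeRef
  std::cout << Cast<true, false>::Apply() << "\n";   // AsExtension
  std::cout << Cast<false, true>::Apply() << "\n";   // AsNDArray
}

diff --git a/include/tvm/runtime/registry.h b/include/tvm/runtime/registry.h index 9466056a1282..a53a76f4df2e 100644 --- a/include/tvm/runtime/registry.h +++ b/include/tvm/runtime/registry.h @@ -133,7 +133,7 @@ class Registry { /*! * \brief Macro to register extension type. * This must be registered in a cc file - after the trait extension_class_info is defined. + after the trait extension_type_info is defined.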
*/ #define TVM_REGISTER_EXT_TYPE(T) \ TVM_STR_CONCAT(TVM_TYPE_REG_VAR_DEF, __COUNTER__) = \ diff --git a/include/tvm/tensor.h b/include/tvm/tensor.h index 16f7363a9e73..87ced8b3cb2a 100644 --- a/include/tvm/tensor.h +++ b/include/tvm/tensor.h @@ -14,7 +14,7 @@ #include "base.h" #include "expr.h" -#include "ir_operator.h" +#include "expr_operator.h" #include "arithmetic.h" namespace tvm { diff --git a/include/tvm/tvm.h b/include/tvm/tvm.h index 645c68357f13..5f81cb52fa31 100644 --- a/include/tvm/tvm.h +++ b/include/tvm/tvm.h @@ -8,7 +8,7 @@ #include "base.h" #include "expr.h" -#include "ir_operator.h" +#include "expr_operator.h" #include "tensor.h" #include "operation.h" #include "packed_func_ext.h" diff --git a/nnvm/include/nnvm/compiler/packed_func_ext.h b/nnvm/include/nnvm/compiler/packed_func_ext.h index e289fd4efa59..a79574fa0879 100644 --- a/nnvm/include/nnvm/compiler/packed_func_ext.h +++ b/nnvm/include/nnvm/compiler/packed_func_ext.h @@ -40,17 +40,17 @@ namespace tvm { namespace runtime { template<> -struct extension_class_info { +struct extension_type_info { static const int code = 16; }; template<> -struct extension_class_info { +struct extension_type_info { static const int code = 17; }; template<> -struct extension_class_info { +struct extension_type_info { static const int code = 18; }; diff --git a/nnvm/include/nnvm/top/nn.h b/nnvm/include/nnvm/top/nn.h index 143a9548f18a..578f928c5b9f 100644 --- a/nnvm/include/nnvm/top/nn.h +++ b/nnvm/include/nnvm/top/nn.h @@ -443,17 +443,30 @@ struct MultiBoxTransformLocParam : public dmlc::Parameter { - float nms_threshold; +struct NonMaximumSuppressionParam : public dmlc::Parameter { + bool return_indices; + float iou_threshold; bool force_suppress; - int nms_topk; - DMLC_DECLARE_PARAMETER(NMSParam) { - DMLC_DECLARE_FIELD(nms_threshold).set_default(0.5) + int top_k; + int id_index; + int max_output_size; + bool invalid_to_bottom; + DMLC_DECLARE_PARAMETER(NonMaximumSuppressionParam) { + DMLC_DECLARE_FIELD(max_output_size).set_default(-1) + .describe("Max number of output valid boxes for each instance." 
+ "By default all valid boxes are returned."); + DMLC_DECLARE_FIELD(iou_threshold).set_default(0.5) .describe("Non-maximum suppression threshold."); DMLC_DECLARE_FIELD(force_suppress).set_default(false) - .describe("Suppress all detections regardless of class_id."); - DMLC_DECLARE_FIELD(nms_topk).set_default(-1) - .describe("Keep maximum top k detections before nms, -1 for no limit."); + .describe("Suppress all detections regardless of class_id."); + DMLC_DECLARE_FIELD(top_k).set_default(-1) + .describe("Keep maximum top k detections before nms, -1 for no limit."); + DMLC_DECLARE_FIELD(id_index).set_default(0) + .describe("Axis index of id."); + DMLC_DECLARE_FIELD(return_indices).set_default(true) + .describe("Whether to return box indices in input data."); + DMLC_DECLARE_FIELD(invalid_to_bottom).set_default(false) + .describe("Whether to move all invalid bounding boxes to the bottom."); } }; diff --git a/nnvm/python/nnvm/_base.py b/nnvm/python/nnvm/_base.py index 29390a2201bf..dd797ba4489f 100644 --- a/nnvm/python/nnvm/_base.py +++ b/nnvm/python/nnvm/_base.py @@ -31,7 +31,7 @@ class NNVMError(Exception): """Error that will be throwed by all nnvm functions""" - pass + def _load_lib(): """Load libary by searching possible path.""" diff --git a/nnvm/python/nnvm/attribute.py b/nnvm/python/nnvm/attribute.py index a023b9cd88df..4a08bb622ed5 100644 --- a/nnvm/python/nnvm/attribute.py +++ b/nnvm/python/nnvm/attribute.py @@ -42,8 +42,7 @@ def get(self, attr): if attr: ret.update(attr) return ret - else: - return attr + return attr def __enter__(self): # pylint: disable=protected-access diff --git a/nnvm/python/nnvm/compiler/compile_engine.py b/nnvm/python/nnvm/compiler/compile_engine.py index 289f09deb280..e6158fb611fe 100644 --- a/nnvm/python/nnvm/compiler/compile_engine.py +++ b/nnvm/python/nnvm/compiler/compile_engine.py @@ -23,13 +23,11 @@ def graph(self): @tvm.register_node class GraphCacheEntry(tvm.node.NodeBase): """CacheEntry of compilation into a TVM Function""" - pass @tvm.register_node class GraphFunc(tvm.node.NodeBase): """Compiled result of a graph into a TVM Function""" - pass class Engine(object): diff --git a/nnvm/python/nnvm/compiler/graph_attr.py b/nnvm/python/nnvm/compiler/graph_attr.py index 3ce6c4b53239..2f1f0350d71b 100644 --- a/nnvm/python/nnvm/compiler/graph_attr.py +++ b/nnvm/python/nnvm/compiler/graph_attr.py @@ -39,6 +39,7 @@ def set_shape_inputs(g, shape): "uint16": 8, "uint32": 9, "uint64": 10, + "bool": 11, } TCODE_TO_DTYPE = { @@ -54,6 +55,7 @@ def set_shape_inputs(g, shape): 8: "uint16", 9: "uint32", 10: "uint64", + 11: "bool", } def set_dtype_inputs(g, dtype): diff --git a/nnvm/python/nnvm/frontend/caffe2.py b/nnvm/python/nnvm/frontend/caffe2.py index 2450af628a90..8211971a8c3c 100755 --- a/nnvm/python/nnvm/frontend/caffe2.py +++ b/nnvm/python/nnvm/frontend/caffe2.py @@ -73,9 +73,8 @@ def get_converter(cls): if hasattr(cls, '_impl'): return getattr(cls, '_impl') - else: - raise NotImplementedError('{} not implemented'.format( - cls.__name__)) + raise NotImplementedError('{} not implemented'.format( + cls.__name__)) _caffe2_internal_args = { @@ -175,11 +174,10 @@ def _get_axis_from_order_str(order): order = order if isinstance(order, str) else order.decode('UTF-8') if order == 'NCHW': return 1 - elif order == 'NHWC': + if order == 'NHWC': return 3 - else: - raise RuntimeError( - "Unsupported storage order: {} in caffe2".format(order)) + raise RuntimeError( + "Unsupported storage order: {} in caffe2".format(order)) return AttrCvt( op_name='concatenate', diff --git 
a/nnvm/python/nnvm/frontend/coreml.py b/nnvm/python/nnvm/frontend/coreml.py index bc544243bd92..77285efe7a76 100644 --- a/nnvm/python/nnvm/frontend/coreml.py +++ b/nnvm/python/nnvm/frontend/coreml.py @@ -98,33 +98,33 @@ def ActivationParams(op, insym, symtab): par = getattr(op, whichActivation) if whichActivation == 'linear': return _sym.__add_scalar__(_sym.__mul_scalar__(insym, scalar=par.alpha), scalar=par.beta) - elif whichActivation == 'ReLU': + if whichActivation == 'ReLU': return _sym.relu(insym) - elif whichActivation == 'leakyReLU': + if whichActivation == 'leakyReLU': return _sym.leaky_relu(insym, alpha=par.alpha) - elif whichActivation == 'thresholdedReLU': + if whichActivation == 'thresholdedReLU': alpha_tensor = _sym.full_like(insym, fill_value=float(par.alpha)) return _sym.elemwise_mul(insym, _sym.greater(insym, alpha_tensor)) - elif whichActivation == 'PReLU': + if whichActivation == 'PReLU': return _sym.prelu(insym, alpha=par.alpha) - elif whichActivation == 'tanh': + if whichActivation == 'tanh': return _sym.tanh(insym) - elif whichActivation == 'scaledTanh': + if whichActivation == 'scaledTanh': return _sym.__mul_scalar__(_sym.tanh(_sym.__mul_scalar__( insym, scalar=par.beta)), scalar=par.alpha) - elif whichActivation == 'sigmoid': + if whichActivation == 'sigmoid': return _sym.sigmoid(insym) - elif whichActivation == 'sigmoidHard': + if whichActivation == 'sigmoidHard': transformX = (par.alpha * insym) + par.beta return _sym.clip(transformX, a_min=0, a_max=1) - elif whichActivation == 'ELU': + if whichActivation == 'ELU': return _sym.__mul_scalar__(_sym.__add_scalar__( _sym.exp(insym), scalar=-1), scalar=par.alpha) - elif whichActivation == 'softsign': + if whichActivation == 'softsign': return insym / (1 + (_sym.relu(insym) + _sym.relu(_sym.negative(insym)))) - elif whichActivation == 'softplus': + if whichActivation == 'softplus': return _sym.log(_sym.__add_scalar__(_sym.exp(insym), scalar=1)) - elif whichActivation == 'parametricSoftplus': + if whichActivation == 'parametricSoftplus': alpha = list(par.alpha.floatValue) beta = list(par.alpha.floatValue) if len(alpha) == 1: @@ -136,8 +136,7 @@ def ActivationParams(op, insym, symtab): betasym = symtab.new_const(beta) return _sym.broadcast_mul(_sym.log(_sym.broadcast_add( _sym.exp(insym), betasym)), alphasym) - else: - raise NotImplementedError('%s not implemented' % whichActivation) + raise NotImplementedError('%s not implemented' % whichActivation) def ScaleLayerParams(op, insym, symtab): """Scale layer params.""" @@ -157,10 +156,9 @@ def PoolingLayerParams(op, insym, symtab): if op.globalPooling: if op.type == 0: return _sym.global_max_pool2d(insym) - elif op.type == 1: + if op.type == 1: return _sym.global_avg_pool2d(insym) - else: - raise NotImplementedError("Only max and average pooling implemented") + raise NotImplementedError("Only max and average pooling implemented") else: params = {'pool_size':list(op.kernelSize), @@ -190,10 +188,9 @@ def PoolingLayerParams(op, insym, symtab): if op.type == 0: return _sym.max_pool2d(insym, **params) - elif op.type == 1: + if op.type == 1: return _sym.avg_pool2d(insym, **params) - else: - raise NotImplementedError("Only max and average pooling implemented") + raise NotImplementedError("Only max and average pooling implemented") def SoftmaxLayerParams(op, insym, symtab): return _sym.softmax(_sym.flatten(insym)) diff --git a/nnvm/python/nnvm/frontend/darknet.py b/nnvm/python/nnvm/frontend/darknet.py index 18d07d07ac6b..154c83c90ec6 100644 --- a/nnvm/python/nnvm/frontend/darknet.py 
+++ b/nnvm/python/nnvm/frontend/darknet.py @@ -302,18 +302,29 @@ def _darknet_reorg(inputs, attrs): def _darknet_region(inputs, attrs): """Process the region operation.""" - op_name, new_attrs = 'yolo_region', {} - if 'n' in attrs: - new_attrs['n'] = attrs.get('n', 1) - if 'classes' in attrs: - new_attrs['classes'] = attrs.get('classes', 1) - if 'coords' in attrs: - new_attrs['coords'] = attrs.get('coords', 0) - if 'background' in attrs: - new_attrs['background'] = attrs.get('background', 0) - if 'softmax' in attrs: - new_attrs['softmax'] = attrs.get('softmax', 0) - return _darknet_get_nnvm_op(op_name)(*inputs, **new_attrs), None + num = attrs.get('n', 1) + classes = attrs.get('classes', 1) + coords = attrs.get('coords', 0) + background = attrs.get('background', 0) + softmax = attrs.get('softmax', True) + input_shape = attrs.get('shape') + + split_size = classes + coords + 1 + intermediate_shape = (input_shape[0], num, split_size, input_shape[2], input_shape[3]) + data_block = _sym.reshape(inputs[0], shape=intermediate_shape) + split_indices = (2, 4, 5) + split_res = _sym.split(data_block, indices_or_sections=split_indices, axis=2) + split_res0 = _sym.sigmoid(split_res[0]) + if not background: + split_res2 = _sym.sigmoid(split_res[2]) + else: + split_res2 = split_res[2] + if softmax: + split_res3 = _sym.softmax(split_res[3], axis=2) + else: + split_res3 = split_res[3] + concat_list = [split_res0, split_res[1], split_res2, split_res3] + out = _sym.concatenate(*concat_list, axis=2) + return _sym.reshape(out, shape=input_shape), None + + def _darknet_yolo(inputs, attrs): """Process the yolo operation.""" @@ -638,6 +649,7 @@ def _get_darknet_attrs(self, layer, layer_num): attr.update({'coords' : layer.coords}) attr.update({'background' : layer.background}) attr.update({'softmax' : layer.softmax}) + attr.update({'shape' : (1, layer.c, layer.h, layer.w)}) elif LAYERTYPE.YOLO == layer.type: attr.update({'n' : layer.n}) @@ -921,8 +933,6 @@ def _make_outlist(self, sym, op_name, layer, layer_num): if layer_num != self.net.n-1: self._outs.insert(0, sym) - return - def from_darknet(self): """To convert the darknet symbol to nnvm symbols.""" for i in range(self.net.n): diff --git a/nnvm/python/nnvm/frontend/keras.py b/nnvm/python/nnvm/frontend/keras.py index 9dabebc14b90..56758ada5f46 100644 --- a/nnvm/python/nnvm/frontend/keras.py +++ b/nnvm/python/nnvm/frontend/keras.py @@ -47,35 +47,34 @@ def _convert_activation(insym, keras_layer, _): beta = keras_layer.beta if hasattr(keras_layer, "beta") else 0 return _sym.__add_scalar__(_sym.__mul_scalar__(insym, \ scalar=alpha), scalar=beta) - elif act_type == 'softmax': + if act_type == 'softmax': return _sym.softmax(insym, axis=1) - elif act_type == 'sigmoid': + if act_type == 'sigmoid': return _sym.sigmoid(insym) - elif act_type == 'tanh': + if act_type == 'tanh': return _sym.tanh(insym) - elif act_type == 'relu': + if act_type == 'relu': return _sym.relu(insym) - elif act_type == 'softplus': + if act_type == 'softplus': return _sym.log(_sym.__add_scalar__(_sym.exp(insym), scalar=1)) - elif act_type == 'elu': + if act_type == 'elu': alpha = keras_layer.alpha if hasattr(keras_layer, "alpha") else 1 return _get_elu(insym, alpha) - elif act_type == 'selu': + if act_type == 'selu': # Alpha, Gamma values, obtained from https://arxiv.org/abs/1706.02515 alpha = keras_layer.alpha if hasattr(keras_layer, "alpha") \ else 1.6732632423543772848170429916717 gamma = keras_layer.gamma if hasattr(keras_layer, "gamma") \ else 1.0507009873554804934193349852946 return gamma * _get_elu(insym, alpha) - elif act_type
== 'relu6': + if act_type == 'relu6': return _sym.clip(insym, a_min=0, a_max=6) - elif act_type == 'softsign': + if act_type == 'softsign': return insym / (1 + (_sym.relu(insym) + _sym.relu(_sym.negative(insym)))) - elif act_type == 'hard_sigmoid': + if act_type == 'hard_sigmoid': transformX = (0.2 * insym) + 0.5 return _sym.clip(transformX, a_min=0, a_max=1) - else: - raise TypeError("Unsupported activation type : {}".format(act_type)) + raise TypeError("Unsupported activation type : {}".format(act_type)) def _convert_advanced_activation(insym, keras_layer, symtab): @@ -84,12 +83,12 @@ def _convert_advanced_activation(insym, keras_layer, symtab): if keras_layer.max_value: return _sym.clip(insym, a_min=0, a_max=keras_layer.max_value) return _sym.relu(insym) - elif act_type == 'LeakyReLU': + if act_type == 'LeakyReLU': return _sym.leaky_relu(insym, alpha=keras_layer.alpha) - elif act_type == 'ELU': + if act_type == 'ELU': alpha = keras_layer.alpha if hasattr(keras_layer, "alpha") else 1 return _get_elu(insym, alpha) - elif act_type == 'PReLU': + if act_type == 'PReLU': assert hasattr(keras_layer, "alpha"), \ "alpha required for PReLU." _check_data_format(keras_layer) @@ -97,12 +96,11 @@ def _convert_advanced_activation(insym, keras_layer, symtab): return -symtab.new_const(keras_layer.get_weights()[0] \ .transpose(np.roll(range(size), 1))) \ * _sym.relu(-insym) + _sym.relu(insym) - elif act_type == 'ThresholdedReLU': + if act_type == 'ThresholdedReLU': theta = keras_layer.theta if hasattr(keras_layer, "theta") else 1.0 theta_tensor = _sym.full_like(insym[0], fill_value=float(theta)) return _sym.elemwise_mul(insym[0], _sym.greater(insym[0], theta_tensor, out_type="float32")) - else: - raise TypeError("Unsupported advanced activation type : {}".format(act_type)) + raise TypeError("Unsupported advanced activation type : {}".format(act_type)) def _convert_merge(insym, keras_layer, _): @@ -280,31 +278,29 @@ def _convert_pooling(insym, keras_layer, symtab): # global pool in keras = global pool + flatten in nnvm if pool_type == 'GlobalMaxPooling2D': return _convert_flatten(_sym.global_max_pool2d(insym), keras_layer, symtab) - elif pool_type == 'GlobalAveragePooling2D': + if pool_type == 'GlobalAveragePooling2D': return _convert_flatten(_sym.global_avg_pool2d(insym), keras_layer, symtab) + pool_h, pool_w = keras_layer.pool_size + stride_h, stride_w = keras_layer.strides + params = {'pool_size': [pool_h, pool_w], + 'strides': [stride_h, stride_w], + 'padding': [0, 0]} + if keras_layer.padding == 'valid': + pass + elif keras_layer.padding == 'same': + in_h = keras_layer.input_shape[1] + in_w = keras_layer.input_shape[2] + pad_t, pad_b = _get_pad_pair(in_h, pool_h, stride_h) + pad_l, pad_r = _get_pad_pair(in_w, pool_w, stride_w) + params['padding'] = [pad_t, pad_l, pad_b, pad_r] else: - pool_h, pool_w = keras_layer.pool_size - stride_h, stride_w = keras_layer.strides - params = {'pool_size': [pool_h, pool_w], - 'strides': [stride_h, stride_w], - 'padding': [0, 0]} - if keras_layer.padding == 'valid': - pass - elif keras_layer.padding == 'same': - in_h = keras_layer.input_shape[1] - in_w = keras_layer.input_shape[2] - pad_t, pad_b = _get_pad_pair(in_h, pool_h, stride_h) - pad_l, pad_r = _get_pad_pair(in_w, pool_w, stride_w) - params['padding'] = [pad_t, pad_l, pad_b, pad_r] - else: - raise TypeError("Unsupported padding type : {}".format(keras_layer.padding)) - if pool_type == 'MaxPooling2D': - return _sym.max_pool2d(insym, **params) - elif pool_type == 'AveragePooling2D': - # TODO: in keras, padded 
zeros are not calculated - return _sym.avg_pool2d(insym, **params) - else: - raise TypeError("Unsupported pooling type : {}".format(keras_layer)) + raise TypeError("Unsupported padding type : {}".format(keras_layer.padding)) + if pool_type == 'MaxPooling2D': + return _sym.max_pool2d(insym, **params) + if pool_type == 'AveragePooling2D': + # TODO: in keras, padded zeros are not calculated + return _sym.avg_pool2d(insym, **params) + raise TypeError("Unsupported pooling type : {}".format(keras_layer)) def _convert_upsample(insym, keras_layer, _): diff --git a/nnvm/python/nnvm/frontend/mxnet.py b/nnvm/python/nnvm/frontend/mxnet.py index 2cf701ea9040..c9f6777e4898 100644 --- a/nnvm/python/nnvm/frontend/mxnet.py +++ b/nnvm/python/nnvm/frontend/mxnet.py @@ -189,6 +189,19 @@ def _reshape(inputs, attrs): new_attrs['shape'] = _required_attr(attrs, 'shape') return _get_nnvm_op(op_name)(*inputs, **new_attrs) +def _slice(inputs, attrs): + begin = attrs.get('begin', None) + end = attrs.get('end', None) + stride = attrs.get('step', None) + if begin is None or end is None: + raise RuntimeError('begin and end are required params') + if 'None' in begin or 'None' in end: + raise RuntimeError('None in begin or end not supported yet...') + new_attrs = {'begin': begin, 'end': end} + if stride is not None: + new_attrs['stride'] = stride + return _get_nnvm_op('strided_slice')(inputs[0], **new_attrs) + def _split(inputs, attrs): op_name, new_attrs = 'split', {} axis = attrs.get('axis', 1) @@ -232,11 +245,11 @@ def _contrib_multibox_detection(inputs, attrs): if attrs.get('variances') is not None else (0.1, 0.1, 0.2, 0.2) nms_topk = attrs.get('nms_topk') or -1 new_attrs0 = {'clip': clip, 'threshold': float(threshold), 'variances': variances} - new_attrs1 = {'nms_threshold': float(nms_threshold), 'force_suppress': force_suppress, - 'nms_topk': int(nms_topk)} + new_attrs1 = {'return_indices': False, 'iou_threshold': float(nms_threshold), + 'force_suppress': force_suppress, 'top_k': int(nms_topk)} data, valid_count = _get_nnvm_op('multibox_transform_loc')(inputs[0], inputs[1], inputs[2], **new_attrs0) - return _get_nnvm_op('nms')(data, valid_count, **new_attrs1) + return _get_nnvm_op('non_max_suppression')(data, valid_count, **new_attrs1) def _elemwise_sum(inputs, _): new_attrs = {'num_args':len(inputs)} @@ -281,10 +294,15 @@ def _symbol_ring_buffer(inputs, attrs): def _copy(inputs, _): return _get_nnvm_op('copy')(inputs[0], **{}) - def _argmax(inputs, attrs): return _get_nnvm_op('argmax')(*inputs, **attrs) +def _minimum(inputs, attrs): + return _get_nnvm_op('broadcast_min')(*inputs, **attrs) + +def _maximum(inputs, attrs): + return _get_nnvm_op('broadcast_max')(*inputs, **attrs) + def _ones(_, attrs): op_name = 'ones' return _get_nnvm_op(op_name)(**attrs) @@ -317,7 +335,7 @@ def _argmin(inputs, attrs): 'flatten', 'log', 'log_softmax', 'max', 'min', 'negative', 'ones_like', 'relu', 'sigmoid', 'slice_like', 'softmax', 'sum', 'tanh', 'transpose', 'zeros_like', 'gather_nd', - 'reshape_like'] + 'reshape_like', 'where'] _convert_map = { '_copy' : _rename('copy'), @@ -329,6 +347,8 @@ def _argmin(inputs, attrs): '_rminus_scalar': _rename('__rsub_scalar__'), '_contrib_MultiBoxPrior' : _rename('multibox_prior'), '_contrib_MultiBoxDetection' : _contrib_multibox_detection, + '_minimum' : _minimum, + '_maximum' : _maximum, '_ones' : _ones, '_zeros' : _zeros, 'argmax' : _argmax, @@ -349,6 +369,7 @@ def _argmin(inputs, attrs): 'Pooling' : _pooling, 'Pooling_v1' : _pooling, 'Reshape' : _reshape, + 'slice' : _slice, 'SliceChannel' : 
_split, 'split' : _split, 'Softmax' : _rename('softmax'), @@ -438,7 +459,7 @@ def _topo_sort(symbol): if childs is None: dep_cnts[name] = 0 else: - dep_cnts[name] = len(set([c.attr('name') for c in childs])) + dep_cnts[name] = len({c.attr('name') for c in childs}) for child in childs: child_name = child.attr('name') if child_name not in deps: diff --git a/nnvm/python/nnvm/frontend/onnx_caffe2_utils.py b/nnvm/python/nnvm/frontend/onnx_caffe2_utils.py index 4dfc366d0b6f..ff74016cde06 100644 --- a/nnvm/python/nnvm/frontend/onnx_caffe2_utils.py +++ b/nnvm/python/nnvm/frontend/onnx_caffe2_utils.py @@ -9,8 +9,7 @@ def _impl(attr): kernel = attr['kernel_shape'] if len(kernel) == 2: return prefix + '2d' + surfix - else: - raise NotImplementedError("Only 2d kernel supported.") + raise NotImplementedError("Only 2d kernel supported.") return _impl diff --git a/nnvm/python/nnvm/frontend/tensorflow.py b/nnvm/python/nnvm/frontend/tensorflow.py index 3099911b86d0..777ab8a80adf 100644 --- a/nnvm/python/nnvm/frontend/tensorflow.py +++ b/nnvm/python/nnvm/frontend/tensorflow.py @@ -68,8 +68,7 @@ def _impl(attr): kernel = attr['kernel_shape'] if len(kernel) == 2: return prefix + '2d' + surfix - else: - raise NotImplementedError("Only 2d kernel supported.") + raise NotImplementedError("Only 2d kernel supported.") return _impl def _dimension_constraint(): @@ -433,8 +432,7 @@ def _impl(inputs, attr, params): op_name="reshape", extras={'shape':tuple(params_new[0].asnumpy().flatten())}, ignores=['Tshape'])(inputs, attr) - else: - raise RuntimeError("Reshape with dynamic shape input not supported yet.") + raise RuntimeError("Reshape with dynamic shape input not supported yet.") return _impl def _bias_add(): @@ -886,6 +884,11 @@ def _expand_dims_0d_aware(data, attr, axis, num_newaxis=1): return _sym.expand_dims(data, axis=axis, num_newaxis=num_newaxis) +def _logical(name): + def _impl(inputs, attr, params): + return AttrCvt(op_name=name)(inputs, attr) + return _impl + # compatible operators that do NOT require any conversion. _identity_list = [] @@ -948,6 +951,9 @@ def _expand_dims_0d_aware(data, attr, axis, num_newaxis=1): 'Transpose' : _transpose(), 'Tanh' : AttrCvt('tanh'), 'Mean' : _mean(), + 'LogicalAnd' : _logical('logical_and'), + 'LogicalOr' : _logical('logical_or'), + 'LogicalNot' : _logical('logical_not'), 'Less' : _broadcast('less'), 'Greater' : _broadcast('greater'), 'LessEqual' : _broadcast('less_equal'), @@ -1411,7 +1417,7 @@ def _parse_param(self, key, value, name): self._nodes[name] = _sym.Variable(name=name, shape=self._params[name].shape) else: - if key != 'dtype' and key != '_output_shapes' and key != '_class': + if key not in ('dtype', '_output_shapes', '_class'): raise NotImplementedError \ ("Other attributes for a Const(param) Node {} ? 
.".format(key)) diff --git a/nnvm/python/nnvm/frontend/util/__init__.py b/nnvm/python/nnvm/frontend/util/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/nnvm/python/nnvm/symbol.py b/nnvm/python/nnvm/symbol.py index 0acacb247a2c..ec8853c3d118 100644 --- a/nnvm/python/nnvm/symbol.py +++ b/nnvm/python/nnvm/symbol.py @@ -50,10 +50,9 @@ def __add__(self, other): """x.__add__(y) <=> x+y""" if isinstance(other, Symbol): return __add_symbol__(self, other) - elif isinstance(other, _Number): + if isinstance(other, _Number): return __add_scalar__(self, scalar=other) - else: - raise TypeError("type %s not supported" % str(type(other))) + raise TypeError("type %s not supported" % str(type(other))) def __radd__(self, other): return self.__add__(other) @@ -64,14 +63,12 @@ def __sub__(self, other): return __sub_symbol__(self, other) if isinstance(other, _Number): return __sub_scalar__(self, scalar=other) - else: - raise TypeError('type %s not supported' % str(type(other))) + raise TypeError('type %s not supported' % str(type(other))) def __rsub__(self, other): if isinstance(other, _Number): return __rsub_scalar__(self, scalar=other) - else: - raise TypeError('type %s not supported' % str(type(other))) + raise TypeError('type %s not supported' % str(type(other))) def __mul__(self, other): """x.__mul__(y) <=> x*y""" @@ -79,8 +76,7 @@ def __mul__(self, other): return __mul_symbol__(self, other) if isinstance(other, _Number): return __mul_scalar__(self, scalar=other) - else: - raise TypeError('type %s not supported' % str(type(other))) + raise TypeError('type %s not supported' % str(type(other))) def __rmul__(self, other): return self.__mul__(other) @@ -91,28 +87,24 @@ def __div__(self, other): return __div_symbol__(self, other) if isinstance(other, _Number): return __div_scalar__(self, scalar=other) - else: - raise TypeError('type %s not supported' % str(type(other))) + raise TypeError('type %s not supported' % str(type(other))) def __rdiv__(self, other): if isinstance(other, _Number): return __rdiv_scalar__(self, scalar=other) - else: - raise TypeError('type %s not supported' % str(type(other))) + raise TypeError('type %s not supported' % str(type(other))) def __lshift__(self, other): """x.__lshift__(y) <=> x << y""" if isinstance(other, _Number): return __lshift_scalar__(self, scalar=other) - else: - raise TypeError('type %s not supported' % str(type(other))) + raise TypeError('type %s not supported' % str(type(other))) def __rshift__(self, other): """x.__rshift__(y) <=> x >> y""" if isinstance(other, _Number): return __rshift_scalar__(self, scalar=other) - else: - raise TypeError('type %s not supported' % str(type(other))) + raise TypeError('type %s not supported' % str(type(other))) def __truediv__(self, other): return self.__div__(other) @@ -126,14 +118,12 @@ def __pow__(self, other): return __pow_symbol__(self, other) if isinstance(other, _Number): return __pow_scalar__(self, scalar=other) - else: - raise TypeError('type %s not supported' % str(type(other))) + raise TypeError('type %s not supported' % str(type(other))) def __rpow__(self, other): if isinstance(other, _Number): return __rpow_scalar__(self, scalar=other) - else: - raise TypeError('type %s not supported' % str(type(other))) + raise TypeError('type %s not supported' % str(type(other))) def __neg__(self): """x.__neg__() <=> -x""" @@ -238,12 +228,11 @@ def _get_list_copt(self, option): """internal function to get list option""" if option == 'all': return _ctypes.c_int(0) - elif option == 'read_only': + if 
option == 'read_only': return _ctypes.c_int(1) - elif option == 'aux_state': + if option == 'aux_state': return _ctypes.c_int(2) - else: - raise ValueError("option need to be in {'all', 'read_only, 'aux_state'}") + raise ValueError("option need to be in {'all', 'read_only, 'aux_state'}") def list_input_variables(self, option='all'): """List all the input variables in the symbol. diff --git a/nnvm/python/nnvm/testing/check_computation.py b/nnvm/python/nnvm/testing/check_computation.py index 7ab4dc0d4c6c..68419b73523b 100644 --- a/nnvm/python/nnvm/testing/check_computation.py +++ b/nnvm/python/nnvm/testing/check_computation.py @@ -8,10 +8,12 @@ import tvm from tvm.contrib import graph_runtime from tvm.testing import check_numerical_grads +from tvm import relay import nnvm from nnvm.compiler import graph_util from nnvm.compiler.graph_attr import TCODE_TO_DTYPE, DTYPE_TO_TCODE +from nnvm.to_relay import to_relay from .config import ctx_list def infer_shapes_dtypes(graph, shape=None, dtype=None, fallback_dtype=None): @@ -441,6 +443,23 @@ def check_function(symbol, forward=None, backward=None, grad_input_vars=None, debug_stage = "running" nnvm_res = main_function(**np_inputs) + try: + logging.debug("checking to_relay conversion") + inputs = np_inputs_without_head_grads.copy() + func, inputs = to_relay(main_graph, shape, dtype, params=inputs) + with relay.build_config(opt_level=3): + graph, lib, params = relay.build(func, target=target) + m = graph_runtime.create(graph, lib, ctx) + m.set_input(**inputs) + m.set_input(**params) + m.run() + for i in range(out_len): + relay_out = m.get_output(i).asnumpy() + tvm.testing.assert_allclose(nnvm_res[i], relay_out, atol=atol, rtol=rtol) + except NotImplementedError as err: + # the NNVM operator is not supported yet + logging.warning(err) + if backward_graph is not None: grad_var_names = [x.attr('name') for x in grad_input_vars] nnvm_grads = {x: v for x, v in zip(grad_var_names, nnvm_res[out_len:])} diff --git a/nnvm/python/nnvm/testing/inception_v3.py b/nnvm/python/nnvm/testing/inception_v3.py index f14daa1ae656..3faded3b2ece 100644 --- a/nnvm/python/nnvm/testing/inception_v3.py +++ b/nnvm/python/nnvm/testing/inception_v3.py @@ -23,11 +23,10 @@ def Conv(data, num_filter, kernel=(1, 1), stride=(1, 1), pad=(0, 0), name=None, def Pooling(data, kernel, stride, pad, pool_type, name): if pool_type == 'max': return sym.max_pool2d(data=data, pool_size=kernel, strides=stride, padding=pad, name=name) - elif pool_type == 'avg': + if pool_type == 'avg': return sym.avg_pool2d(data=data, pool_size=kernel, strides=stride, padding=pad, name=name, count_include_pad=True) - else: - raise ValueError("Invalid pooling type: " + pool_type) + raise ValueError("Invalid pooling type: " + pool_type) def Inception7A(data, num_1x1, diff --git a/nnvm/python/nnvm/testing/yolo_detection.py b/nnvm/python/nnvm/testing/yolo_detection.py index 7c600d38db62..3d9f2cacd482 100644 --- a/nnvm/python/nnvm/testing/yolo_detection.py +++ b/nnvm/python/nnvm/testing/yolo_detection.py @@ -88,7 +88,6 @@ def _get_yolo_detections(l, im_shape, net_shape, thresh, relative, dets): before_correct_dets.append(detection) dets.extend(_correct_boxes(before_correct_dets, im_shape[0], im_shape[1], net_shape[0], net_shape[1], relative)) - return def _get_region_detections(l, im_shape, net_shape, thresh, relative, dets): data = l['output'] @@ -114,7 +113,6 @@ def _get_region_detections(l, im_shape, net_shape, thresh, relative, dets): _correct_boxes(before_correct_dets, im_shape[0], im_shape[1], net_shape[0], 
net_shape[1], relative) dets.extend(before_correct_dets) - return def fill_network_boxes(net_shape, im_shape, thresh, relative, tvm_out): diff --git a/nnvm/python/nnvm/to_relay.py b/nnvm/python/nnvm/to_relay.py index 030fe9991331..7d792116b104 100644 --- a/nnvm/python/nnvm/to_relay.py +++ b/nnvm/python/nnvm/to_relay.py @@ -6,7 +6,8 @@ from tvm import relay, nd from tvm.relay import op, expr, var from tvm.relay.frontend.common import StrAttrsDict -from tvm.relay.frontend.nnvm_common import _rename +from tvm.relay.frontend.nnvm_common import _rename, _binop_scalar, _rbinop_scalar, \ + _elemwise_sum, _softmax_op, _compare, _reduce from .symbol import Symbol from .compiler import graph_attr from .graph import create as graph_create @@ -25,11 +26,6 @@ def _dense(children, attrs, odtype='float32'): else: return dense -def _nn_softmax(children, attrs, odtype='float32'): - assert len(children) == 1 - axis = attrs.get_int('axis', 1) - return op.nn.softmax(children[0], axis) - def _conv2d(children, attrs, odtype='float32'): use_bias = attrs.get_bool('use_bias', True) @@ -150,84 +146,6 @@ def _transpose(children, attrs, odtype='float32'): return op.transpose(children[0], axes=axes) -def _add(children, attrs, odtype='float32'): - if len(children) == 1: - left = children[0] - scalar = attrs.get_float('scalar') - right = relay.const(scalar, dtype=odtype) - else: - assert len(children) == 2 - left = children[0] - right = children[1] - - return op.add(left, right) - - -def _subtract(children, attrs, odtype='float32'): - if len(children) == 1: - left = children[0] - scalar = attrs.get_float('scalar') - right = relay.const(scalar, dtype=odtype) - else: - assert len(children) == 2 - left = children[0] - right = children[1] - - return op.subtract(left, right) - - -def _rsubtract(children, attrs, odtype='float32'): - if len(children) == 1: - left = children[0] - scalar = attrs.get_float('scalar') - right = relay.const(scalar, dtype=odtype) - else: - assert len(children) == 2 - left = children[0] - right = children[1] - - return op.subtract(right, left) - - -def _multiply(children, attrs, odtype='float32'): - if len(children) == 1: - left = children[0] - scalar = attrs.get_float('scalar') - right = relay.const(scalar, dtype=odtype) - else: - assert len(children) == 2 - left = children[0] - right = children[1] - - return op.multiply(left, right) - - -def _divide(children, attrs, odtype='float32'): - if len(children) == 1: - left = children[0] - scalar = attrs.get_float('scalar') - right = relay.const(scalar, dtype=odtype) - else: - assert len(children) == 2 - left = children[0] - right = children[1] - - return op.divide(left, right) - - -def _rshift(children, attrs, odtype='float32'): - if len(children) == 1: - left = children[0] - scalar = attrs.get_float('scalar') - right = relay.const(scalar, dtype='int32') - else: - assert len(children) == 2 - left = children[0] - right = children[1] - - return op.right_shift(left, right) - - def _clip(children, attrs, odtype='float32'): a_min = attrs.get_float('a_min') a_max = attrs.get_float('a_max') @@ -255,9 +173,6 @@ def broadcast_to(children, attrs, odtype='float32'): rconst = relay.Constant(nd.array(array)) return op.broadcast_to_like(data, rconst) -def _copy(children, attrs, odtype='float32'): - return op.copy(children[0]) - def _global_avg_pool2d(children, attrs, odtype='float32'): data = children[0] @@ -309,42 +224,10 @@ def _full_like(children, attrs, odtype='float32'): return op.full_like(children[0], fill_value) -def _greater(children, attrs, odtype='float32'): 
- out_type = attrs.get_str('out_type') - if out_type: - return op.greater(children[0], children[1]).astype(out_type) - else: - return op.greater(children[0], children[1]) - - -def _greater_equal(children, attrs, odtype='float32'): - out_type = attrs.get_str('out_type', None) - if out_type: - return op.greater_equal(children[0], children[1]).astype(out_type) - else: - return op.greater_equal(children[0], children[1]) - - -def _less(children, attrs, odtype='float32'): - out_type = attrs.get_str('out_type', None) - if out_type: - return op.less(children[0], children[1]).astype(out_type) - else: - return op.less(children[0], children[1]) - - -def _less_equal(children, attrs, odtype='float32'): - out_type = attrs.get_str('out_type', None) - if out_type: - return op.less_equal(children[0], children[1]).astype(out_type) - else: - return op.less_equal(children[0], children[1]) - - def _strided_slice(children, attrs, odtype='float32'): begin = attrs.get_int_list('begin') end = attrs.get_int_list('end') - strides = attrs.get_int_list('strides', None) + strides = attrs.get_int_list('stride', None) return op.strided_slice(children[0], begin, end, strides=strides) @@ -358,14 +241,11 @@ def _split(children, attrs, odtype='float32'): axis = attrs.get_int('axis', 0) - return op.split(children[0], indices_or_sections, axis) + return op.split(children[0], indices_or_sections, axis).astuple() def _squeeze(children, attrs, odtype='float32'): - axis = None - try: - axis = [attrs.get_int('axis', None)] - except ValueError: - axis = axis or attrs.get_int_tuple('axis', None) + axis = attrs.get_int_tuple('axis', None) + axis = [axis] if isinstance(axis, int) else axis return op.squeeze(children[0], axis) @@ -377,42 +257,110 @@ def _dropout(children, attrs, odtype='float32'): rate = attrs.get_float('rate', 0.5) return op.nn.dropout(children[0], rate) +def _mean(children, attrs, odtype='float32'): + axis = attrs.get_int_tuple('axis', None) + keepdims = attrs.get_bool('keepdims') + + return op.mean(children[0], axis, keepdims) + + +def _prelu(children, attrs, odtype='float32'): + axis = attrs.get_int('axis', 1) + return op.nn.prelu(children[0], children[1], axis) + + +def _lrn(children, attrs, odtype='float32'): + size = attrs.get_int("size", 5) + axis = attrs.get_int("axis", 1) + bias = attrs.get_float("bias", 2) + alpha = attrs.get_float("alpha", 1e-05) + beta = attrs.get_float("beta", 0.75) + return op.nn.lrn(children[0], size, axis, bias, alpha, beta) + + +def _l2_nomalize(children, attrs, odtype='float32'): + eps = attrs.get_float('eps') + axis = attrs.get_int_tuple('axis', None) + return op.nn.l2_normalize(children[0], eps, axis) + + +def _take(children, attrs, odtype='float32'): + axis = attrs.get_int('axis', None) + return op.take(children[0], children[1], axis) + + +def _matmul(children, attrs, odtype='float32'): + input_1_t = op.transpose(children[1], axes=(1, 0)) + return op.nn.dense(children[0], input_1_t) + + +def _collapse_sum(children, attrs, odtype='float32'): + for key in ["axis", "keepdims", "exclude"]: + if key in attrs.attrs: + raise NotImplementedError("Parameter '" + key + "' is not supported.") + return op.collapse_sum_like(children[0], children[1]) + + +def _not_implemented(new_op): + def _impl(children, attrs, odtype='float32'): + raise NotImplementedError(str(new_op) + " is not implemented.") + return _impl + NNVM_OP_2_RELAY_OP = { 'flatten': _nn_batch_flatten, 'dense': _dense, - 'softmax': _nn_softmax, + 'softmax': _softmax_op(op.nn.softmax), + 'log_softmax': _softmax_op(op.nn.log_softmax), 
'conv2d': _conv2d, 'batch_norm': _batch_norm, 'max_pool2d': _max_pool2d, 'reshape': _reshape, 'transpose': _transpose, 'dropout': _dropout, + 'mean': _mean, # Addition - '__add_scalar__': _add, - 'broadcast_add': _add, - 'elemwise_add': _add, + '__add_scalar__': _binop_scalar(op.add), + 'broadcast_add' : _rename(op.add), + 'elemwise_add' : _rename(op.add), # Subtraction - '__sub_scalar__': _subtract, - '__rsub_scalar__': _rsubtract, - 'broadcast_sub': _subtract, - 'elemwise_sub': _subtract, + '__sub_scalar__' : _binop_scalar(op.subtract), + '__rsub_scalar__': _rbinop_scalar(op.subtract), + 'broadcast_sub' : _rename(op.subtract), + 'elemwise_sub' : _rename(op.subtract), # Multiply - '__mul_scalar__': _multiply, - 'broadcast_mul': _multiply, - 'elemwise_mul': _multiply, + '__mul_scalar__': _binop_scalar(op.multiply), + 'broadcast_mul' : _rename(op.multiply), + 'elemwise_mul' : _rename(op.multiply), # Division - '__div_scalar__': _divide, - 'broadcast_div': _divide, - 'elemwise_div': _divide, + '__div_scalar__': _binop_scalar(op.divide), + 'broadcast_div' : _rename(op.divide), + 'elemwise_div' : _rename(op.divide), + 'broadcast_mod' : _rename(op.mod), # Negative 'negative': _rename("negative"), + # Power + '__pow_scalar__': _binop_scalar(op.power), + '__rpow_scalar__': _rbinop_scalar(op.power), + 'broadcast_pow': _rename(op.power), + # Sum + 'sum': _reduce(op.sum), + 'elemwise_sum': _elemwise_sum, + 'collapse_sum': _collapse_sum, + 'broadcast_max': _rename(op.maximum), + 'broadcast_min': _rename(op.minimum), # Comparsion - 'greater': _greater, - 'greater_equal': _greater_equal, - 'less': _less, - 'less_equal': _less_equal, + 'greater': _compare(op.greater), + 'broadcast_greater': _compare(op.greater), + 'greater_equal': _compare(op.greater_equal), + 'broadcast_greater_equal': _compare(op.greater_equal), + 'less': _compare(op.less), + 'broadcast_less': _compare(op.less), + 'less_equal': _compare(op.less_equal), + 'broadcast_less_equal': _compare(op.less_equal), + 'broadcast_equal': _compare(op.equal), + 'broadcast_not_equal': _compare(op.not_equal), # Activations 'sigmoid': _rename('sigmoid'), @@ -421,13 +369,17 @@ def _dropout(children, attrs, odtype='float32'): 'log': _rename('log'), 'tanh': _rename('tanh'), 'leaky_relu': _leaky_relu, + 'prelu': _prelu, 'clip': _clip, 'round': _rename('round'), 'cast': _cast, 'expand_dims': _expand_dims, 'broadcast_to': broadcast_to, - '__rshift_scalar__': _rshift, - 'copy': _copy, + '__lshift_scalar__': _binop_scalar(op.left_shift), + '__rshift_scalar__': _binop_scalar(op.right_shift), + 'broadcast_left_shift': _rename(op.left_shift), + 'broadcast_right_shift': _rename(op.right_shift), + 'copy': _rename(op.copy), 'global_avg_pool2d': _global_avg_pool2d, 'avg_pool2d': _avg_pool2d, 'conv2d_transpose': _conv2d_transpose, @@ -438,6 +390,21 @@ def _dropout(children, attrs, odtype='float32'): 'split': _split, 'squeeze': _squeeze, 'concatenate': _concatenate, + 'abs': _rename(op.abs), + 'ceil': _rename(op.ceil), + 'floor': _rename(op.floor), + 'trunc': _rename(op.trunc), + 'take': _take, + 'lrn': _lrn, + 'l2_normalize': _l2_nomalize, + 'matmul': _matmul, + 'zeros_like': _rename(op.zeros_like), + 'reshape_like': _rename(op.reshape_like), + 'ones_like': _rename(op.ones_like), + + 'expand_like': _not_implemented("expand_like"), + 'gather_nd': _not_implemented("gather_nd"), + 'block_grad': _not_implemented("block_grad"), } diff --git a/nnvm/python/nnvm/top/attr_dict.py b/nnvm/python/nnvm/top/attr_dict.py index 834fffdd01c2..58561e7d5111 100644 --- 
a/nnvm/python/nnvm/top/attr_dict.py +++ b/nnvm/python/nnvm/top/attr_dict.py @@ -129,14 +129,13 @@ def get_bool(self, key): lowercase = self[key].lower() if lowercase == "1": return True - elif lowercase == "0": + if lowercase == "0": return False - elif lowercase == "true": + if lowercase == "true": return True - elif lowercase == "false": + if lowercase == "false": return False - else: - raise ValueError("Wrong bool format for key %s" % key) + raise ValueError("Wrong bool format for key %s" % key) def get_str(self, key): """Get string from attr dict diff --git a/nnvm/python/nnvm/top/tensor.py b/nnvm/python/nnvm/top/tensor.py index e0214d6ddf16..5dae01695e3a 100644 --- a/nnvm/python/nnvm/top/tensor.py +++ b/nnvm/python/nnvm/top/tensor.py @@ -140,6 +140,18 @@ def _compute(attrs, x, _): reg.register_pattern("__rshift_scalar__", OpPattern.ELEMWISE) reg.register_schedule("__rshift_scalar__", _fschedule_broadcast) +# logical_and +reg.register_pattern("logical_and", OpPattern.ELEMWISE) +reg.register_schedule("logical_and", _fschedule_broadcast) + +# logical_or +reg.register_pattern("logical_or", OpPattern.ELEMWISE) +reg.register_schedule("logical_or", _fschedule_broadcast) + +# logical_not +reg.register_pattern("logical_not", OpPattern.ELEMWISE) +reg.register_schedule("logical_not", _fschedule_broadcast) + # elemwise_add reg.register_pattern("elemwise_add", OpPattern.BROADCAST) reg.register_schedule("elemwise_add", _fschedule_broadcast) diff --git a/nnvm/python/nnvm/top/vision.py b/nnvm/python/nnvm/top/vision.py index 42cb32214abf..948f905f1e2b 100644 --- a/nnvm/python/nnvm/top/vision.py +++ b/nnvm/python/nnvm/top/vision.py @@ -21,27 +21,6 @@ def schedule_reorg(attrs, outs, target): reg.register_pattern("yolo_reorg", OpPattern.INJECTIVE) -@reg.register_compute("yolo_region") -def compute_region(attrs, inputs, _): - """Compute definition of region""" - n = attrs.get_int("n") - classes = attrs.get_int("classes") - coords = attrs.get_int("coords") - background = attrs.get_int("background") - softmax = attrs.get_int("softmax") - with tvm.target.create(attrs.get_str("target")): - return topi.vision.yolo.region(inputs[0], n, classes, coords, - background, softmax) - - -@reg.register_schedule("yolo_region") -def schedule_region(attrs, outs, target): - """Schedule definition of region""" - with tvm.target.create(target): - return topi.generic.vision.schedule_region(outs) - -reg.register_pattern("yolo_region", OpPattern.OPAQUE) - # multibox_prior @reg.register_schedule("multibox_prior") def schedule_multibox_prior(_, outs, target): @@ -86,21 +65,26 @@ def compute_multibox_transform_loc(attrs, inputs, _): reg.register_pattern("multibox_detection", OpPattern.OPAQUE) # non-maximum suppression -@reg.register_schedule("nms") +@reg.register_schedule("non_max_suppression") def schedule_nms(_, outs, target): - """Schedule definition of nms""" + """Schedule definition of non_max_suppression""" with tvm.target.create(target): return topi.generic.schedule_nms(outs) -@reg.register_compute("nms") +@reg.register_compute("non_max_suppression") def compute_nms(attrs, inputs, _): - """Compute definition of nms""" - nms_threshold = attrs.get_float('nms_threshold') + """Compute definition of non_max_suppression""" + return_indices = attrs.get_bool('return_indices') + max_output_size = attrs.get_int('max_output_size') + iou_threshold = attrs.get_float('iou_threshold') force_suppress = attrs.get_bool('force_suppress') - nms_topk = attrs.get_int('nms_topk') + top_k = attrs.get_int('top_k') + id_index = 
attrs.get_int('id_index')
+    invalid_to_bottom = attrs.get_bool('invalid_to_bottom')
     with tvm.target.create(attrs.get_str("target")):
-        return topi.vision.nms(inputs[0], inputs[1], nms_threshold,
-                               force_suppress, nms_topk)
+        return topi.vision.non_max_suppression(inputs[0], inputs[1], max_output_size,
+                                               iou_threshold, force_suppress, top_k,
+                                               id_index, return_indices, invalid_to_bottom)

-reg.register_pattern("nms", OpPattern.OPAQUE)
+reg.register_pattern("non_max_suppression", OpPattern.OPAQUE)
diff --git a/nnvm/src/compiler/compile_engine.cc b/nnvm/src/compiler/compile_engine.cc
index fbeceb17668c..2fd9c44fda66 100644
--- a/nnvm/src/compiler/compile_engine.cc
+++ b/nnvm/src/compiler/compile_engine.cc
@@ -40,6 +40,7 @@ int GetTypeFlag(tvm::Type type) {
   if (type == tvm::UInt(16)) return 8;
   if (type == tvm::UInt(32)) return 9;
   if (type == tvm::UInt(64)) return 10;
+  if (type == tvm::UInt(1)) return 11;
   LOG(FATAL) << "cannot convert " << type;
   return 0;
 }
@@ -68,6 +69,8 @@ Type GetTVMType(int type_flag) {
       return tvm::UInt(32);
     case 10:
       return tvm::UInt(64);
+    case 11:
+      return tvm::UInt(1);
     default:
       LOG(FATAL) << "unknown type_flag=" << type_flag;
       return Float(32);
diff --git a/nnvm/src/compiler/packed_func_ext.cc b/nnvm/src/compiler/packed_func_ext.cc
index 1a19feabfe8a..8530a5556b64 100644
--- a/nnvm/src/compiler/packed_func_ext.cc
+++ b/nnvm/src/compiler/packed_func_ext.cc
@@ -76,8 +76,8 @@ TVM_REGISTER_GLOBAL("nnvm.compiler._register_alter_op_layout")
     if (ret.type_code() == TVMTypeCode::kNull) {
       return false;
     }
-    CHECK_EQ(ret.type_code(), tvm::runtime::extension_class_info<nnvm::Symbol>::code)
-      << " expected " << "Symbol (code = " << tvm::runtime::extension_class_info<nnvm::Symbol>::code
+    CHECK_EQ(ret.type_code(), tvm::runtime::extension_type_info<nnvm::Symbol>::code)
+      << " expected " << "Symbol (code = " << tvm::runtime::extension_type_info<nnvm::Symbol>::code
      << ") but get code = " << ret.type_code();
    *ret_symbol = *(static_cast<nnvm::Symbol*>(ret.value().v_handle));
    return true;
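The new `tvm::UInt(1)` flag (type code 11) is what lets boolean tensors produced by the logical ops survive the NNVM compile engine. A minimal sketch of the round trip at the symbol level; this assumes the Python-side dtype string for `UInt(1)` is `"bool"`, and the shapes and variable names are illustrative:

```python
import nnvm.symbol as sym
import nnvm.compiler

# Boolean inputs flowing through the newly registered elementwise ops.
a = sym.Variable("a")
b = sym.Variable("b")
out = sym.logical_not(sym.logical_and(a, b))

shape = {"a": (4,), "b": (4,)}
dtype = {"a": "bool", "b": "bool"}  # assumed to lower to type flag 11 (tvm::UInt(1))
graph, lib, params = nnvm.compiler.build(out, target="llvm",
                                         shape=shape, dtype=dtype)
```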
diff --git a/nnvm/src/pass/infer_shape_type.cc b/nnvm/src/pass/infer_shape_type.cc
index 0f322f12e9c4..d7ab212f3e9a 100644
--- a/nnvm/src/pass/infer_shape_type.cc
+++ b/nnvm/src/pass/infer_shape_type.cc
@@ -199,7 +199,7 @@ Graph InferAttr(Graph &&ret,
   ret.attrs[attr_name] = std::make_shared<any>(std::move(rshape));
   // number of nodes that know the shape.
   ret.attrs[unknown_name] = std::make_shared<any>(num_unknown);
-  return ret;
+  return std::move(ret);
 }

 NNVM_REGISTER_PASS(InferShape)
diff --git a/nnvm/src/top/nn/nn.cc b/nnvm/src/top/nn/nn.cc
index e9a556281ff0..f213fa3a19ec 100644
--- a/nnvm/src/top/nn/nn.cc
+++ b/nnvm/src/top/nn/nn.cc
@@ -726,42 +726,8 @@ the input array by output[n, c, h, w, C] = data[n, C*16+c, h, w]
     const Array<Tensor>& inputs,
     const Array<Tensor>& outputs) {
   const LayoutTransformParam& param = nnvm::get<LayoutTransformParam>(attrs.parsed);
-
-  Layout src_layout(param.src_layout);
-  Layout dst_layout(param.dst_layout);
-
-  if (src_layout == dst_layout) {
-    return Array<Tensor>{ inputs[0] };
-  } else if (!src_layout.defined() || !dst_layout.defined()) {
-    LOG(FATAL) << "cannot convert from/to undefined layout";
-  }
-
-  CHECK(src_layout.convertible(dst_layout)) << "cannot convert from " << param.src_layout
-                                            << " to " << param.dst_layout;
-
-  return Array<Tensor> {
-    topi::layout_transform(inputs[0], outputs[0]->shape, [&](const Array<Var>& dst_indices) {
-      std::vector<Expr> dst_to_src_indices;
-      for (Layout::LayoutDim src_axis : src_layout) {
-        int dst_major_pos = dst_layout.indexof(Layout::to_superdim(src_axis));
-        int dst_minor_pos = dst_layout.indexof(Layout::to_subdim(src_axis));
-        int32_t src_factor = static_cast<int32_t>(src_layout.subsizeof(src_axis));
-        int32_t dst_factor = static_cast<int32_t>(dst_layout.subsizeof(src_axis));
-
-        Expr src_index(dst_indices[dst_major_pos]);
-        if (dst_minor_pos >= 0) {
-          CHECK_GT(dst_factor, 0);
-          src_index = src_index * dst_factor + dst_indices[dst_minor_pos];
-        }
-        if (Layout::is_superdim(src_axis) && src_factor > 0) {
-          src_index = src_index / src_factor;
-        } else if (Layout::is_subdim(src_axis) && src_factor > 0) {
-          src_index = src_index % src_factor;
-        }
-        dst_to_src_indices.push_back(src_index);
-      }
-      return Array<Expr>(dst_to_src_indices);
-    })
+  return Array<Tensor>{
+    topi::layout_transform(inputs[0], param.src_layout, param.dst_layout)
   };
 })
.set_support_level(1);
diff --git a/nnvm/src/top/tensor/elemwise.cc b/nnvm/src/top/tensor/elemwise.cc
index 3ee52008eb1c..2d9813e22131 100644
--- a/nnvm/src/top/tensor/elemwise.cc
+++ b/nnvm/src/top/tensor/elemwise.cc
@@ -361,6 +361,31 @@ NNVM_REGISTER_ELEMWISE_BINARY_OP(elemwise_pow)
     return Array<Tensor>{ topi::power(inputs[0], inputs[1]) };
 });

+// logical
+NNVM_REGISTER_ELEMWISE_BINARY_OP(logical_and)
+.describe(R"code(Elementwise compute the logical AND
+
+)code")
+.set_support_level(4)
+.set_attr<FTVMCompute>(
+  "FTVMCompute", [](const NodeAttrs& attrs,
+                    const Array<Tensor>& inputs,
+                    const Array<Tensor>& out_info) {
+    return Array<Tensor>{ topi::logical_and(inputs[0], inputs[1]) };
+});
+
+NNVM_REGISTER_ELEMWISE_BINARY_OP(logical_or)
+.describe(R"code(Elementwise compute the logical OR
+
+)code")
+.set_support_level(4)
+.set_attr<FTVMCompute>(
+  "FTVMCompute", [](const NodeAttrs& attrs,
+                    const Array<Tensor>& inputs,
+                    const Array<Tensor>& out_info) {
+    return Array<Tensor>{ topi::logical_or(inputs[0], inputs[1]) };
+});
+
 // negative
 NNVM_REGISTER_ELEMWISE_UNARY_OP(negative)
 .describe(R"code(Elementwise numeric negative
@@ -383,6 +408,19 @@ NNVM_REGISTER_ELEMWISE_UNARY_OP(negative)
   };
 });

+// logical NOT
+NNVM_REGISTER_ELEMWISE_UNARY_OP(logical_not)
+.describe(R"code(Elementwise compute the logical NOT
+
+)code" NNVM_ADD_FILELINE)
+.set_support_level(4)
+.set_attr<FTVMCompute>(
+  "FTVMCompute", [](const NodeAttrs& attrs,
+                    const Array<Tensor>& inputs,
+                    const Array<Tensor>& out_info) {
+    return Array<Tensor>{ topi::logical_not(inputs[0]) };
+});
+
 // copy
 NNVM_REGISTER_ELEMWISE_UNARY_OP(copy)
 .describe(R"code(Copy tensor to another one.
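These kernels are the backend for the TensorFlow converter's `_logical` helper shown earlier, which maps `LogicalAnd`/`LogicalOr`/`LogicalNot` straight onto the NNVM ops registered above. A small sketch of that conversion path, assuming a TF 1.x-style graph; placeholder names are illustrative:

```python
import tensorflow as tf
import nnvm.frontend

with tf.Graph().as_default():
    in1 = tf.placeholder(tf.bool, shape=[1, 4, 4, 3], name="in1")
    in2 = tf.placeholder(tf.bool, shape=[1, 4, 4, 3], name="in2")
    out = tf.logical_or(in1, in2, name="out")
    graph_def = tf.get_default_graph().as_graph_def()

# 'LogicalOr' is rewritten to the NNVM 'logical_or' op by AttrCvt.
net, params = nnvm.frontend.from_tensorflow(graph_def)
```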
diff --git a/nnvm/src/top/vision/nms.cc b/nnvm/src/top/vision/nms.cc
index 2680b894255b..e69a7cb2f036 100644
--- a/nnvm/src/top/vision/nms.cc
+++ b/nnvm/src/top/vision/nms.cc
@@ -19,11 +19,13 @@ using compiler::FTVMCompute;
 using tvm::Tensor;
 using tvm::Array;

-DMLC_REGISTER_PARAMETER(NMSParam);
+DMLC_REGISTER_PARAMETER(NonMaximumSuppressionParam);

 bool NMSShape(const NodeAttrs& attrs,
               std::vector<TShape> *in_attrs,
               std::vector<TShape> *out_attrs) {
+  const NonMaximumSuppressionParam& param =
+    nnvm::get<NonMaximumSuppressionParam>(attrs.parsed);
   CHECK_EQ(in_attrs->size(), 2U) << "Inputs: [data, valid_count]";
   TShape dshape = in_attrs->at(0);
   TShape vshape = in_attrs->at(1);
@@ -33,7 +35,14 @@ bool NMSShape(const NodeAttrs& attrs,
     "(batch_size, num_anchors, 6).";
   CHECK_EQ(dshape[0], vshape[0]) << "batch_size mismatch.";
   out_attrs->clear();
-  NNVM_ASSIGN_OUTPUT_SHAPE(attrs, *out_attrs, 0, dshape);
+  if (param.return_indices) {
+    TShape oshape = TShape(2);
+    oshape[0] = dshape[0];
+    oshape[1] = dshape[1];
+    NNVM_ASSIGN_OUTPUT_SHAPE(attrs, *out_attrs, 0, oshape);
+  } else {
+    NNVM_ASSIGN_OUTPUT_SHAPE(attrs, *out_attrs, 0, dshape);
+  }
   return true;
 }

@@ -56,15 +65,15 @@ inline bool NMSInferLayout(const NodeAttrs& attrs,
   return true;
 }

-NNVM_REGISTER_OP(nms)
+NNVM_REGISTER_OP(non_max_suppression)
 .describe(R"doc("Non-maximum suppression."
 )doc" NNVM_ADD_FILELINE)
.set_num_inputs(2)
.set_num_outputs(1)
-.set_attr_parser(ParamParser<NMSParam>)
+.set_attr_parser(ParamParser<NonMaximumSuppressionParam>)
.set_attr<FGetAttrDict>("FGetAttrDict",
-                        ParamGetAttrDict<NMSParam>)
+                        ParamGetAttrDict<NonMaximumSuppressionParam>)
-.add_arguments(NMSParam::__FIELDS__())
+.add_arguments(NonMaximumSuppressionParam::__FIELDS__())
.add_argument("data", "Tensor", "Input data.")
.add_argument("valid_count", "Tensor", "Number of valid anchor boxes.")
.set_attr<FListInputNames>("FListInputNames", [](const NodeAttrs& attrs) {
diff --git a/nnvm/src/top/vision/yolo/region.cc b/nnvm/src/top/vision/yolo/region.cc
deleted file mode 100644
index 182c9b2ab3bc..000000000000
--- a/nnvm/src/top/vision/yolo/region.cc
+++ /dev/null
@@ -1,35 +0,0 @@
-/*!
- * Copyright (c) 2018 by Contributors
- * \file region.cc
- * \brief Property def of pooling operators.
- */
-#include
-#include
-#include
-#include
-#include "../../op_common.h"
-#include "region.h"
-
-namespace nnvm {
-namespace top {
-
-NNVM_REGISTER_OP(yolo_region)
-.describe(R"code(Region layer
-)code" NNVM_ADD_FILELINE)
-.set_num_inputs(1)
-.set_num_outputs(1)
-.set_support_level(5)
-.add_argument("data", "Tensor", "Input data")
-.set_attr<FInferType>("FInferType", RegionType<1, 1>)
-.set_attr<FInferShape>("FInferShape", RegionShape<1, 1>)
-.set_attr<FInplaceOption>(
-  "FInplaceOption",
-  [](const NodeAttrs &attrs) {
-    return std::vector<std::pair<int, int> >{{0, 0}, {1, 0}};
-  })
-.set_attr<FGradient>("FGradient", [](const NodePtr &n,
-                                     const std::vector<NodeEntry> &ograds) {
-  return std::vector<NodeEntry>{ograds[0], ograds[0]};
-});
-}  // namespace top
-}  // namespace nnvm
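For users, the visible effect of the `nms` rename above is at the symbol level: `sym.nms` becomes `sym.non_max_suppression`, `nms_threshold` becomes `iou_threshold`, `nms_topk` becomes `top_k`, and `return_indices` is now explicit. A before/after sketch whose argument values mirror the updated test below:

```python
import nnvm.symbol as sym

data = sym.Variable("data")
valid_count = sym.Variable("valid_count", dtype="int32")

# Before: sym.nms(data=data, valid_count=valid_count,
#                 nms_threshold=0.7, force_suppress=True, nms_topk=2)
out = sym.non_max_suppression(data=data, valid_count=valid_count,
                              return_indices=False, iou_threshold=0.7,
                              force_suppress=True, top_k=2)
```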
diff --git a/nnvm/src/top/vision/yolo/region.h b/nnvm/src/top/vision/yolo/region.h
deleted file mode 100644
index f9dc87c59c6c..000000000000
--- a/nnvm/src/top/vision/yolo/region.h
+++ /dev/null
@@ -1,101 +0,0 @@
-/*!
- * Copyright (c) 2018 by Contributors
- * \file region.h
- */
-#ifndef NNVM_TOP_VISION_YOLO_REGION_H_
-#define NNVM_TOP_VISION_YOLO_REGION_H_
-
-#include
-#include
-#include
-#include
-#include
-
-namespace nnvm {
-namespace top {
-
-template
-inline bool RegionAttr(const nnvm::NodeAttrs &attrs,
-                       std::vector<AttrType> *in_attrs,
-                       std::vector<AttrType> *out_attrs,
-                       const AttrType &none) {
-  AttrType dattr = none;
-  size_t in_size = in_attrs->size();
-  size_t out_size = out_attrs->size();
-  if (n_in != -1) {
-    in_size = static_cast<size_t>(n_in);
-  }
-  if (n_out != -1) {
-    out_size = static_cast<size_t>(n_out);
-  }
-
-  auto deduce = [&](std::vector<AttrType> *vec, size_t size, const char *name) {
-    for (size_t i = 0; i < size; ++i) {
-      if (i == 0)
-        CHECK(assign(&dattr, (*vec)[i]))
-            << "Incompatible attr in node " << attrs.name << " at " << i
-            << "-th " << name << ": "
-            << "expected " << attr_string(dattr) << ", got "
-            << attr_string((*vec)[i]);
-    }
-  };
-  deduce(in_attrs, in_size, "input");
-
-  auto write = [&](std::vector<AttrType> *vec, size_t size, const char *name) {
-    for (size_t i = 0; i < size; ++i) {
-      CHECK(assign(&(*vec)[i], dattr))
-          << "Incompatible attr in node " << attrs.name << " at " << i << "-th "
-          << name << ": "
-          << "expected " << attr_string(dattr) << ", got "
-          << attr_string((*vec)[i]);
-    }
-  };
-  write(out_attrs, out_size, "output");
-
-  if (is_none(dattr)) {
-    return false;
-  }
-  return true;
-}
-
-template<int n_in, int n_out>
-inline bool RegionShape(const NodeAttrs &attrs,
-                        std::vector<TShape> *in_attrs,
-                        std::vector<TShape> *out_attrs) {
-  if (n_in != -1) {
-    CHECK_EQ(in_attrs->size(), static_cast<size_t>(n_in))
-        << " in operator " << attrs.name;
-  }
-  if (n_out != -1) {
-    CHECK_EQ(out_attrs->size(), static_cast<size_t>(n_out))
-        << " in operator " << attrs.name;
-  }
-  return RegionAttr(
-      attrs, in_attrs, out_attrs, TShape());
-}
-
-template<int n_in, int n_out>
-inline bool RegionType(const NodeAttrs &attrs,
-                       std::vector<int> *in_attrs,
-                       std::vector<int> *out_attrs) {
-  if (n_in != -1) {
-    CHECK_EQ(in_attrs->size(), static_cast<size_t>(n_in))
-        << " in operator " << attrs.name;
-  }
-  if (n_out != -1) {
-    CHECK_EQ(out_attrs->size(), static_cast<size_t>(n_out))
-        << " in operator " << attrs.name;
-  }
-  return RegionAttr(
-      attrs, in_attrs, out_attrs, -1);
-}
-}  // namespace top
-}  // namespace nnvm
-#endif  // NNVM_TOP_VISION_YOLO_REGION_H_
diff --git a/nnvm/tests/python/compiler/test_top_level4.py b/nnvm/tests/python/compiler/test_top_level4.py
index fc4e62fb7156..6a42047151e5 100644
--- a/nnvm/tests/python/compiler/test_top_level4.py
+++ b/nnvm/tests/python/compiler/test_top_level4.py
@@ -550,7 +550,7 @@ def test_multibox_transform_loc():
     anchors = sym.Variable("anchors")
     transform_loc_data, valid_count = sym.multibox_transform_loc(cls_prob=cls_prob,
         loc_pred=loc_preds, anchor=anchors)
-    out = sym.nms(data=transform_loc_data, valid_count=valid_count)
+    out = sym.non_max_suppression(data=transform_loc_data, valid_count=valid_count, return_indices=False)

     # Manually create test case
     np_cls_prob = np.array([[[0.2, 0.5, 0.3], [0.25, 0.3, 0.45], [0.7, 0.1, 0.2]]])
@@ -573,22 +573,22 @@ def test_multibox_transform_loc():
     out = m.get_output(0, tvm.nd.empty(expected_np_out.shape, dtype))
     tvm.testing.assert_allclose(out.asnumpy(), expected_np_out, atol=1e-5, rtol=1e-5)

-def test_nms():
+def test_non_max_suppression():
     dshape = (1, 5, 6)
     data = sym.Variable("data")
     valid_count = sym.Variable("valid_count", dtype="int32")
-    nms_threshold = 0.7
+    iou_threshold = 0.7
     force_suppress = True
-    nms_topk = 2
-    out = sym.nms(data=data, valid_count=valid_count, nms_threshold=nms_threshold,
-
force_suppress=force_suppress, nms_topk=nms_topk) + top_k = 2 + out = sym.non_max_suppression(data=data, valid_count=valid_count, return_indices=False, + iou_threshold=iou_threshold, force_suppress=force_suppress, top_k=top_k) np_data = np.array([[[0, 0.8, 1, 20, 25, 45], [1, 0.7, 30, 60, 50, 80], [0, 0.4, 4, 21, 19, 40], [2, 0.9, 35, 61, 52, 79], [1, 0.5, 100, 60, 70, 110]]]).astype("float32") np_valid_count = np.array([4]).astype("int32") np_result = np.array([[[2, 0.9, 35, 61, 52, 79], [0, 0.8, 1, 20, 25, 45], - [0, 0.4, 4, 21, 19, 40], [-1, 0.9, 35, 61, 52, 79], + [-1, -1, -1, -1, -1, -1], [-1, -1, -1, -1, -1, -1], [-1, -1, -1, -1, -1, -1]]]) target = "llvm" @@ -726,7 +726,7 @@ def test_argmax(): test_flip() test_multibox_prior() test_multibox_transform_loc() - test_nms() + test_non_max_suppression() test_slice_like() test_where() test_argmax() diff --git a/nnvm/tests/python/frontend/mxnet/test_forward.py b/nnvm/tests/python/frontend/mxnet/test_forward.py index 66ae9d6e9de4..581ae75a4bbc 100644 --- a/nnvm/tests/python/frontend/mxnet/test_forward.py +++ b/nnvm/tests/python/frontend/mxnet/test_forward.py @@ -158,7 +158,7 @@ def test_forward_ones(): ones = mx.sym.ones(shape=(2, 3, 4), dtype='float32') mx_sym = mx.sym.elemwise_add(data, ones) verify_mxnet_frontend_impl(mx_sym, (2, 3, 4), (2, 3, 4)) - + def test_forward_zeros(): data = mx.sym.var('data') zeros = mx.sym.zeros(shape=(2, 3, 4), dtype='float32') @@ -184,7 +184,112 @@ def test_forward_argmin(): data = mx.sym.var('data') mx_sym = mx.sym.argmin(data, axis=0) verify_mxnet_frontend_impl(mx_sym, (5, 4), (4,)) - + +def test_forward_where(): + cond = mx.sym.var('cond') + x = mx.sym.var('x') + y = mx.sym.var('y') + dshape = (2, 2) + dtype = 'float32' + mx_sym = mx.sym.where(cond, x, y) + np_cond = np.array([[0, 1], [-1, 0]]).astype(dtype) + np_x = np.random.uniform(size=dshape).astype(dtype) + np_y = np.random.uniform(size=dshape).astype(dtype) + mx_cond = mx.nd.array(np_cond) + mx_x = mx.nd.array(np_x) + mx_y = mx.nd.array(np_y) + mod = mx.mod.Module(mx_sym, label_names=None, data_names=['cond', 'x', 'y']) + mod.bind(data_shapes=[('cond', dshape), ('x', dshape), ('y', dshape)], for_training=False) + mod.init_params() + args, auxs = mod.get_params() + mx_out = mx.nd.where(mx_cond, mx_x, mx_y).asnumpy() + out_shape = dshape + new_sym, params = frontend.from_mxnet(mx_sym, args, auxs) + shape_dict = {'cond': dshape, 'x': dshape, 'y': dshape} + for target, ctx in ctx_list(): + with nnvm.compiler.build_config(opt_level=3): + graph, lib, params = nnvm.compiler.build(new_sym, target, shape_dict, params=params) + m = graph_runtime.create(graph, lib, ctx) + # set inputs + m.set_input("cond", tvm.nd.array(np_cond)) + m.set_input("x", tvm.nd.array(np_x)) + m.set_input("y", tvm.nd.array(np_y)) + m.set_input(**params) + m.run() + # get outputs + tvm_out = m.get_output(0, tvm.nd.empty(out_shape, dtype)).asnumpy() + tvm.testing.assert_allclose(mx_out, tvm_out, rtol=1e-5, atol=1e-5) + +def test_forward_slice(): + data = mx.sym.var('data') + mx_sym = mx.sym.slice(data, begin=(0, 1), end=(2, 4)) + verify_mxnet_frontend_impl(mx_sym, (3, 4), (2, 3)) + mx_sym = mx.sym.slice(data, begin=(-1, 1), end=(-3, 4), step=(-1, 2)) + verify_mxnet_frontend_impl(mx_sym, (3, 4), (2, 2)) + +def test_forward_maximum(): + a = mx.sym.var('a') + b = mx.sym.var('b') + dshape = (10, 20) + dtype = 'float32' + mx_sym = mx.sym._internal._maximum(a, b) + np_a = np.random.uniform(size=dshape).astype(dtype) + np_b = np.random.uniform(size=dshape).astype(dtype) + mx_a = 
mx.nd.array(np_a) + mx_b = mx.nd.array(np_b) + mod = mx.mod.Module(mx_sym, label_names=None, data_names=['a', 'b']) + mod.bind(data_shapes=[('a', dshape), ('b', dshape)], for_training=False) + mod.init_params() + args, auxs = mod.get_params() + mx_out = mx.nd._internal._maximum(mx_a, mx_b).asnumpy() + out_shape = dshape + new_sym, params = frontend.from_mxnet(mx_sym, args, auxs) + shape_dict = {'a': dshape, 'b': dshape} + for target, ctx in ctx_list(): + with nnvm.compiler.build_config(opt_level=3): + graph, lib, params = nnvm.compiler.build(new_sym, target, shape_dict, params=params) + m = graph_runtime.create(graph, lib, ctx) + # set inputs + m.set_input("a", tvm.nd.array(np_a)) + m.set_input("b", tvm.nd.array(np_b)) + m.set_input(**params) + m.run() + # get outputs + tvm_out = m.get_output(0, tvm.nd.empty(out_shape, dtype)).asnumpy() + tvm.testing.assert_allclose(mx_out, tvm_out, rtol=1e-5, atol=1e-5) + +def test_forward_minimum(): + a = mx.sym.var('a') + b = mx.sym.var('b') + dshape = (10, 20) + dtype = 'float32' + mx_sym = mx.sym._internal._minimum(a, b) + np_a = np.random.uniform(size=dshape).astype(dtype) + np_b = np.random.uniform(size=dshape).astype(dtype) + mx_a = mx.nd.array(np_a) + mx_b = mx.nd.array(np_b) + mod = mx.mod.Module(mx_sym, label_names=None, data_names=['a', 'b']) + mod.bind(data_shapes=[('a', dshape), ('b', dshape)], for_training=False) + mod.init_params() + args, auxs = mod.get_params() + mx_out = mx.nd._internal._minimum(mx_a, mx_b).asnumpy() + out_shape = dshape + new_sym, params = frontend.from_mxnet(mx_sym, args, auxs) + shape_dict = {'a': dshape, 'b': dshape} + for target, ctx in ctx_list(): + with nnvm.compiler.build_config(opt_level=3): + graph, lib, params = nnvm.compiler.build(new_sym, target, shape_dict, params=params) + m = graph_runtime.create(graph, lib, ctx) + # set inputs + m.set_input("a", tvm.nd.array(np_a)) + m.set_input("b", tvm.nd.array(np_b)) + m.set_input(**params) + m.run() + # get outputs + tvm_out = m.get_output(0, tvm.nd.empty(out_shape, dtype)).asnumpy() + tvm.testing.assert_allclose(mx_out, tvm_out, rtol=1e-5, atol=1e-5) + + if __name__ == '__main__': test_forward_mlp() test_forward_vgg() @@ -206,4 +311,7 @@ def test_forward_argmin(): test_forward_zeros_like() test_forward_argmax() test_forward_argmin() - + test_forward_where() + test_forward_slice() + test_forward_maximum() + test_forward_minimum() diff --git a/nnvm/tests/python/frontend/tensorflow/test_forward.py b/nnvm/tests/python/frontend/tensorflow/test_forward.py index f4ec61979527..b71442d2b9a4 100644 --- a/nnvm/tests/python/frontend/tensorflow/test_forward.py +++ b/nnvm/tests/python/frontend/tensorflow/test_forward.py @@ -777,6 +777,48 @@ def test_forward_pad(): _test_pad((2, 3), [[1,1], [2,2]], mode="CONSTANT") _test_pad((2, 3), [[1,1], [2,2]], mode="CONSTANT", constant_values=1.0) +####################################################################### +# Logical operators +# -------------------- +def test_logical_and(): + with tf.Graph().as_default(): + in1 = tf.placeholder(tf.bool, shape=[1, 4, 4, 3], name='in1') + in2 = tf.placeholder(tf.bool, shape=[1, 4, 4, 3], name='in2') + out = tf.logical_and(in1, in2, name='out') + in_data1 = np.random.choice(a=[False, True],size=(1, 4, 4, 3)).astype('bool') + in_data2 = np.random.choice(a=[False, True],size=(1, 4, 4, 3)).astype('bool') + compare_tf_with_tvm([in_data1, in_data2], ['in1:0', 'in2:0'], 'out:0') + +def test_logical_or(): + with tf.Graph().as_default(): + in1 = tf.placeholder(tf.bool, shape=[1, 4, 4, 3], name='in1') + in2 
= tf.placeholder(tf.bool, shape=[1, 4, 4, 3], name='in2') + out = tf.logical_or(in1, in2, name='out') + in_data1 = np.random.choice(a=[False, True],size=(1, 4, 4, 3)).astype('bool') + in_data2 = np.random.choice(a=[False, True],size=(1, 4, 4, 3)).astype('bool') + compare_tf_with_tvm([in_data1, in_data2], ['in1:0', 'in2:0'], 'out:0') + +def test_logical_xor(): + with tf.Graph().as_default(): + in1 = tf.placeholder(tf.bool, shape=[1, 4, 4, 3], name='in1') + in2 = tf.placeholder(tf.bool, shape=[1, 4, 4, 3], name='in2') + out = tf.logical_xor(in1, in2, name='out') + in_data1 = np.random.choice(a=[False, True],size=(1, 4, 4, 3)).astype('bool') + in_data2 = np.random.choice(a=[False, True],size=(1, 4, 4, 3)).astype('bool') + compare_tf_with_tvm([in_data1, in_data2], ['in1:0', 'in2:0'], 'out:0') + +def test_logical_not(): + with tf.Graph().as_default(): + in1 = tf.placeholder(tf.bool, shape=[1, 4, 4, 3], name='in1') + out = tf.logical_not(in1, name='out') + in_data1 = np.random.choice(a=[False, True],size=(1, 4, 4, 3)).astype('bool') + compare_tf_with_tvm(in_data1, 'in1:0', 'out:0') + +def test_forward_logical(): + test_logical_and() + test_logical_or() + test_logical_xor() + test_logical_not() ####################################################################### # Inception V3 @@ -1205,3 +1247,4 @@ def test_forward_rel_ops(): # Relational ops test_forward_rel_ops() + test_forward_logical() diff --git a/python/tvm/_ffi/_ctypes/function.py b/python/tvm/_ffi/_ctypes/function.py index 3c2a7a5f8c9b..5c176f819105 100644 --- a/python/tvm/_ffi/_ctypes/function.py +++ b/python/tvm/_ffi/_ctypes/function.py @@ -223,13 +223,13 @@ def _handle_return_func(x): _node.__init_by_constructor__ = __init_handle_by_constructor__ RETURN_SWITCH[TypeCode.FUNC_HANDLE] = _handle_return_func RETURN_SWITCH[TypeCode.MODULE_HANDLE] = _return_module -RETURN_SWITCH[TypeCode.NDARRAY_CONTAINER] = lambda x: _make_array(x.v_handle, False) +RETURN_SWITCH[TypeCode.NDARRAY_CONTAINER] = lambda x: _make_array(x.v_handle, False, True) C_TO_PY_ARG_SWITCH[TypeCode.FUNC_HANDLE] = _wrap_arg_func( _handle_return_func, TypeCode.FUNC_HANDLE) C_TO_PY_ARG_SWITCH[TypeCode.MODULE_HANDLE] = _wrap_arg_func( _return_module, TypeCode.MODULE_HANDLE) -C_TO_PY_ARG_SWITCH[TypeCode.ARRAY_HANDLE] = lambda x: _make_array(x.v_handle, True) -C_TO_PY_ARG_SWITCH[TypeCode.NDARRAY_CONTAINER] = lambda x: _make_array(x.v_handle, False) +C_TO_PY_ARG_SWITCH[TypeCode.ARRAY_HANDLE] = lambda x: _make_array(x.v_handle, True, False) +C_TO_PY_ARG_SWITCH[TypeCode.NDARRAY_CONTAINER] = lambda x: _make_array(x.v_handle, False, True) _CLASS_MODULE = None _CLASS_FUNCTION = None diff --git a/python/tvm/_ffi/_ctypes/ndarray.py b/python/tvm/_ffi/_ctypes/ndarray.py index 8b88e7dc98ea..da24b9cd41eb 100644 --- a/python/tvm/_ffi/_ctypes/ndarray.py +++ b/python/tvm/_ffi/_ctypes/ndarray.py @@ -4,7 +4,7 @@ import ctypes from ..base import _LIB, check_call, c_str -from ..runtime_ctypes import TVMArrayHandle +from ..runtime_ctypes import TVMArrayHandle, TVMNDArrayContainerHandle from .types import RETURN_SWITCH, C_TO_PY_ARG_SWITCH, _wrap_arg_func, _return_handle @@ -24,11 +24,13 @@ def _from_dlpack(dltensor): dltensor = ctypes.py_object(dltensor) if ctypes.pythonapi.PyCapsule_IsValid(dltensor, _c_str_dltensor): ptr = ctypes.pythonapi.PyCapsule_GetPointer(dltensor, _c_str_dltensor) + # enforce type to make sure it works for all ctypes + ptr = ctypes.cast(ptr, ctypes.c_void_p) handle = TVMArrayHandle() check_call(_LIB.TVMArrayFromDLPack(ptr, ctypes.byref(handle))) 
ctypes.pythonapi.PyCapsule_SetName(dltensor, _c_str_used_dltensor)
         ctypes.pythonapi.PyCapsule_SetDestructor(dltensor, TVMPyCapsuleDestructor(0))
-        return _make_array(handle, False)
+        return _make_array(handle, False, False)
     raise ValueError("Expect a dltensor field, PyCapsule can only be consumed once")
@@ -36,6 +38,8 @@ def _dlpack_deleter(pycapsule):
     pycapsule = ctypes.cast(pycapsule, ctypes.py_object)
     if ctypes.pythonapi.PyCapsule_IsValid(pycapsule, _c_str_dltensor):
         ptr = ctypes.pythonapi.PyCapsule_GetPointer(pycapsule, _c_str_dltensor)
+        # enforce type to make sure it works for all ctypes
+        ptr = ctypes.cast(ptr, ctypes.c_void_p)
         _LIB.TVMDLManagedTensorCallDeleter(ptr)
         ctypes.pythonapi.PyCapsule_SetDestructor(dltensor, TVMPyCapsuleDestructor(0))

@@ -77,9 +81,15 @@ def to_dlpack(self):
         return ctypes.pythonapi.PyCapsule_New(handle, _c_str_dltensor, _c_dlpack_deleter)


-def _make_array(handle, is_view):
+def _make_array(handle, is_view, is_container):
+    global _TVM_ND_CLS
     handle = ctypes.cast(handle, TVMArrayHandle)
-    return _CLASS_NDARRAY(handle, is_view)
+    fcreate = _CLASS_NDARRAY
+    if is_container and _TVM_ND_CLS:
+        array_type_info = ctypes.cast(handle, TVMNDArrayContainerHandle).array_type_info.value
+        if array_type_info > 0:
+            fcreate = _TVM_ND_CLS[array_type_info]
+    return fcreate(handle, is_view)

 _TVM_COMPATS = ()
@@ -91,6 +101,11 @@ def _reg_extension(cls, fcreate):
         RETURN_SWITCH[cls._tvm_tcode] = fret
         C_TO_PY_ARG_SWITCH[cls._tvm_tcode] = _wrap_arg_func(fret, cls._tvm_tcode)

+_TVM_ND_CLS = {}
+
+def _reg_ndarray(cls, fcreate):
+    global _TVM_ND_CLS
+    _TVM_ND_CLS[cls._array_type_code] = fcreate

 _CLASS_NDARRAY = None
diff --git a/python/tvm/_ffi/_cython/base.pxi b/python/tvm/_ffi/_cython/base.pxi
index ac5532835c47..feb2fffebd23 100644
--- a/python/tvm/_ffi/_cython/base.pxi
+++ b/python/tvm/_ffi/_cython/base.pxi
@@ -2,7 +2,7 @@ from ..base import TVMError
 from libcpp.vector cimport vector
 from cpython.version cimport PY_MAJOR_VERSION
 from cpython cimport pycapsule
-from libc.stdint cimport int64_t, uint64_t, uint8_t, uint16_t
+from libc.stdint cimport int32_t, int64_t, uint64_t, uint8_t, uint16_t
 import ctypes

 cdef enum TVMTypeCode:
@@ -61,6 +61,14 @@ ctypedef void* TVMRetValueHandle
 ctypedef void* TVMFunctionHandle
 ctypedef void* NodeHandle

+ctypedef struct TVMNDArrayContainer:
+    DLTensor dl_tensor
+    void* manager_ctx
+    void (*deleter)(DLManagedTensor* self)
+    int32_t array_type_info
+
+ctypedef TVMNDArrayContainer* TVMNDArrayContainerHandle
+
 ctypedef int (*TVMPackedCFunc)(
     TVMValue* args,
     int* type_codes,
diff --git a/python/tvm/_ffi/_cython/function.pxi b/python/tvm/_ffi/_cython/function.pxi
index dcbf4c665e66..9995aea6357a 100644
--- a/python/tvm/_ffi/_cython/function.pxi
+++ b/python/tvm/_ffi/_cython/function.pxi
@@ -33,7 +33,7 @@ cdef int tvm_callback(TVMValue* args,
         if tcode != kArrayHandle:
             pyargs.append(make_ret(value, tcode))
         else:
-            pyargs.append(c_make_array(value.v_handle, True))
+            pyargs.append(c_make_array(value.v_handle, True, False))
     try:
         rv = local_pyfunc(*pyargs)
     except Exception:
@@ -175,7 +175,7 @@ cdef inline object make_ret(TVMValue value, int tcode):
     elif tcode == kFloat:
         return value.v_float64
     elif tcode == kNDArrayContainer:
-        return c_make_array(value.v_handle, False)
+        return c_make_array(value.v_handle, False, True)
     elif tcode == kStr:
         return py_str(value.v_str)
     elif tcode == kBytes:
diff --git a/python/tvm/_ffi/_cython/ndarray.pxi b/python/tvm/_ffi/_cython/ndarray.pxi
index 0a507affec1c..4cd6709a0118 100644
--- a/python/tvm/_ffi/_cython/ndarray.pxi
+++ b/python/tvm/_ffi/_cython/ndarray.pxi
@@ -20,7 +20,7 @@ def _from_dlpack(object dltensor):
         # set name and destructor to be empty
         pycapsule.PyCapsule_SetDestructor(dltensor, NULL)
         pycapsule.PyCapsule_SetName(dltensor, _c_str_used_dltensor)
-        return c_make_array(chandle, 0)
+        return c_make_array(chandle, False, False)
     raise ValueError("Expect a dltensor field, pycapsule.PyCapsule can only be consumed once")

@@ -73,8 +73,15 @@ cdef class NDArrayBase:
         return pycapsule.PyCapsule_New(dltensor, _c_str_dltensor, _c_dlpack_deleter)


-cdef c_make_array(void* chandle, is_view):
-    ret = _CLASS_NDARRAY(None, is_view)
+cdef c_make_array(void* chandle, is_view, is_container):
+    global _TVM_ND_CLS
+    cdef int32_t array_type_info
+    fcreate = _CLASS_NDARRAY
+    if is_container and len(_TVM_ND_CLS) > 0:
+        array_type_info = (<TVMNDArrayContainerHandle>chandle).array_type_info
+        if array_type_info > 0:
+            fcreate = _TVM_ND_CLS[array_type_info]
+    ret = fcreate(None, is_view)
     (<NDArrayBase>ret).chandle = chandle
     return ret

@@ -89,11 +96,16 @@ def _reg_extension(cls, fcreate):
     if fcreate:
         _TVM_EXT_RET[cls._tvm_tcode] = fcreate

+cdef _TVM_ND_CLS = {}

-def _make_array(handle, is_view):
+def _reg_ndarray(cls, fcreate):
+    global _TVM_ND_CLS
+    _TVM_ND_CLS[cls._array_type_code] = fcreate
+
+def _make_array(handle, is_view, is_container):
     cdef unsigned long long ptr
     ptr = ctypes.cast(handle, ctypes.c_void_p).value
-    return c_make_array(ptr, is_view)
+    return c_make_array(ptr, is_view, is_container)

 cdef object _CLASS_NDARRAY = None
diff --git a/python/tvm/_ffi/base.py b/python/tvm/_ffi/base.py
index 2579f22e44af..98229c092792 100644
--- a/python/tvm/_ffi/base.py
+++ b/python/tvm/_ffi/base.py
@@ -32,7 +32,6 @@

 class TVMError(Exception):
     """Error thrown by TVM function"""
-    pass


 def _load_lib():
diff --git a/python/tvm/_ffi/function.py b/python/tvm/_ffi/function.py
index ca1812d4109a..33013a4df5ef 100644
--- a/python/tvm/_ffi/function.py
+++ b/python/tvm/_ffi/function.py
@@ -51,7 +51,6 @@ class Function(_FunctionBase):
     tvm.register_func: How to register global function.
     tvm.get_global_func: How to get global function.
     """
-    pass


 class ModuleBase(object):
@@ -207,11 +206,11 @@ def get_global_func(name, allow_missing=False):
     check_call(_LIB.TVMFuncGetGlobal(c_str(name), ctypes.byref(handle)))
     if handle.value:
         return Function(handle, False)
-    else:
-        if allow_missing:
-            return None
-        else:
-            raise ValueError("Cannot find global function %s" % name)
+
+    if allow_missing:
+        return None
+
+    raise ValueError("Cannot find global function %s" % name)
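Together with `_reg_ndarray` above, the new `is_container` flag lets `_make_array` rehydrate a returned `NDArray::Container` as a user-defined subclass instead of the base NDArray. A minimal sketch of the registration contract enforced by `register_extension` below; the class name and type code are illustrative and must match the `array_type_info` trait set by the backing C++ container:

```python
import tvm


class MyNDArray(tvm.nd.NDArrayBase):
    """Illustrative NDArray subclass provided by an extension library."""
    # Must agree with the array_type_code_ stored in the C++ container.
    _array_type_code = 1


# For NDArray subclasses, fcreate is mandatory; _make_array then looks up
# _TVM_ND_CLS[array_type_info] and calls it instead of the base class.
tvm.register_extension(MyNDArray, MyNDArray)
```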
diff --git a/python/tvm/_ffi/libinfo.py b/python/tvm/_ffi/libinfo.py
index 6ad2e06939b1..9ef0f498a7a4 100644
--- a/python/tvm/_ffi/libinfo.py
+++ b/python/tvm/_ffi/libinfo.py
@@ -163,4 +163,4 @@ def find_include_path(name=None, search_path=None, optional=False):
 # We use the version of the incoming release for code
 # that is under development.
 # The following line is set by tvm/python/update_version.py
-__version__ = "0.5.dev"
+__version__ = "0.6.dev"
diff --git a/python/tvm/_ffi/ndarray.py b/python/tvm/_ffi/ndarray.py
index e49c3b62f473..3c5b170bdca7 100644
--- a/python/tvm/_ffi/ndarray.py
+++ b/python/tvm/_ffi/ndarray.py
@@ -17,15 +17,18 @@
     if _FFI_MODE == "ctypes":
         raise ImportError()
     if sys.version_info >= (3, 0):
-        from ._cy3.core import _set_class_ndarray, _reg_extension, _make_array, _from_dlpack
+        from ._cy3.core import _set_class_ndarray, _make_array, _from_dlpack
         from ._cy3.core import NDArrayBase as _NDArrayBase
+        from ._cy3.core import _reg_extension, _reg_ndarray
     else:
-        from ._cy2.core import _set_class_ndarray, _reg_extension, _make_array, _from_dlpack
+        from ._cy2.core import _set_class_ndarray, _make_array, _from_dlpack
         from ._cy2.core import NDArrayBase as _NDArrayBase
+        from ._cy2.core import _reg_extension, _reg_ndarray
 except IMPORT_EXCEPT:
     # pylint: disable=wrong-import-position
-    from ._ctypes.ndarray import _set_class_ndarray, _reg_extension, _make_array, _from_dlpack
+    from ._ctypes.ndarray import _set_class_ndarray, _make_array, _from_dlpack
     from ._ctypes.ndarray import NDArrayBase as _NDArrayBase
+    from ._ctypes.ndarray import _reg_extension, _reg_ndarray


 def context(dev_type, dev_id=0):
@@ -111,7 +114,7 @@ def empty(shape, dtype="float32", ctx=context(1, 0)):
                                 ctx.device_type,
                                 ctx.device_id,
                                 ctypes.byref(handle)))
-    return _make_array(handle, False)
+    return _make_array(handle, False, False)


 def from_dlpack(dltensor):
@@ -295,6 +298,7 @@ def free_extension_handle(handle, type_code):
     """
     check_call(_LIB.TVMExtTypeFree(handle, ctypes.c_int(type_code)))

+
 def register_extension(cls, fcreate=None):
     """Register an extension class to TVM.

@@ -306,21 +310,26 @@ def register_extension(cls, fcreate=None):
     cls : class
         The class object to be registered as extension.

+    fcreate : function, optional
+        The creation function to create a class object given handle value.
+
     Note
     ----
-    The registered class is requires one property: _tvm_handle and a class attribute _tvm_tcode.
+    The registered class is required to have one property: _tvm_handle.
+
+    If the registered class is a subclass of NDArray,
+    it is required to have a class attribute _array_type_code.
+    Otherwise, it is required to have a class attribute _tvm_tcode.

-    - ```_tvm_handle``` returns an integer representing the address of the handle.
-    - ```_tvm_tcode``` gives integer represents type code of the class.
+    - ```_tvm_handle``` returns an integer representing the address of the handle.
+    - ```_tvm_tcode``` or ```_array_type_code``` gives an integer representing
+      the type code of the class.

     Returns
     -------
     cls : class
         The class being registered.

-    fcreate : function, optional
-        The creation function to create a class object given handle value.
- Example ------- The following code registers user defined class @@ -339,7 +348,13 @@ def __init__(self): def _tvm_handle(self): return self.handle.value """ - if fcreate and cls._tvm_tcode < TypeCode.EXT_BEGIN: - raise ValueError("Cannot register create when extension tcode is same as buildin") - _reg_extension(cls, fcreate) + if issubclass(cls, _NDArrayBase): + assert fcreate is not None + assert hasattr(cls, "_array_type_code") + _reg_ndarray(cls, fcreate) + else: + assert hasattr(cls, "_tvm_tcode") + if fcreate and cls._tvm_tcode < TypeCode.EXT_BEGIN: + raise ValueError("Cannot register create when extension tcode is same as buildin") + _reg_extension(cls, fcreate) return cls diff --git a/python/tvm/_ffi/node_generic.py b/python/tvm/_ffi/node_generic.py index e86453499faa..9f9c5383e3ba 100644 --- a/python/tvm/_ffi/node_generic.py +++ b/python/tvm/_ffi/node_generic.py @@ -36,16 +36,16 @@ def convert_to_node(value): """ if isinstance(value, _CLASS_NODE_BASE): return value - elif isinstance(value, bool): + if isinstance(value, bool): return const(value, 'uint1x1') - elif isinstance(value, Number): + if isinstance(value, Number): return const(value) - elif isinstance(value, string_types): + if isinstance(value, string_types): return _api_internal._str(value) - elif isinstance(value, (list, tuple)): + if isinstance(value, (list, tuple)): value = [convert_to_node(x) for x in value] return _api_internal._Array(*value) - elif isinstance(value, dict): + if isinstance(value, dict): vlist = [] for item in value.items(): if (not isinstance(item[0], _CLASS_NODE_BASE) and @@ -54,12 +54,12 @@ def convert_to_node(value): vlist.append(item[0]) vlist.append(convert_to_node(item[1])) return _api_internal._Map(*vlist) - elif isinstance(value, NodeGeneric): + if isinstance(value, NodeGeneric): return value.asnode() - elif value is None: + if value is None: return None - else: - raise ValueError("don't know how to convert type %s to node" % type(value)) + + raise ValueError("don't know how to convert type %s to node" % type(value)) def const(value, dtype=None): diff --git a/python/tvm/_ffi/runtime_ctypes.py b/python/tvm/_ffi/runtime_ctypes.py index ef5316b5e267..e1b78735a97d 100644 --- a/python/tvm/_ffi/runtime_ctypes.py +++ b/python/tvm/_ffi/runtime_ctypes.py @@ -240,3 +240,12 @@ class TVMArray(ctypes.Structure): ("byte_offset", ctypes.c_uint64)] TVMArrayHandle = ctypes.POINTER(TVMArray) + +class TVMNDArrayContainer(ctypes.Structure): + """TVM NDArray::Container""" + _fields_ = [("dl_tensor", TVMArray), + ("manager_ctx", ctypes.c_void_p), + ("deleter", ctypes.c_void_p), + ("array_type_info", ctypes.c_int32)] + +TVMNDArrayContainerHandle = ctypes.POINTER(TVMNDArrayContainer) diff --git a/python/tvm/api.py b/python/tvm/api.py index 10a97171e58f..7b81f863f6b0 100644 --- a/python/tvm/api.py +++ b/python/tvm/api.py @@ -136,7 +136,7 @@ def load_json(json_str): def save_json(node): - """Load tvm object as json string. + """Save tvm object as json string. Parameters ---------- @@ -515,7 +515,7 @@ def decl_buffer(shape, scope="", data_alignment=-1, offset_factor=0): - """Decleare a new symbolic buffer. + """Declare a new symbolic buffer. Normally buffer is created automatically during lower and build. This is only needed if user want to specify their own buffer layout. @@ -587,6 +587,49 @@ def decl_buffer(shape, data, dtype, shape, strides, elem_offset, name, scope, data_alignment, offset_factor) +def layout(layout_str): + """Create a layout node from a string. 
+ + Parameters + ---------- + layout_str : str + A layout representation is composed of upper cases, lower cases and numbers, + where upper case indicates a primal axis and + the corresponding lower case with factor size indicates the subordinate axis. + For example, NCHW16c can describe a 5-D tensor of + [batch_size, channel, height, width, channel_block]. + Here subordinate axis channel_block=16 is the factor size of + the primal axis C (channel). + + Returns + ------- + layout : Layout + The created layout + """ + return _api_internal._Layout(layout_str) + +def bijective_layout(src_layout, dst_layout): + """Create a bijective layout mapping. + + Parameters + ---------- + src_layout : str or Layout + source layout. + + dst_layout : str or Layout + destination layout. + + Returns + ------- + bijective_layout : BijectiveLayout + The created bijective layout + """ + if isinstance(src_layout, str): + src_layout = layout(src_layout) + if isinstance(dst_layout, str): + dst_layout = layout(dst_layout) + return _api_internal._BijectiveLayout(src_layout, dst_layout) + def _IterVar(dom, name, iter_type, thread_tag=''): """Internal function to create IterVar diff --git a/python/tvm/arith.py b/python/tvm/arith.py index 980c87d90316..3981a4815aeb 100644 --- a/python/tvm/arith.py +++ b/python/tvm/arith.py @@ -31,11 +31,180 @@ def max(self): @register_node class StrideSet(IntSet): """Represent set of strided integers""" - pass -@register_node -class ModularSet(IntSet): + +@register_node("arith.ModularSet") +class ModularSet(NodeBase): """Represent range of (coeff * x + base) for x in Z """ - pass + def __init__(self, coeff, base): + self.__init_handle_by_constructor__( + _make_ModularSet, coeff, base) + + +@register_node("arith.ConstIntBound") +class ConstIntBound(NodeBase): + """Represent constant integer bound + + Parameters + ---------- + min_value : int + The minimum value of the bound. + + max_value : int + The maximum value of the bound. + """ + POS_INF = (1 << 63) - 1 + NEG_INF = -POS_INF + + def __init__(self, min_value, max_value): + self.__init_handle_by_constructor__( + _make_ConstIntBound, min_value, max_value) + + +class ConstraintScope: + """Constraint scope. + + Parameters + ---------- + fenter : function + A function that will be called to create an enter context. + + Note + ---- + Do not create object directly, use Analyzer.constraint_scope + """ + def __init__(self, fenter): + self._fenter = fenter + self._fexit = None + + def __enter__(self): + self._fexit = self._fenter() + + def __exit__(self, ptype, value, trace): + self._fexit() + + +class Analyzer: + """Integer arithmetic analyzer + + This is a stateful analyzer class that can + be used to perform various symbolic integer analysis. + """ + def __init__(self): + _mod = _CreateAnalyzer() + self._const_int_bound = _mod("const_int_bound") + self._const_int_bound_update = _mod("const_int_bound_update") + self._bind = _mod("bind") + self._modular_set = _mod("modular_set") + self._rewrite_simplify = _mod("rewrite_simplify") + self._enter_constraint_context = _mod("enter_constraint_context") + + def const_int_bound(self, expr): + """Find constant integer bound for expr. + + Parameters + ---------- + expr : tvm.Expr + The expression. + + Returns + ------- + bound : ConstIntBound + The result bound + """ + return self._const_int_bound(expr) + + def modular_set(self, expr): + """Find a modular set that expr belongs to. + + Parameters + ---------- + expr : tvm.Expr + The expression. + + Returns + ------- + result : ModularSet + The result. 
+ """ + return self._modular_set(expr) + + def rewrite_simplify(self, expr): + """Simplify expression via rewriting rules. + + Parameters + ---------- + expr : tvm.Expr + The expression. + + Returns + ------- + result : Expr + The result. + """ + return self._rewrite_simplify(expr) + + def bind(self, var, expr): + """Bind a variable to the expression. + + Parameters + ---------- + var : tvm.Var + The variable. + + expr : tvm.Expr + The expression. + """ + return self._bind(var, expr) + + def constraint_scope(self, constraint): + """Create a constraint scope. + + Parameters + ---------- + constraint : tvm.Expr + The constraint expression. + + returns + ------- + scope : ConstraintScope + The constraint scope + + Examples + -------- + .. code-block:: python + + x = tvm.var("x") + analyzer = tvm.arith.Analyzer() + with analzyer.constraint_scope(x % 3 == 0): + # constraint in effect + assert analyzer.modular_set(x).coeff == 3 + # constraint no longer in effect + assert analyzer.modular_set(x).coeff != 3 + """ + def _fenter(): + return self._enter_constraint_context(constraint) + return ConstraintScope(_fenter) + + def update(self, var, info, override=False): + """Update infomation about var + + Parameters + ---------- + var : tvm.Var + The variable. + + info : tvm.NodeBase + Related information. + + override : bool + Whether allow override. + """ + if isinstance(info, ConstIntBound): + self._const_int_bound_update(var, info, override) + else: + raise TypeError( + "Do not know how to handle type {}".format(type(info))) + _init_api("tvm.arith") diff --git a/python/tvm/autotvm/measure/executor.py b/python/tvm/autotvm/measure/executor.py index f3ba4236ce63..ae48b9ba4c37 100644 --- a/python/tvm/autotvm/measure/executor.py +++ b/python/tvm/autotvm/measure/executor.py @@ -69,15 +69,14 @@ def get(self, timeout=None): class FutureError(RuntimeError): """Base error class of all future events""" - pass + # pylint:disable=redefined-builtin class TimeoutError(FutureError): """Error raised when a task is timeout.""" - pass + class ExecutionError(FutureError): """ Error raised when future execution crashes or failed. 
""" - pass diff --git a/python/tvm/autotvm/record.py b/python/tvm/autotvm/record.py index 5adfae465ce3..c09fc82fb72c 100644 --- a/python/tvm/autotvm/record.py +++ b/python/tvm/autotvm/record.py @@ -83,7 +83,7 @@ def encode(inp, result, protocol='json'): "v": AUTOTVM_LOG_VERSION } return json.dumps(json_dict) - elif protocol == 'pickle': + if protocol == 'pickle': row = (str(inp.target), str(base64.b64encode(pickle.dumps([inp.task.name, inp.task.args, @@ -92,8 +92,8 @@ def encode(inp, result, protocol='json'): str(base64.b64encode(pickle.dumps(inp.config)).decode()), str(base64.b64encode(pickle.dumps(tuple(result))).decode())) return '\t'.join(row) - else: - raise RuntimeError("Invalid log protocol: " + protocol) + + raise RuntimeError("Invalid log protocol: " + protocol) def decode(row, protocol='json'): @@ -136,7 +136,7 @@ def clean_json_to_python(x): result = MeasureResult(*[tuple(x) if isinstance(x, list) else x for x in row["r"]]) return inp, result - elif protocol == 'pickle': + if protocol == 'pickle': items = row.split("\t") tgt = _target.create(items[0]) task_tuple = pickle.loads(base64.b64decode(items[1].encode())) @@ -146,8 +146,8 @@ def clean_json_to_python(x): tsk = task.Task(task_tuple[0], task_tuple[1]) tsk.workload = task_tuple[3] return MeasureInput(tgt, tsk, config), MeasureResult(*result) - else: - raise RuntimeError("Invalid log protocol: " + protocol) + + raise RuntimeError("Invalid log protocol: " + protocol) def load_from_file(filename): diff --git a/python/tvm/autotvm/task/dispatcher.py b/python/tvm/autotvm/task/dispatcher.py index f497ddc038db..d2ef480b44ee 100644 --- a/python/tvm/autotvm/task/dispatcher.py +++ b/python/tvm/autotvm/task/dispatcher.py @@ -294,7 +294,8 @@ def load(self, records): # use model as key to build best map key = (inp.target.model, inp.task.workload) if key not in best_by_model: - best_by_model[key] = (inp, res) + if inp.target.model != 'unknown': + best_by_model[key] = (inp, res) else: _, other_res = best_by_model[key] if np.mean(other_res.costs) > np.mean(res.costs): diff --git a/python/tvm/autotvm/task/space.py b/python/tvm/autotvm/task/space.py index 3fb02c6190cf..09f2dd0576a5 100644 --- a/python/tvm/autotvm/task/space.py +++ b/python/tvm/autotvm/task/space.py @@ -32,7 +32,6 @@ class InstantiationError(ValueError): raised by cfg.raise_error e.g. too many unrolling, too many threads in a block """ - pass class TransformSpace(object): @@ -321,17 +320,17 @@ def _merge_dfs(self, chains, size, tmp_pt, tmp_stack, merged): if np.sum(tmp_pt) == size: merged.append(list(tmp_stack)) return - else: - for i in range(len(chains)): - # use i == np.argmax(....) here to take spatial order into consideration - # if we don't want to consider spatial order, we can use tmp_pt[i] == np.max(....) - if (tmp_pt[i] < len(chains[i]) and - (i == np.argmax([len(chains[x]) - tmp_pt[x] for x in range(len(chains))]))): - tmp_stack.append(chains[i][tmp_pt[i]]) - tmp_pt[i] += 1 - self._merge_dfs(chains, size, tmp_pt, tmp_stack, merged) - tmp_pt[i] -= 1 - tmp_stack.pop() + + for i in range(len(chains)): + # use i == np.argmax(....) here to take spatial order into consideration + # if we don't want to consider spatial order, we can use tmp_pt[i] == np.max(....) 
+ if (tmp_pt[i] < len(chains[i]) and + (i == np.argmax([len(chains[x]) - tmp_pt[x] for x in range(len(chains))]))): + tmp_stack.append(chains[i][tmp_pt[i]]) + tmp_pt[i] += 1 + self._merge_dfs(chains, size, tmp_pt, tmp_stack, merged) + tmp_pt[i] -= 1 + tmp_stack.pop() class ReorderEntity(object): @@ -441,7 +440,7 @@ def _generate_space(self, now, tmp_stack): if now == self.num_axis: # only vectorize inner most dimension vec_ct = tmp_stack.count('vec') - if vec_ct == 0 or vec_ct == 1: + if vec_ct in (0, 1): self.entities.append(AnnotateEntity(list(tmp_stack))) else: for ann in self.anns[now]: diff --git a/python/tvm/autotvm/task/task.py b/python/tvm/autotvm/task/task.py index 22a15143b96e..a0c992b07347 100644 --- a/python/tvm/autotvm/task/task.py +++ b/python/tvm/autotvm/task/task.py @@ -294,7 +294,7 @@ def get_config(): class FlopCalculationError(RuntimeError): """Error happens when estimating FLOP for a compute op""" - pass + def compute_flop(sch): """Calculate number of FLOP (floating number operations) of the compute ops in a schedule @@ -328,29 +328,33 @@ def _count_flop(exp): if len(source) != 1: raise FlopCalculationError("Found multiple output in the source of reduce op") return num_iter * (_count_flop(combiner[0]) + _count_flop(source[0])) - elif isinstance(exp, (expr.FloatImm, expr.IntImm, expr.UIntImm)): + if isinstance(exp, (expr.FloatImm, expr.IntImm, expr.UIntImm)): return 0 - elif isinstance(exp, expr.Cast): + if isinstance(exp, expr.Cast): return _count_flop(exp.value) - elif isinstance(exp, expr.Var): + if isinstance(exp, expr.Var): return 0 - elif isinstance(exp, (expr.Add, expr.Sub, expr.Mul, expr.Div, expr.Mod, - expr.Max, expr.Min, - expr.EQ, expr.NE, expr.LT, expr.LE, expr.GT, expr.GE, - expr.And, expr.Or, expr.Not)): - base = 1 if "float" in exp.a.dtype else 0 + if isinstance(exp, (expr.Add, expr.Sub, expr.Mul, expr.Div, expr.Mod, + expr.Max, expr.Min, + expr.EQ, expr.NE, expr.LT, expr.LE, expr.GT, expr.GE, + expr.And, expr.Or, expr.Not)): + base = 1 if isinstance(exp, expr.Not): # unary return base + _count_flop(exp.a) return base + _count_flop(exp.a) + _count_flop(exp.b) - elif isinstance(exp, expr.Select): + if isinstance(exp, expr.Select): return _count_flop(exp.condition) + max(_count_flop(exp.true_value), _count_flop(exp.false_value)) - elif isinstance(exp, expr.Call): + if isinstance(exp, expr.Call): + if exp.call_type == expr.Call.Halide: + # Ignore flops from indexing expressions. + return 0 + return sum([_count_flop(x) for x in exp.args]) - else: - raise FlopCalculationError("Found unsupported operator in the compute expr") + + raise FlopCalculationError("Found unsupported operator in the compute expr") def traverse(ops): """accumulate flops""" diff --git a/python/tvm/autotvm/tuner/tuner.py b/python/tvm/autotvm/tuner/tuner.py index abd7ec4fad0b..120c97c2c003 100644 --- a/python/tvm/autotvm/tuner/tuner.py +++ b/python/tvm/autotvm/tuner/tuner.py @@ -69,7 +69,7 @@ def update(self, inputs, results): results: Array of autotvm.measure.MeasureResult result for measurement """ - pass + def tune(self, n_trial, measure_option, early_stopping=None, callbacks=()): """Begin tuning diff --git a/python/tvm/autotvm/util.py b/python/tvm/autotvm/util.py index 2b52bfb46992..528eb24be380 100644 --- a/python/tvm/autotvm/util.py +++ b/python/tvm/autotvm/util.py @@ -4,6 +4,8 @@ import multiprocessing import time +from random import randrange + import numpy as np from .. 
 import expr, ir_pass
@@ -59,9 +61,9 @@ def sample_ints(low, high, m):
     vis = set()
     assert m <= high - low
     while len(vis) < m:
-        new = np.random.randint(low, high)
+        new = randrange(low, high)
         while new in vis:
-            new = np.random.randint(low, high)
+            new = randrange(low, high)
         vis.add(new)
     return list(vis)
diff --git a/python/tvm/container.py b/python/tvm/container.py
index ba30255f650a..e384a742c36f 100644
--- a/python/tvm/container.py
+++ b/python/tvm/container.py
@@ -90,7 +90,7 @@ class Range(NodeBase):
     You do not need to create Range explicitly.
     Python list and tuple will be converted automatically to Range in api functions.
     """
-    pass
+
 
 @register_node
 class LoweredFunc(NodeBase):
diff --git a/python/tvm/contrib/cc.py b/python/tvm/contrib/cc.py
index 0ffa6c420243..ee84da820902 100644
--- a/python/tvm/contrib/cc.py
+++ b/python/tvm/contrib/cc.py
@@ -85,13 +85,13 @@ def _windows_shared(output, objects, options):
             cl_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
         (out, _) = proc.communicate()
     except FileNotFoundError:
-        raise RuntimeError("can not found cl.exe,"
+        raise RuntimeError("Cannot find cl.exe; "
                            "please run this in the Visual Studio Command Prompt.")
     if proc.returncode != 0:
         msg = "Compilation error:\n"
         msg += py_str(out)
         raise RuntimeError(msg)
-    link_cmd = ["link"]
+    link_cmd = ["lld-link"]
     link_cmd += ["-dll", "-FORCE:MULTIPLE"]
 
     for obj in objects:
@@ -111,8 +111,11 @@ def _windows_shared(output, objects, options):
             link_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
         (out, _) = proc.communicate()
     except FileNotFoundError:
-        raise RuntimeError("can not found link.exe,"
-                           "please run this in Vistual Studio Command Prompt.")
+        raise RuntimeError("Cannot find the LLVM linker for Windows (lld-link.exe). "
+                           "Make sure it's installed"
+                           " and the installation directory is in the %PATH% environment "
+                           "variable. Prebuilt binaries can be found at https://llvm.org/. "
+                           "For building the linker on your own see https://lld.llvm.org/#build")
     if proc.returncode != 0:
         msg = "Compilation error:\n"
         msg += py_str(out)
diff --git a/python/tvm/contrib/debugger/debug_runtime.py b/python/tvm/contrib/debugger/debug_runtime.py
index a627a32dbd16..725f212fce00 100644
--- a/python/tvm/contrib/debugger/debug_runtime.py
+++ b/python/tvm/contrib/debugger/debug_runtime.py
@@ -89,6 +89,7 @@ def __init__(self, module, ctx, graph_json_str, dump_root):
         self._dump_path = None
         self._debug_run = module["debug_run"]
         self._get_output_by_layer = module["get_output_by_layer"]
+        self._run_individual = module["run_individual"]
         graph_runtime.GraphModule.__init__(self, module)
         self._create_debug_env(graph_json_str, ctx)
 
@@ -222,6 +223,9 @@ def run(self, **input_dict):
         # Step 3. Display the collected information
         self.debug_datum.display_debug_result()
 
+    def run_individual(self, number, repeat=1, min_repeat_ms=0):
+        """Run each node in the graph individually and collect timing;
+        the arguments follow the usual ``time_evaluator`` semantics
+        (``number`` runs per measurement, ``repeat`` measurements,
+        each lasting at least ``min_repeat_ms``)."""
+        self._run_individual(number, repeat, min_repeat_ms)
+
     def exit(self):
         """Exits the dump folder and all its contents"""
         self._remove_dump_root()
diff --git a/python/tvm/contrib/nvcc.py b/python/tvm/contrib/nvcc.py
index 21cc4844087c..f9970f6bceb2 100644
--- a/python/tvm/contrib/nvcc.py
+++ b/python/tvm/contrib/nvcc.py
@@ -151,14 +151,14 @@ def find_libdevice_path(arch):
     selected_ver = 0
     selected_path = None
     cuda_ver = get_cuda_version(cuda_path)
-    if cuda_ver == 9.0 or cuda_ver == 9.1:
+    if cuda_ver in (9.0, 9.1, 10.0):
         path = os.path.join(lib_path, "libdevice.10.bc")
     else:
         for fn in os.listdir(lib_path):
             if not fn.startswith("libdevice"):
                 continue
             ver = int(fn.split(".")[-3].split("_")[-1])
-            if ver > selected_ver and ver <= arch:
+            if selected_ver < ver <= arch:
                 selected_ver = ver
                 selected_path = fn
         if selected_path is None:
diff --git a/python/tvm/contrib/verilog.py b/python/tvm/contrib/verilog.py
index 358366684fa4..f904a0cb01bf 100644
--- a/python/tvm/contrib/verilog.py
+++ b/python/tvm/contrib/verilog.py
@@ -118,8 +118,7 @@ def _find_vpi_path():
     vpi_found = [p for p in vpi_path if os.path.exists(p) and os.path.isfile(p)]
     if vpi_found:
         return os.path.dirname(vpi_found[0])
-    else:
-        raise ValueError("Cannot find tvm_vpi.vpi, make sure you did `make verilog`")
+    raise ValueError("Cannot find tvm_vpi.vpi, make sure you did `make verilog`")
 
 def search_path():
     """Get the search directory."""
diff --git a/python/tvm/hybrid/calls.py b/python/tvm/hybrid/calls.py
index 84ae537d49ab..56a73f784fa0 100644
--- a/python/tvm/hybrid/calls.py
+++ b/python/tvm/hybrid/calls.py
@@ -4,6 +4,7 @@
 from .. import api as _api
 from .. import expr as _expr
 from .. import make as _make
+from .. import target as _tgt
 from ..container import Array
 from ..
import ir_pass from ..stmt import For @@ -45,8 +46,8 @@ def bind(func_id, args): _internal_assert(args.__len__() == 2, "A loop bind should only have 2 arguments!") _internal_assert(isinstance(args[0], str), \ "A loop bind's first argument should be a string!") - iter_var = _api.thread_axis(args[0]) low, ext = _api.const(0, "int32"), args[1] + iter_var = _api.thread_axis((low, ext), args[0]) for_type = None return iter_var, low, ext, for_type @@ -123,7 +124,7 @@ def ceil_div(func_id, args): _internal_assert(isinstance(args[0], _expr.Expr), "Only expressions can div") _internal_assert(isinstance(args[1], _expr.Expr), "Only expressions can div") a, b = args[0], args[1] - return (a + b - 1) / b + return (a + b - 1) // b def likely(func_id, args): @@ -131,3 +132,14 @@ def likely(func_id, args): "Only one expression can be likely") _internal_assert(func_id == "likely", "This function cannot be directly invoked!") return call_pure_intrin(args[0].dtype, 'likely', *args) + + +def max_num_threads(func_id, args): + _internal_assert(func_id == "max_num_threads", "This function cannot be directly invoked!") + _internal_assert(args.__len__() <= 1, "At most one argument accepted!") + if args.__len__() == 0: + res = _tgt.current_target().max_num_threads + else: + _internal_assert(isinstance(args[0], _expr.UIntImm), "In tvm bool should be uint") + res = _tgt.current_target(args[0].value).max_num_threads + return _api.convert(res) diff --git a/python/tvm/hybrid/parser.py b/python/tvm/hybrid/parser.py index b9d64866b305..67a6f6632d16 100644 --- a/python/tvm/hybrid/parser.py +++ b/python/tvm/hybrid/parser.py @@ -12,7 +12,7 @@ from .util import _internal_assert from . import calls from . import util -from .var_decl import determine_variable_usage +from .preprocessor import determine_variable_usage from ..api import all as _all from ..api import any as _any from ..container import Array @@ -61,6 +61,7 @@ class Symbol(Enum): BufferVar = 7 LoopVar = 8 ConstLoopVar = 9 + ThreadBind = 10 class HybridParser(ast.NodeVisitor): @@ -117,7 +118,10 @@ def __init__(self, args, usage, symbols, func_name=None): self.symbols = {} # Symbol table for k, v in symbols.items(): if isinstance(v, types.FunctionType): - self.symbols[k] = Symbol.Callable, v + self.add_symbol(k, Symbol.Callable, v) + + self.binds = {} # Thread binds + self.device = 0 # Is it generating device self.func_name = func_name # The name of the function to be lowered self.outputs = [] # Output tensors' name @@ -126,6 +130,25 @@ def __init__(self, args, usage, symbols, func_name=None): self.returned = False # If this function has a valid return + def add_symbol(self, key, ty, val): #pylint: disable=invalid-name + """Add value to the symbol table context""" + if key in self.symbols.keys(): + old = str(self.symbols[key]) + new = str((ty, val)) + _internal_assert(False, + "Name conflict in symbol table! 
[%s] %s -> %s" % (key, old, new)) + + self.symbols[key] = ty, val + + if ty == Symbol.ThreadBind: + if val.var.name not in self.binds.keys(): + self.binds[val.var.name] = val + return + val_ = self.binds[val.var.name] + _internal_assert(_ir_pass.Equal(val_.dom.extent, val.dom.extent), + "Thread extents should be uniform!") + self.symbols[key] = ty, val_ + def wrap_up_realize(self, node, body): """Wrap up all the variables which will no longer be used""" @@ -141,11 +164,14 @@ def wrap_up_realize(self, node, body): continue elif 'Buffer' in ty.name: _buf = entry - _scope = ty.name[:-6].lower() if ty is not Symbol.BufferVar else 'global' + _scope = 'global' if ty is Symbol.BufferVar else ty.name[:-6].lower() to_pop.append(key) else: continue + if _scope == 'global': + body = self.wrap_up_binds(body) + _domain = [_make.range_by_min_extent(0, i) for i in _buf.shape] _dtype = _buf.dtype _true = _api.convert(True) @@ -158,6 +184,14 @@ def wrap_up_realize(self, node, body): return body + def wrap_up_binds(self, body): + for _, iter_var in self.binds.items(): + ext = iter_var.dom.extent + body = _make.AttrStmt(iter_var, 'thread_extent', ext, body) + self.binds = {} + return body + + #pylint: disable=invalid-name, missing-docstring def visit_Module(self, node): _internal_assert(len(node.body) == 1, \ @@ -173,10 +207,10 @@ def visit_FunctionDef(self, node): self.func_name = node.name for idx, arg in enumerate(node.args.args): _attr = 'id' if sys.version_info[0] < 3 else 'arg' # To make py2 and 3 compatible - self.symbols[getattr(arg, _attr)] = (Symbol.Input, self.args[idx]) + self.add_symbol(getattr(arg, _attr), Symbol.Input, self.args[idx]) res = visit_list_to_block(self.visit, node.body) res = self.wrap_up_realize(node, res) - return res + return self.wrap_up_binds(res) def visit_Expr(self, node): @@ -185,13 +219,17 @@ def visit_Expr(self, node): def visit_Name(self, node): name = node.id + if sys.version_info[0] == 2 and name in ['True', 'False']: + return _api.convert(eval(name)) #pylint: disable=eval-used ty, entry = self.symbols[name] _internal_assert(name in self.symbols, "Unknown symbol %s!" 
% name) if ty in [Symbol.LoopVar, Symbol.Input, Symbol.ConstLoopVar]: return entry - elif ty is Symbol.ConstVar: + if ty is Symbol.ThreadBind: + return entry.var + if ty is Symbol.ConstVar: return entry if isinstance(node.ctx, ast.Load) else None - elif ty is Symbol.BufferVar: + if ty is Symbol.BufferVar: if isinstance(node.ctx, ast.Load): return _make.Call(entry.dtype, entry.name, [_api.const(0, 'int32')], \ _expr.Call.Halide, entry.op, entry.value_index) @@ -212,6 +250,10 @@ def visit_Num(self, node): return _api.const(node.n, dtype) + def visit_NameConstant(self, node): + return _api.convert(node.value) + + def visit_AugAssign(self, node): buf = self.visit(node.target) rhs = self.visit(node.value) @@ -237,7 +279,7 @@ def visit_Assign(self, node): for i in range(rhs.num_outputs): _internal_assert(isinstance(node.targets[i], ast.Name), "You should bind a pure name to the tensors") - self.symbols[node.targets[i].id] = Symbol.GlobalBuffer, rhs.output(i) + self.add_symbol(node.targets[i].id, Symbol.GlobalBuffer, rhs.output(i)) rmap[rhs.outputs[i].op] = rhs.output(i) return util.replace_io(rhs.body, rmap) @@ -260,26 +302,30 @@ def visit_Assign(self, node): if isinstance(rhs, tuple): shape, dtype, scope = rhs ph = _api.placeholder(shape, dtype=dtype, name=lhs) - self.symbols[lhs] = getattr(Symbol, scope.title() + "Buffer"), ph + self.add_symbol(lhs, getattr(Symbol, scope.title() + "Buffer"), ph) if scope == 'output': self.outputs.append(lhs) return util.make_nop() if isinstance(rhs, util.halide_imm_types) and ast.Store not in rw: - self.symbols[lhs] = Symbol.ConstVar, rhs + self.add_symbol(lhs, Symbol.ConstVar, rhs) else: + _internal_assert(self.device == 0, + "Single variable not supported in devices' side!\n" + \ + "If you are using GPU, please allocate a 'local' spad " + \ + "outside the bind body") ph = _api.placeholder((1, ), dtype=rhs.dtype, name=lhs) - self.symbols[lhs] = Symbol.BufferVar, ph + self.add_symbol(lhs, Symbol.BufferVar, ph) lhs = self.visit(lhs_) if lhs is not None: buf, args = lhs return _make.Provide(buf.op, 0, rhs, args) return util.make_nop() - else: - lhs, args = self.visit(lhs) - _internal_assert(isinstance(lhs, Tensor), \ - "An array access's LHS is expected to be a expr.Call!") - res = _make.Provide(lhs.op, lhs.value_index, rhs, args) - return res + + lhs, args = self.visit(lhs) + _internal_assert(isinstance(lhs, Tensor), \ + "An array access's LHS is expected to be a expr.Call!") + res = _make.Provide(lhs.op, lhs.value_index, rhs, args) + return res def visit_Index(self, node): @@ -347,7 +393,7 @@ def visit_If(self, node): if isinstance(cond, _expr.UIntImm): if cond.value: return visit_list_to_block(self.visit, node.body) - elif node.orelse: + if node.orelse: return visit_list_to_block(self.visit, node.orelse) return util.make_nop() @@ -356,7 +402,7 @@ def visit_If(self, node): if node.orelse: else_body = visit_list_to_block(self.visit, node.orelse) else: - else_body = util.make_nop() + else_body = None return _make.IfThenElse(cond, if_body, else_body) @@ -410,17 +456,18 @@ def visit_Call(self, node): func_id = node.func.id args = [self.visit(i) for i in node.args] - try: + # Intrinsics' + if hasattr(calls, func_id): return getattr(calls, func_id)(func_id, args) - except AttributeError: - _internal_assert(func_id in self.symbols.keys(), \ - "The function called is not in the context either!") - ty, entry = self.symbols[func_id] - _internal_assert(ty is Symbol.Callable, \ - "Are you sure what you call is a function?!") - outs = entry(*args) - op = outs.op if 
isinstance(outs, Tensor) else outs[0].op - return op + # Contexts' + _internal_assert(func_id in self.symbols.keys(), \ + "The function called (%s) is not in the context either!" % func_id) + ty, entry = self.symbols[func_id] + _internal_assert(ty is Symbol.Callable, \ + "Are you sure what you call is a function?!") + outs = entry(*args) + op = outs.op if isinstance(outs, Tensor) else outs[0].op + return op def visit_For(self, node): @@ -445,28 +492,31 @@ def visit_For(self, node): bodies = [] for i in range(low, low + ext): - self.symbols[_name] = Symbol.ConstLoopVar, i + self.add_symbol(_name, Symbol.ConstLoopVar, i) body = visit_list_to_block(self.visit, node.body) body = self.wrap_up_realize(node, body) bodies.append(body) + self.symbols.pop(_name) return concat_list_to_block(bodies) - elif iter_var is None: - _internal_assert(for_type is not None, "The loop bind function parse error!") + if iter_var is None: + _internal_assert(for_type is not None, "The loop iterating function parse error!") offset = iter_var = _api.var(_name) if not _ir_pass.Equal(low, _api.const(0, 'int32')): offset = iter_var + low - self.symbols[_name] = Symbol.LoopVar, offset + self.add_symbol(_name, Symbol.LoopVar, offset) _body = visit_list_to_block(self.visit, node.body) else: - _internal_assert(for_type is None, "The loop iterating function parse error!") - self.symbols[_name] = Symbol.LoopVar, iter_var.var + _internal_assert(for_type is None, "The loop bind function parse error!") + self.add_symbol(_name, Symbol.ThreadBind, iter_var) + self.device += 1 _body = visit_list_to_block(self.visit, node.body) + self.device -= 1 _body = self.wrap_up_realize(node, _body) if for_type is None: - res = _make.AttrStmt(iter_var, 'thread_extent', ext, _body) + res = _body else: _internal_assert(not isinstance(for_type, tuple), \ "Micro expansion should be handled before!") diff --git a/python/tvm/hybrid/var_decl.py b/python/tvm/hybrid/preprocessor.py similarity index 95% rename from python/tvm/hybrid/var_decl.py rename to python/tvm/hybrid/preprocessor.py index 50b610567c74..a83fb2eae287 100644 --- a/python/tvm/hybrid/var_decl.py +++ b/python/tvm/hybrid/preprocessor.py @@ -59,6 +59,9 @@ def visit_AugAssign(self, node): def visit_Name(self, node): + # If it is True or False, we do not worry about it! + if sys.version_info[0] == 2 and node.id in ['True', 'False']: + return # If it is from the argument list or loop variable, we do not worry about it! if node.id in self._args.keys(): return diff --git a/python/tvm/hybrid/runtime.py b/python/tvm/hybrid/runtime.py index 293e069c24ea..b3c744f42652 100644 --- a/python/tvm/hybrid/runtime.py +++ b/python/tvm/hybrid/runtime.py @@ -1,6 +1,7 @@ """Intrinsics of TVM-Python Hybrid Script for Python emulation runtime""" import numpy +from .. 
import target class bind(object): #pylint: disable=invalid-name @@ -72,34 +73,40 @@ def sigmoid(x): return 1 / (1 + numpy.exp(-x)) +def max_num_threads(allow_none=True): + """Get max number of threads for GPU targets.""" + return target.current_target(allow_none).max_num_threads + + HYBRID_GLOBALS = { - 'unroll' : range, - 'vectorize' : range, - 'parallel' : range, - 'const_range' : range, - 'bind' : bind, - 'allocate' : allocate, - 'output_tensor': allocate, - 'sqrt' : numpy.sqrt, - 'log' : numpy.log, - 'tanh' : numpy.tanh, - 'power' : numpy.power, - 'exp' : numpy.exp, - 'sigmoid' : sigmoid, - 'popcount' : popcount, - 'likely' : lambda cond: cond, - 'uint8' : numpy.uint8, - 'uint16' : numpy.uint16, - 'uint32' : numpy.uint32, - 'uint64' : numpy.uint64, - 'int8' : numpy.int8, - 'int16' : numpy.int16, - 'int32' : numpy.int32, - 'int64' : numpy.int64, - 'float16' : numpy.float16, - 'float32' : numpy.float32, - 'float64' : numpy.float64, - 'ceil_div' : lambda a, b: (a + b - 1) / b + 'unroll' : range, + 'vectorize' : range, + 'parallel' : range, + 'const_range' : range, + 'bind' : bind, + 'allocate' : allocate, + 'output_tensor' : allocate, + 'sqrt' : numpy.sqrt, + 'log' : numpy.log, + 'tanh' : numpy.tanh, + 'power' : numpy.power, + 'exp' : numpy.exp, + 'sigmoid' : sigmoid, + 'popcount' : popcount, + 'likely' : lambda cond: cond, + 'uint8' : numpy.uint8, + 'uint16' : numpy.uint16, + 'uint32' : numpy.uint32, + 'uint64' : numpy.uint64, + 'int8' : numpy.int8, + 'int16' : numpy.int16, + 'int32' : numpy.int32, + 'int64' : numpy.int64, + 'float16' : numpy.float16, + 'float32' : numpy.float32, + 'float64' : numpy.float64, + 'ceil_div' : lambda a, b: (a + b - 1) // b, + 'max_num_threads': max_num_threads } diff --git a/python/tvm/hybrid/util.py b/python/tvm/hybrid/util.py index 56190a82765e..dcccaa465883 100644 --- a/python/tvm/hybrid/util.py +++ b/python/tvm/hybrid/util.py @@ -60,7 +60,7 @@ def replace(op): if isinstance(op, _stmt.Provide) and op.func in rmap.keys(): buf = rmap[op.func] return _make.Provide(buf.op, op.value_index, op.value, op.args) - elif isinstance(op, _expr.Call) and op.func in rmap.keys(): + if isinstance(op, _expr.Call) and op.func in rmap.keys(): buf = rmap[op.func] return _make.Call(buf.dtype, buf.name, op.args, \ _expr.Call.Halide, buf.op, buf.value_index) diff --git a/python/tvm/intrin.py b/python/tvm/intrin.py index bb15c314ff23..a0dabfc8a6e2 100644 --- a/python/tvm/intrin.py +++ b/python/tvm/intrin.py @@ -495,7 +495,7 @@ def _rule_float_suffix(op): """ if op.dtype == "float32": return call_pure_extern(op.dtype, "%sf" % op.name, *op.args) - elif op.dtype == "float64": + if op.dtype == "float64": return call_pure_extern(op.dtype, op.name, *op.args) return op diff --git a/python/tvm/make.py b/python/tvm/make.py index 6238fd7f1789..780bdc246508 100644 --- a/python/tvm/make.py +++ b/python/tvm/make.py @@ -56,7 +56,7 @@ def static_cast(dtype, expr): if target_type.type_code == src_type.type_code and src_type.bits == target_type.bits: if src_type.lanes == target_type.lanes: return expr - elif src_type.lanes == 1 and target_type.lanes > 1: + if src_type.lanes == 1 and target_type.lanes > 1: return Broadcast(expr, target_type.lanes) return Cast(dtype, expr) diff --git a/python/tvm/ndarray.py b/python/tvm/ndarray.py index b35c3de63918..567aff6fba9c 100644 --- a/python/tvm/ndarray.py +++ b/python/tvm/ndarray.py @@ -16,7 +16,7 @@ class NDArray(NDArrayBase): """Lightweight NDArray class of TVM runtime. 
- Strictly this is only an Array Container(a buffer object) + Strictly this is only an Array Container (a buffer object) No arthimetic operations are defined. All operations are performed by TVM functions. @@ -24,7 +24,6 @@ class NDArray(NDArrayBase): Instead, this is a minimal data structure to demonstrate how can we use TVM in existing project which might have their own array containers. """ - pass def cpu(dev_id=0): diff --git a/python/tvm/relay/__init__.py b/python/tvm/relay/__init__.py index fe00877c0fb0..6d44d07f4bbf 100644 --- a/python/tvm/relay/__init__.py +++ b/python/tvm/relay/__init__.py @@ -13,6 +13,7 @@ from . import prelude from . import parser from . import debug +from . import param_dict # Root operators from .op import Op @@ -85,3 +86,7 @@ # Parser fromtext = parser.fromtext + +# Param Serialization +save_param_dict = param_dict.save_param_dict +load_param_dict = param_dict.load_param_dict diff --git a/python/tvm/relay/_parser.py b/python/tvm/relay/_parser.py index c0455a3361e9..9fdffab4e62e 100644 --- a/python/tvm/relay/_parser.py +++ b/python/tvm/relay/_parser.py @@ -43,8 +43,8 @@ def __init__(self, message): from antlr4.tree.Tree import TerminalNode except ImportError: raise ParseError("Couldn't find ANTLR runtime." + - "Try running `pip{} install antlr4-python{}-runtime`." - .format(PYTHON_VERSION, PYTHON_VERSION)) + "Try running `pip{version} install antlr4-python{version}-runtime`." + .format(version=PYTHON_VERSION)) BINARY_OPS = { RelayParser.MUL: op.multiply, @@ -179,33 +179,31 @@ def visitTerminal(self, node): # variables if node_type == RelayLexer.GLOBAL_VAR: return lookup(deque([self.global_var_scope]), node_text[1:]) - elif node_type == RelayLexer.LOCAL_VAR: + if node_type == RelayLexer.LOCAL_VAR: # Remove the leading '%' and lookup the name. var = lookup(self.var_scopes, name) if var is None: raise ParseError("Couldn't resolve `{}`.".format(name)) return var - elif node_type == RelayLexer.GRAPH_VAR: + if node_type == RelayLexer.GRAPH_VAR: try: return self.graph_expr[int(name)] except IndexError: raise ParseError("Couldn't resolve `{}`".format(name)) # data types - elif node_type == RelayLexer.NAT: + if node_type == RelayLexer.NAT: return int(node_text) - elif node_type == RelayLexer.FLOAT: + if node_type == RelayLexer.FLOAT: return float(node_text) - elif node_type == RelayLexer.BOOL_LIT: + if node_type == RelayLexer.BOOL_LIT: if node_text == "True": return True - elif node_text == "False": + if node_text == "False": return False - else: - raise ParseError("Unrecognized BOOL_LIT: `{}`".format(node_text)) + raise ParseError("Unrecognized BOOL_LIT: `{}`".format(node_text)) - else: - raise ParseError("todo: {}".format(node_text)) + raise ParseError("todo: {}".format(node_text)) def visit_list(self, ctx_list): # type: (List[ParserRuleContext]) -> List[Any] diff --git a/python/tvm/relay/adt.py b/python/tvm/relay/adt.py index bc516a8f3ddb..abf78d565d62 100644 --- a/python/tvm/relay/adt.py +++ b/python/tvm/relay/adt.py @@ -8,7 +8,7 @@ class Pattern(RelayNode): """Base type for pattern matching constructs.""" - pass + @register_relay_node class PatternWildcard(Pattern): diff --git a/python/tvm/relay/backend/compile_engine.py b/python/tvm/relay/backend/compile_engine.py index 1f7ab18677c4..c101ed43469e 100644 --- a/python/tvm/relay/backend/compile_engine.py +++ b/python/tvm/relay/backend/compile_engine.py @@ -10,7 +10,6 @@ class CachedFunc(NodeBase): """Low-level tensor function to back a relay primitive function. 
""" - pass @register_relay_node @@ -34,7 +33,6 @@ def __init__(self, source_func, target): class CCacheValue(NodeBase): """Value in the CompileEngine, including usage statistics. """ - pass def _get_cache_key(source_func, target): diff --git a/python/tvm/relay/backend/interpreter.py b/python/tvm/relay/backend/interpreter.py index 1d50a571a460..e927df22b201 100644 --- a/python/tvm/relay/backend/interpreter.py +++ b/python/tvm/relay/backend/interpreter.py @@ -49,7 +49,6 @@ def __iter__(self): @register_relay_node class Closure(Value): """A closure produced by the interpreter.""" - pass @register_relay_node @@ -96,7 +95,7 @@ def __init__(self, value): def _arg_to_ast(arg): if isinstance(arg, TensorValue): - return Constant(arg.data.copyto(_nd.cpu(0))) + return Constant(arg.data.copyto(nd.cpu(0))) elif isinstance(arg, np.ndarray): return Constant(nd.array(arg)) elif isinstance(arg, Constant): @@ -251,12 +250,15 @@ def optimize(self, expr): The optimized expression. """ # TODO: We need to move this optimization code into the optimizer/pass manager - ck_expr = ir_pass.infer_type(expr, mod=self.mod) + wrapped_expr = expr if isinstance(expr, Function) else Function([], expr) + if self.mod: + self.mod[self.mod.entry_func] = wrapped_expr + ck_expr = ir_pass.infer_type(wrapped_expr, mod=self.mod) simp_expr = ir_pass.simplify_inference(ck_expr) ck_simp = ir_pass.infer_type(simp_expr, mod=self.mod) fused_expr = ir_pass.fuse_ops(ck_simp) ck_fused = ir_pass.infer_type(fused_expr, mod=self.mod) - return ck_fused + return ck_fused if isinstance(expr, Function) else Call(ck_fused, []) def _make_executor(self, expr): def _interp_wrapper(*args, **kwargs): diff --git a/python/tvm/relay/build_module.py b/python/tvm/relay/build_module.py index 9641e0fd6fef..e0784d53ee47 100644 --- a/python/tvm/relay/build_module.py +++ b/python/tvm/relay/build_module.py @@ -9,7 +9,7 @@ from .. import nd as _nd, target as _target, autotvm from ..contrib import graph_runtime as _graph_rt from . import ir_pass -from . import expr +from . import expr as _expr from .backend import interpreter as _interpreter from .backend import graph_runtime_codegen as _graph_gen @@ -21,6 +21,8 @@ "CombineParallelConv2D": 3, "FoldScaleAxis": 3, "AlterOpLayout": 3, + "CanonicalizeOps": 3, + "EliminateCommonSubexpr": 3, } @@ -125,8 +127,8 @@ def _bind_params_by_name(func, params): arg = name_dict[k] if arg is None: raise ValueError("Multiple args in the function have name %s" % k) - bind_dict[arg] = expr.const(v) - return expr.bind(func, bind_dict) + bind_dict[arg] = _expr.const(v) + return _expr.bind(func, bind_dict) def optimize(func, target=None, params=None): @@ -161,6 +163,16 @@ def optimize(func, target=None, params=None): func = ir_pass.infer_type(func) func = ir_pass.simplify_inference(func) + if cfg.pass_enabled("EliminateCommonSubexpr"): + def fskip(expr): + if isinstance(expr, _expr.Call) and expr.op.name == 'cast' and \ + expr.attrs.dtype == 'int32': + return True + return False + + func = ir_pass.infer_type(func) + func = ir_pass.eliminate_common_subexpr(func, fskip) + if cfg.pass_enabled("CombineParallelConv2D"): func = ir_pass.infer_type(func) func = ir_pass.combine_parallel_conv2d(func) @@ -177,13 +189,15 @@ def optimize(func, target=None, params=None): func = ir_pass.forward_fold_scale_axis(func) func = ir_pass.fold_constant(func) + if cfg.pass_enabled("CanonicalizeOps"): + func = ir_pass.infer_type(func) + func = ir_pass.canonicalize_ops(func) + # FIXME(zhiics) Skip AlterOpLayout pass for heterogeneous compilation for # now. 
We probably need to pass target to this pass as well. Fix it in # a followup PR. if cfg.pass_enabled("AlterOpLayout"): if isinstance(target, _target.Target): - func = ir_pass.infer_type(func) - func = ir_pass.canonicalize_ops(func) func = ir_pass.infer_type(func) with target: func = ir_pass.alter_op_layout(func) @@ -444,7 +458,6 @@ def create_executor(kind="debug", target = _target.create(target) if kind == "debug": return _interpreter.Interpreter(mod, ctx, target) - elif kind == "graph": + if kind == "graph": return GraphExecutor(mod, ctx, target) - else: - raise RuntimeError("unknown mode {0}".format(mode)) + raise RuntimeError("unknown mode {0}".format(mode)) diff --git a/python/tvm/relay/expr.py b/python/tvm/relay/expr.py index 9257bad7dd58..bd28acc9e4b5 100644 --- a/python/tvm/relay/expr.py +++ b/python/tvm/relay/expr.py @@ -51,6 +51,9 @@ def astype(self, dtype): """ return _make.cast(self, dtype) + def __neg__(self): + return _op_make.negative(self) + def __add__(self, other): if isinstance(other, Expr): return _op_make.add(self, other) diff --git a/python/tvm/relay/frontend/caffe2.py b/python/tvm/relay/frontend/caffe2.py index 69d3c3642cfe..5533eec2134b 100755 --- a/python/tvm/relay/frontend/caffe2.py +++ b/python/tvm/relay/frontend/caffe2.py @@ -15,8 +15,7 @@ def _impl(attr): kernel = attr['kernel_shape'] if len(kernel) == 2: return prefix + '2d' + surfix - else: - raise NotImplementedError("Only 2d kernel supported.") + raise NotImplementedError("Only 2d kernel supported.") return _impl @@ -104,9 +103,8 @@ def get_converter(cls): if hasattr(cls, '_impl'): return getattr(cls, '_impl') - else: - raise NotImplementedError('{} not implemented'.format( - cls.__name__)) + raise NotImplementedError('{} not implemented'.format( + cls.__name__)) _caffe2_internal_args = [ @@ -234,11 +232,10 @@ def _get_axis_from_order_str(order): order = order if isinstance(order, str) else order.decode('UTF-8') if order == 'NCHW': return 1 - elif order == 'NHWC': + if order == 'NHWC': return 3 - else: - raise RuntimeError( - "Unsupported storage order: {} in caffe2".format(order)) + raise RuntimeError( + "Unsupported storage order: {} in caffe2".format(order)) return AttrCvt( op_name='concatenate', diff --git a/python/tvm/relay/frontend/common.py b/python/tvm/relay/frontend/common.py index 4011e29c761f..ef9f63f3cd95 100644 --- a/python/tvm/relay/frontend/common.py +++ b/python/tvm/relay/frontend/common.py @@ -10,7 +10,6 @@ class RequiredAttr(object): """Dummpy class to represent required attr""" - pass class StrAttrsDict(object): @@ -107,7 +106,7 @@ def get_int_tuple(self, key, default=RequiredAttr()): """ if key in self.attrs: tshape = self.attrs[key] - return tuple(int(x.strip()) for x in tshape.strip('()[]').split(',')) + return tuple(int(x.strip()) for x in tshape.strip('()[]').split(',') if x) if isinstance(default, RequiredAttr): raise AttributeError("Required attribute {} not found.".format(key)) return default diff --git a/python/tvm/relay/frontend/coreml.py b/python/tvm/relay/frontend/coreml.py index ba2c6dead71e..a4f9b39b70e2 100644 --- a/python/tvm/relay/frontend/coreml.py +++ b/python/tvm/relay/frontend/coreml.py @@ -100,37 +100,37 @@ def _ActivationParams(op, inexpr, etab): alpha = _expr.const(par.alpha, dtype='float32') beta = _expr.const(par.beta, dtype='float32') return _op.add(_op.multiply(inexpr, alpha), beta) - elif whichActivation == 'ReLU': + if whichActivation == 'ReLU': return _op.nn.relu(inexpr) - elif whichActivation == 'leakyReLU': + if whichActivation == 'leakyReLU': 
         return _op.nn.leaky_relu(inexpr, alpha=_expr.const(par.alpha, dtype='float32'))
     elif whichActivation == 'thresholdedReLU':
         alpha_tensor = _op.full_like(inexpr, fill_value=_expr.const(par.alpha, dtype='float32'))
         return _op.multiply(inexpr, _op.greater(inexpr, alpha_tensor).astype('float32'))
-    elif whichActivation == 'PReLU':
+    if whichActivation == 'PReLU':
         return _op.nn.prelu(inexpr, alpha=_expr.const(par.alpha, dtype='float32'))
-    elif whichActivation == 'tanh':
+    if whichActivation == 'tanh':
         return _op.tanh(inexpr)
-    elif whichActivation == 'scaledTanh':
+    if whichActivation == 'scaledTanh':
         alpha = _expr.const(par.alpha, dtype='float32')
         beta = _expr.const(par.beta, dtype='float32')
         return _op.multiply(_op.tanh(_op.multiply(inexpr, beta)), alpha)
-    elif whichActivation == 'sigmoid':
+    if whichActivation == 'sigmoid':
         return _op.sigmoid(inexpr)
-    elif whichActivation == 'sigmoidHard':
+    if whichActivation == 'sigmoidHard':
         alpha = _expr.const(par.alpha, dtype='float32')
         beta = _expr.const(par.beta, dtype='float32')
         transformX = (alpha * inexpr) + beta
         return _op.clip(transformX, a_min=0., a_max=1.)
-    elif whichActivation == 'ELU':
+    if whichActivation == 'ELU':
         return _op.multiply(_op.add(_op.exp(inexpr), _expr.const(-1, dtype='float32')),
                             _expr.const(par.alpha, dtype='float32'))
-    elif whichActivation == 'softsign':
+    if whichActivation == 'softsign':
         return inexpr / (_expr.const(1, dtype='float32') + (
             _op.nn.relu(inexpr) + _op.nn.relu(_op.negative(inexpr))))
-    elif whichActivation == 'softplus':
+    if whichActivation == 'softplus':
         return _op.log(_op.add(_op.exp(inexpr), _expr.const(1, dtype='float32')))
-    elif whichActivation == 'parametricSoftplus':
+    if whichActivation == 'parametricSoftplus':
         alpha = list(par.alpha.floatValue)
         beta = list(par.beta.floatValue)
         if len(alpha) == 1:
@@ -142,8 +142,7 @@ def _ActivationParams(op, inexpr, etab):
         alpha_expr = etab.new_const(alpha)
         beta_expr = etab.new_const(beta)
         return _op.multiply(_op.log(_op.add(_op.exp(inexpr), beta_expr)), alpha_expr)
-    else:
-        raise NotImplementedError('%s not implemented' % whichActivation)
+    raise NotImplementedError('%s not implemented' % whichActivation)
 
 
 def _ScaleLayerParams(op, inexpr, etab):
@@ -163,10 +162,9 @@ def _PoolingLayerParams(op, inexpr, etab):
     if op.globalPooling:
         if op.type == 0:
             return _op.nn.global_max_pool2d(inexpr)
-        elif op.type == 1:
+        if op.type == 1:
             return _op.nn.global_avg_pool2d(inexpr)
-        else:
-            raise NotImplementedError("Only max and average pooling implemented")
+        raise NotImplementedError("Only max and average pooling implemented")
 
     else:
         params = {'pool_size':list(op.kernelSize),
@@ -196,10 +194,9 @@ def _PoolingLayerParams(op, inexpr, etab):
 
         if op.type == 0:
             return _op.nn.max_pool2d(inexpr, **params)
-        elif op.type == 1:
+        if op.type == 1:
             return _op.nn.avg_pool2d(inexpr, **params)
-        else:
-            raise NotImplementedError("Only max and average pooling implemented")
+        raise NotImplementedError("Only max and average pooling implemented")
 
 
 def _SoftmaxLayerParams(op, inexpr, etab):
diff --git a/python/tvm/relay/frontend/keras.py b/python/tvm/relay/frontend/keras.py
index 8a2d3d58d01c..2be03c80c20b 100644
--- a/python/tvm/relay/frontend/keras.py
+++ b/python/tvm/relay/frontend/keras.py
@@ -60,21 +60,21 @@ def _convert_activation(inexpr, keras_layer, _):
         alpha = _expr.const(alpha, dtype='float32')
         beta = _expr.const(beta, dtype='float32')
         return _op.add(_op.multiply(inexpr, alpha), beta)
-    elif act_type == 'softmax':
+    if act_type == 'softmax':
        return _op.nn.softmax(inexpr, axis=1)
-    elif 
act_type == 'sigmoid': + if act_type == 'sigmoid': return _op.sigmoid(inexpr) - elif act_type == 'tanh': + if act_type == 'tanh': return _op.tanh(inexpr) - elif act_type == 'relu': + if act_type == 'relu': return _op.nn.relu(inexpr) - elif act_type == 'softplus': + if act_type == 'softplus': return _op.log(_op.add(_op.exp(inexpr), _expr.const(1., dtype='float32'))) - elif act_type == 'elu': + if act_type == 'elu': alpha = keras_layer.alpha if hasattr(keras_layer, 'alpha') else 1. alpha = _expr.const(alpha, dtype='float32') return _get_elu(inexpr, alpha) - elif act_type == 'selu': + if act_type == 'selu': # Alpha, Gamma values obtained from https://arxiv.org/abs/1706.02515 alpha = keras_layer.alpha if hasattr(keras_layer, 'alpha') \ else 1.6732632423543772848170429916717 @@ -83,15 +83,15 @@ def _convert_activation(inexpr, keras_layer, _): alpha = _expr.const(alpha, dtype='float32') gamma = _expr.const(gamma, dtype='float32') return gamma * _get_elu(inexpr, alpha) - elif act_type == 'relu6': + if act_type == 'relu6': return _op.clip(inexpr, a_min=0., a_max=6.) - elif act_type == 'softsign': + if act_type == 'softsign': return inexpr / (_expr.const(1., dtype='float32') + _op.abs(inexpr)) - elif act_type == 'hard_sigmoid': + if act_type == 'hard_sigmoid': x = (_expr.const(0.2, dtype='float32') * inexpr) + _expr.const(0.5, dtype='float32') return _op.clip(x, a_min=0., a_max=1.) - else: - raise TypeError("Unsupported activation type : {}".format(act_type)) + + raise TypeError("Unsupported activation type : {}".format(act_type)) def _convert_advanced_activation(inexpr, keras_layer, etab): @@ -100,25 +100,25 @@ def _convert_advanced_activation(inexpr, keras_layer, etab): if keras_layer.max_value: return _op.clip(inexpr, a_min=0., a_max=float(keras_layer.max_value)) return _op.nn.relu(inexpr) - elif act_type == 'LeakyReLU': + if act_type == 'LeakyReLU': return _op.nn.leaky_relu(inexpr, alpha=float(keras_layer.alpha)) - elif act_type == 'ELU': + if act_type == 'ELU': alpha = keras_layer.alpha if hasattr(keras_layer, 'alpha') else 1. alpha = _expr.const(alpha, dtype='float32') return _get_elu(inexpr, alpha) - elif act_type == 'PReLU': + if act_type == 'PReLU': assert hasattr(keras_layer, 'alpha'), "alpha required for PReLU." _check_data_format(keras_layer) size = len(keras_layer.alpha.shape) alpha = etab.new_const(keras_layer.get_weights()[0] \ .transpose(np.roll(range(size), 1))) return _op.negative(alpha) * _op.nn.relu(_op.negative(inexpr)) + _op.nn.relu(inexpr) - elif act_type == 'ThresholdedReLU': + if act_type == 'ThresholdedReLU': theta = keras_layer.theta if hasattr(keras_layer, 'theta') else 1. 
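+        # ThresholdedReLU: keep x where x > theta, zero elsewhere;
+        # the boolean mask is cast to float32 and multiplied in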
return _op.multiply(inexpr, _op.greater(inexpr, \ _expr.const(theta, dtype='float32')).astype('float32')) - else: - raise TypeError("Unsupported advanced activation type : {}".format(act_type)) + + raise TypeError("Unsupported advanced activation type : {}".format(act_type)) def _convert_merge(inexpr, keras_layer, _): @@ -297,31 +297,29 @@ def _convert_pooling(inexpr, keras_layer, etab): # global pool in keras = global pool + flatten in nnvm/relay if pool_type == 'GlobalMaxPooling2D': return _convert_flatten(_op.nn.global_max_pool2d(inexpr), keras_layer, etab) - elif pool_type == 'GlobalAveragePooling2D': + if pool_type == 'GlobalAveragePooling2D': return _convert_flatten(_op.nn.global_avg_pool2d(inexpr), keras_layer, etab) + pool_h, pool_w = keras_layer.pool_size + stride_h, stride_w = keras_layer.strides + params = {'pool_size': [pool_h, pool_w], + 'strides': [stride_h, stride_w], + 'padding': [0, 0]} + if keras_layer.padding == 'valid': + pass + elif keras_layer.padding == 'same': + in_h = keras_layer.input_shape[1] + in_w = keras_layer.input_shape[2] + pad_t, pad_b = _get_pad_pair(in_h, pool_h, stride_h) + pad_l, pad_r = _get_pad_pair(in_w, pool_w, stride_w) + params['padding'] = [pad_t, pad_l, pad_b, pad_r] else: - pool_h, pool_w = keras_layer.pool_size - stride_h, stride_w = keras_layer.strides - params = {'pool_size': [pool_h, pool_w], - 'strides': [stride_h, stride_w], - 'padding': [0, 0]} - if keras_layer.padding == 'valid': - pass - elif keras_layer.padding == 'same': - in_h = keras_layer.input_shape[1] - in_w = keras_layer.input_shape[2] - pad_t, pad_b = _get_pad_pair(in_h, pool_h, stride_h) - pad_l, pad_r = _get_pad_pair(in_w, pool_w, stride_w) - params['padding'] = [pad_t, pad_l, pad_b, pad_r] - else: - raise TypeError("Unsupported padding type : {}".format(keras_layer.padding)) - if pool_type == 'MaxPooling2D': - return _op.nn.max_pool2d(inexpr, **params) - elif pool_type == 'AveragePooling2D': - params['count_include_pad'] = False - return _op.nn.avg_pool2d(inexpr, **params) - else: - raise TypeError("Unsupported pooling type : {}".format(keras_layer)) + raise TypeError("Unsupported padding type : {}".format(keras_layer.padding)) + if pool_type == 'MaxPooling2D': + return _op.nn.max_pool2d(inexpr, **params) + if pool_type == 'AveragePooling2D': + params['count_include_pad'] = False + return _op.nn.avg_pool2d(inexpr, **params) + raise TypeError("Unsupported pooling type : {}".format(keras_layer)) def _convert_upsample(inexpr, keras_layer, _): diff --git a/python/tvm/relay/frontend/mxnet.py b/python/tvm/relay/frontend/mxnet.py index f8b51c413193..93bd8efc6752 100644 --- a/python/tvm/relay/frontend/mxnet.py +++ b/python/tvm/relay/frontend/mxnet.py @@ -39,7 +39,7 @@ def _mx_fully_connected(inputs, attrs): def _get_channel_axis(layout, op_name): if layout == "NCHW": return 1 - elif layout == "NHWC": + if layout == "NHWC": return 3 raise RuntimeError("layout: {} is not supported in {}".format(layout, op_name)) @@ -49,11 +49,11 @@ def _mx_activations(inputs, attrs): assert len(inputs) == 1 if act_type == "sigmoid": return _op.sigmoid(inputs[0]) - elif act_type == "tanh": + if act_type == "tanh": return _op.tanh(inputs[0]) - elif act_type == "relu": + if act_type == "relu": return _op.nn.relu(inputs[0]) - elif act_type == "softrelu": + if act_type == "softrelu": def _stable_softrelu(x): # log(1 + exp(-abs(x))) + relu(x) one = _expr.const(1, dtype="float32") @@ -64,6 +64,13 @@ def _stable_softrelu(x): raise RuntimeError("Do not support act_type: {}".format(act_type)) +def 
_mx_compare(new_op, wrapper): + def impl(inputs, attrs): + dtype = ir_pass.infer_type(inputs[0]).checked_type.dtype + return wrapper(new_op)(inputs, attrs).astype(dtype) + return impl + + def _mx_conv2d(inputs, attrs): kernel_size = attrs.get_int_tuple("kernel") if len(kernel_size) != 2: @@ -147,7 +154,7 @@ def _pool2d(new_op, is_avg): if global_pool: return _op.nn.global_max_pool2d(inputs[0]) return _pool2d(_op.nn.max_pool2d, False) - elif pool_type == "avg": + if pool_type == "avg": if global_pool: return _op.nn.global_avg_pool2d(inputs[0]) return _pool2d(_op.nn.avg_pool2d, True) @@ -159,6 +166,10 @@ def _mx_dropout(inputs, attrs): return _op.nn.dropout(inputs[0], rate=rate) +def _mx_BlockGrad(inputs, attrs): #pylint: disable=unused-argument + return inputs + + def _mx_batch_norm(inputs, attrs): if attrs.get_bool("output_mean_var", False): raise RuntimeError("batch_norm do not support output_mean_var") @@ -172,6 +183,59 @@ def _mx_batch_norm(inputs, attrs): return _op.nn.batch_norm(*inputs, **new_attrs) +def _mx_slice(inputs, attrs): + new_attrs = {} + begin = attrs.get_int_tuple('begin', None) + end = attrs.get_int_tuple('end', None) + stride = attrs.get_int_tuple('step', None) + if begin is None or end is None: + raise RuntimeError("begin and end are required parameters.") + if None in begin or None in end: + raise RuntimeError("None in begin or end is not supported yet.") + new_attrs = {'begin': begin, 'end': end} + if stride is not None: + new_attrs['strides'] = stride + return _op.strided_slice(inputs[0], **new_attrs) + + +def _mx_slice_like(inputs, attrs): + assert len(inputs) == 2 + new_attrs = {} + new_attrs["axes"] = attrs.get_int_tuple("axes", None) + return _op.slice_like(*inputs, **new_attrs) + + +def _mx_slice_axis(inputs, attrs): + assert len(inputs) == 1 + shape = ir_pass.infer_type(inputs[0]).checked_type.shape + axis = attrs.get_int("axis") + ax_beg = attrs.get_int("begin") + ax_end = attrs.get_str("end") + if axis < 0: + axis += len(shape) + assert axis >= 0 and axis < len(shape) + if ax_end == "None": + ax_end = int(shape[axis]) + else: + ax_end = int(ax_end) + if ax_beg < 0: + ax_beg += int(shape[axis]) + if ax_end < 0: + ax_end += int(shape[axis]) + assert ax_beg >= 0 and ax_beg < int(shape[axis]) + assert ax_end > ax_beg and ax_end <= int(shape[axis]) + begin = [] + end = [] + for i, dim in enumerate(shape): + if i != axis: + begin.append(0) + end.append(dim) + else: + begin.append(ax_beg) + end.append(ax_end) + return _op.strided_slice(inputs[0], begin, end) + + def _mx_split(inputs, attrs): axis = attrs.get_int("axis", 1) new_attrs = {} @@ -200,6 +264,11 @@ def _mx_concat(inputs, attrs): return _op.concatenate(tuple(inputs), axis=axis) +def _mx_stack(inputs, attrs): + axis = attrs.get_int("axis", 0) + return _op.stack(tuple(inputs), axis=axis) + + def _mx_expand_dims(inputs, attrs): axis = attrs.get_int("axis") return _op.expand_dims(inputs[0], axis=axis) @@ -209,10 +278,10 @@ def _mx_leaky_relu(inputs, attrs): act_type = attrs.get_str("act_type") if act_type == "leaky": return _op.nn.leaky_relu(inputs[0], alpha=attrs.get_float("slope", 0.25)) - elif act_type == "prelu": + if act_type == "prelu": assert len(inputs) == 2 return _op.nn.prelu(*inputs) - elif act_type == "elu": + if act_type == "elu": # -slope * relu(1-exp(x)) + relu(x) slope = attrs.get_float("slope", 0.25) one = _expr.const(1, dtype="float32") @@ -220,7 +289,7 @@ def _mx_leaky_relu(inputs, attrs): mslope = _op.nn.relu(_op.subtract(one, _op.exp(x))) mslope = _op.multiply(mslope, _expr.const(-slope, 
dtype="float32")) return _op.add(mslope, _op.nn.relu(x)) - elif act_type == "rrelu": + if act_type == "rrelu": # NOTE this is only converted for inference. lower_bound = attrs.get_float("lower_bound") upper_bound = attrs.get_float("upper_bound") @@ -229,6 +298,51 @@ def _mx_leaky_relu(inputs, attrs): raise RuntimeError("act_type: {} is not supported".format(act_type)) +def _mx_make_power(power): + def _impl(inputs, _): # Note: no attrs + assert len(inputs) == 1 + scalar = _expr.const(power, dtype=None) + # Note: int maps to "int32", float maps to "float32" + return _op.power(inputs[0], scalar) + return _impl + + +def _mx_make_exponent(base): + # exp(b, x) = e^b * e^x + def _impl(inputs, _): # Note: no attrs + assert len(inputs) == 1 + scalar = _op.exp(_expr.const(base, dtype="float32")) + return _op.multiply(inputs[0], scalar) + return _impl + + +def _mx_make_logarithm(base): + # log(b, x) = log(x) / log(b) + def _impl(inputs, _): # Note: no attrs + assert len(inputs) == 1 + scalar = _op.log(_expr.const(base, dtype="float32")) + return _op.divide(inputs[0], scalar) + return _impl + + +def _mx_expm1(): + # exp_minus_1 x = exp(x) - 1 + def _impl(inputs, _): # Note: no attrs + assert len(inputs) == 1 + one = _expr.const(1, dtype="float32") + return _op.log(_op.subtract(inputs[0], one)) + return _impl + + +def _mx_log1p(): + # 1_plus_log x = log(x + 1) + def _impl(inputs, _): # Note: no attrs + assert len(inputs) == 1 + one = _expr.const(1, dtype="float32") + return _op.log(_op.add(inputs[0], one)) + return _impl + + def _mx_lrn(inputs, attrs): new_attrs = {} new_attrs["alpha"] = attrs.get_float("alpha", 0.0001) @@ -259,13 +373,53 @@ def _mx_multibox_detection(inputs, attrs): 0.2, 0.2)) new_attrs1 = {} - new_attrs1["overlap_threshold"] = attrs.get_float("nms_threshold", 0.5) + new_attrs1["return_indices"] = False + new_attrs1["iou_threshold"] = attrs.get_float("nms_threshold", 0.5) new_attrs1["force_suppress"] = attrs.get_bool("force_suppress", False) - new_attrs1["topk"] = attrs.get_int("nms_topk", -1) + new_attrs1["top_k"] = attrs.get_int("nms_topk", -1) ret = _op.vision.multibox_transform_loc(inputs[0], inputs[1], inputs[2], **new_attrs0) - return _op.vision.nms(ret[0], ret[1], **new_attrs1) + return _op.vision.non_max_suppression(ret[0], ret[1], **new_attrs1) + + +def _mx_batch_dot(inputs, attrs): + assert len(inputs) == 2 + a, b = inputs + transpose_a = attrs.get_bool("transpose_a", False) + transpose_b = attrs.get_bool("transpose_b", False) + if transpose_a is True: + raise RuntimeError("batch_dot: only support transpose_a=False") + if transpose_b is False: + b = _op.transpose(b, axes=[0, 2, 1]) + return _op.batch_matmul(a, b) + + +def _mx_arange(inputs, attrs): + assert len(inputs) == 0 + if attrs.get_int("repeat", 1) != 1: + raise RuntimeError("arange doesn't support repeat") + new_attrs = {} + new_attrs["start"] = attrs.get_float("start", 0) + new_attrs["stop"] = attrs.get_float("stop") + new_attrs["step"] = attrs.get_float("step", 1) + new_attrs["dtype"] = attrs.get_str("dtype", "float32") + return _op.arange(**new_attrs) + + +def _mx_repeat(inputs, attrs): + assert len(inputs) == 1 + new_attrs = {} + new_attrs["repeats"] = attrs.get_int("repeats") + new_attrs["axis"] = attrs.get_int("axis", 0) + return _op.repeat(inputs[0], **new_attrs) + + +def _mx_tile(inputs, attrs): + assert len(inputs) == 1 + new_attrs = {} + new_attrs["reps"] = attrs.get_int_tuple("reps") + return _op.tile(inputs[0], **new_attrs) def _mx_roi_align(inputs, attrs): @@ -277,6 +431,63 @@ def _mx_roi_align(inputs, 
attrs): return _op.vision.roi_align(inputs[0], inputs[1], **new_attrs) +def _mx_proposal(inputs, attrs): + new_attrs = {} + new_attrs["scales"] = attrs.get_float_tuple("scales", (4.0, 8.0, 16.0, 32.0)) + new_attrs["ratios"] = attrs.get_float_tuple("ratios", (0.5, 1.0, 2.0)) + new_attrs["feature_stride"] = attrs.get_int("feature_stride", 16) + new_attrs["threshold"] = attrs.get_float("threshold", 0.7) + new_attrs["rpn_pre_nms_top_n"] = attrs.get_int("rpn_pre_nms_top_n", 6000) + new_attrs["rpn_post_nms_top_n"] = attrs.get_int("rpn_post_nms_top_n", 300) + new_attrs["rpn_min_size"] = attrs.get_int("rpn_min_size", 16) + new_attrs["iou_loss"] = attrs.get_bool("iou_loss", False) + assert not attrs.get_bool("output_score", False), "proposal doesn't support output score" + return _op.vision.proposal(inputs[0], inputs[1], inputs[2], **new_attrs) + + +def _mx_box_nms(inputs, attrs): + force_suppress = attrs.get_bool("force_suppress", False) + iou_thresh = attrs.get_float('overlap_thresh', 0.5) + top_k = attrs.get_int('topk', -1) + valid_thresh = attrs.get_float('valid_thresh', 0) + coord_start = attrs.get_int('coord_start', 2) + score_index = attrs.get_int('score_index', 1) + id_index = attrs.get_int('id_index', -1) + in_format = attrs.get_str('in_format', 'corner') + out_format = attrs.get_str('out_format', 'corner') + if coord_start != 2: + raise RuntimeError('coord_start %s is not supported.' % coord_start) + if score_index != 1: + raise RuntimeError('score_index %s is not supported.' % score_index) + if id_index != -1 and int(id_index) != 0: + raise RuntimeError('id_index %s is not supported.' % id_index) + if in_format != 'corner': + raise RuntimeError('in_format %s is not supported.' % in_format) + if out_format != 'corner': + raise RuntimeError('out_format %s is not supported.' % out_format) + + ret = _op.vision.get_valid_counts(inputs[0], score_threshold=valid_thresh) + nms_out = _op.vision.non_max_suppression(ret[1], + ret[0], + iou_threshold=iou_thresh, + force_suppress=force_suppress, + top_k=top_k, + id_index=id_index, + return_indices=False, + invalid_to_bottom=True) + return nms_out + + +def _mx_l2_normalize(inputs, attrs): + new_attrs = {} + mode = attrs.get_str('mode', 'instance') + if mode != 'channel': + raise RuntimeError('mode %s is not supported.' 
% mode) + new_attrs['eps'] = attrs.get_float('eps', 1e-10) + new_attrs['axis'] = [1] + return _op.nn.l2_normalize(inputs[0], **new_attrs) + + # Note: due to attribute conversion constraint # ops in the identity set must be attribute free _identity_list = [ @@ -284,42 +495,73 @@ def _mx_roi_align(inputs, attrs): "exp", "sigmoid", "tanh", - "exp", "negative", "reshape_like", - "slice_like", "zeros_like", "ones_like", + "where", ] _convert_map = { - "_copy" : _rename(_op.copy), - "relu" : _rename(_op.nn.relu), - "broadcast_add" : _rename(_op.add), - "broadcast_sub" : _rename(_op.subtract), - "broadcast_mul" : _rename(_op.multiply), - "broadcast_div" : _rename(_op.divide), - "elemwise_add" : _rename(_op.add), - "elemwise_sub" : _rename(_op.subtract), - "elemwise_mul" : _rename(_op.multiply), - "elemwise_div" : _rename(_op.divide), - "flatten" : _rename(_op.nn.batch_flatten), - "Flatten" : _rename(_op.nn.batch_flatten), - "_plus_scalar" : _binop_scalar(_op.add), - "__add_scalar__": _binop_scalar(_op.add), - "__sub_scalar__": _binop_scalar(_op.subtract), - "_minus_scalar" : _binop_scalar(_op.subtract), - "__mul_scalar__": _binop_scalar(_op.multiply), - "_mul_scalar" : _binop_scalar(_op.multiply), - "__div_scalar__": _binop_scalar(_op.divide), - "_div_scalar" : _binop_scalar(_op.divide), - "__pow_scalar__": _binop_scalar(_op.power), - "_rminus_scalar": _rbinop_scalar(_op.subtract), - "__rsub_scalar__": _rbinop_scalar(_op.subtract), - "_rdiv_scalar" : _rbinop_scalar(_op.divide), - "__rdiv_scalar__" : _rbinop_scalar(_op.divide), - "__rpow_scalar__": _rbinop_scalar(_op.power), + "_copy" : _rename(_op.copy), + "relu" : _rename(_op.nn.relu), + "broadcast_add" : _rename(_op.add), + "broadcast_sub" : _rename(_op.subtract), + "broadcast_mul" : _rename(_op.multiply), + "broadcast_div" : _rename(_op.divide), + "broadcast_mod" : _rename(_op.mod), + "broadcast_maximum" : _rename(_op.maximum), + "broadcast_minimum" : _rename(_op.minimum), + "broadcast_equal" : _mx_compare(_op.equal, _rename), + "broadcast_not_equal" : _mx_compare(_op.not_equal, _rename), + "broadcast_greater" : _mx_compare(_op.greater, _rename), + "broadcast_greater_equal": _mx_compare(_op.greater_equal, _rename), + "broadcast_lesser" : _mx_compare(_op.less, _rename), + "broadcast_lesser_equal" : _mx_compare(_op.less_equal, _rename), + "elemwise_add" : _rename(_op.add), + "elemwise_sub" : _rename(_op.subtract), + "elemwise_mul" : _rename(_op.multiply), + "elemwise_div" : _rename(_op.divide), + "_maximum" : _rename(_op.maximum), + "_minimum" : _rename(_op.minimum), + "flatten" : _rename(_op.nn.batch_flatten), + "Flatten" : _rename(_op.nn.batch_flatten), + # scalar power + "square" : _mx_make_power(2), + "sqrt" : _mx_make_power(1/2), + "rsqrt" : _mx_make_power(-1/2), + "cbrt" : _mx_make_power(1/3), + "rcbrt" : _mx_make_power(-1/3), + "__pow_scalar__" : _binop_scalar(_op.power), + "_power_scalar" : _binop_scalar(_op.power), + "__rsub_scalar__" : _rbinop_scalar(_op.subtract), + "_rminus_scalar" : _rbinop_scalar(_op.subtract), + "__rdiv_scalar__" : _rbinop_scalar(_op.divide), + "_rdiv_scalar" : _rbinop_scalar(_op.divide), + "__rpow_scalar__" : _rbinop_scalar(_op.power), + # scalar op + "__add_scalar__" : _binop_scalar(_op.add), + "_plus_scalar" : _binop_scalar(_op.add), + "__sub_scalar__" : _binop_scalar(_op.subtract), + "_minus_scalar" : _binop_scalar(_op.subtract), + "__mul_scalar__" : _binop_scalar(_op.multiply), + "_mul_scalar" : _binop_scalar(_op.multiply), + "__div_scalar__" : _binop_scalar(_op.divide), + "_div_scalar" : 
_binop_scalar(_op.divide), + "log2" : _mx_make_logarithm(2), + "log10" : _mx_make_logarithm(10), + "log1p" : _mx_log1p, + "expm1" : _mx_expm1, + "_equal_scalar" : _mx_compare(_op.equal, _binop_scalar), + "_not_equal_scalar" : _mx_compare(_op.not_equal, _binop_scalar), + "_greater_scalar" : _mx_compare(_op.greater, _binop_scalar), + "_greater_equal_scalar" : _mx_compare(_op.greater_equal, _binop_scalar), + "_lesser_scalar" : _mx_compare(_op.less, _binop_scalar), + "_lesser_equal_scalar" : _mx_compare(_op.less_equal, _binop_scalar), + "_maximum_scalar" : _binop_scalar(_op.maximum), + "_minimum_scalar" : _binop_scalar(_op.minimum), # reduction ops + "mean" : _reduce(_op.mean), "max" : _reduce(_op.max), "min" : _reduce(_op.min), "sum" : _reduce(_op.sum), @@ -355,25 +597,37 @@ def _mx_roi_align(inputs, attrs): "BatchNorm" : _mx_batch_norm, "BatchNorm_v1" : _mx_batch_norm, "LRN" : _mx_lrn, + "L2Normalization" : _mx_l2_normalize, + "slice" : _mx_slice, + "slice_like" : _mx_slice_like, + "slice_axis" : _mx_slice_axis, "SliceChannel" : _mx_split, "split" : _mx_split, "expand_dims" : _mx_expand_dims, "Concat" : _mx_concat, "concat" : _mx_concat, + "stack" : _mx_stack, + "batch_dot" : _mx_batch_dot, "LeakyReLU" : _mx_leaky_relu, + "_arange" : _mx_arange, + "repeat" : _mx_repeat, + "tile" : _mx_tile, + "BlockGrad" : _mx_BlockGrad, "SoftmaxOutput" : _mx_softmax_output, "SoftmaxActivation" : _mx_softmax_activation, # vision "_contrib_MultiBoxPrior" : _mx_multibox_prior, "_contrib_MultiBoxDetection" : _mx_multibox_detection, "_contrib_ROIAlign" : _mx_roi_align, + "_contrib_Proposal" : _mx_proposal, + "_contrib_MultiProposal" : _mx_proposal, + "_contrib_box_nms" : _mx_box_nms, # List of missing operators that are present in NNVMv1 # TODO(tvm-tvm): support all operators. # # "broadcast_to", # "gather_nd", # "Crop" : _crop_like, - } # set identity list @@ -510,6 +764,8 @@ def from_mxnet(symbol, params[k] = _nd.array(v.data().asnumpy()) data = mx.sym.Variable("data") sym = symbol(data) + if isinstance(sym, (list, tuple)): + sym = mx.sym.Group(sym) shape, dtype = _update_shape_dtype(shape, dtype, params) sym = _from_mxnet_impl(sym, shape, dtype) elif isinstance(symbol, mx.gluon.Block): diff --git a/python/tvm/relay/frontend/nnvm_common.py b/python/tvm/relay/frontend/nnvm_common.py index 3838c3d4aa3b..7fd6f409cfd3 100644 --- a/python/tvm/relay/frontend/nnvm_common.py +++ b/python/tvm/relay/frontend/nnvm_common.py @@ -41,7 +41,7 @@ def _impl(inputs, attrs): def _softmax_op(new_op): """softmax/log_softmax""" - def _impl(inputs, attrs): + def _impl(inputs, attrs, _dtype='float32'): assert len(inputs) == 1 axis = attrs.get_int("axis", -1) return new_op(inputs[0], axis=axis) @@ -50,13 +50,14 @@ def _impl(inputs, attrs): def _reduce(new_op): """Reduction ops like sum/min/max""" - def _impl(inputs, attrs): + def _impl(inputs, attrs, _dtype='float32'): assert len(inputs) == 1 axis = attrs.get_int_tuple("axis", []) keepdims = attrs.get_bool("keepdims", False) + exclude = attrs.get_bool("exclude", False) # use None for reduce over all axis. 
axis = None if len(axis) == 0 else axis - return new_op(inputs[0], axis=axis, keepdims=keepdims) + return new_op(inputs[0], axis=axis, keepdims=keepdims, exclude=exclude) return _impl @@ -97,7 +98,7 @@ def _upsampling(inputs, attrs): return _op.nn.upsampling(inputs[0], scale=scale) -def _elemwise_sum(inputs, _): +def _elemwise_sum(inputs, _, _dtype='float32'): assert len(inputs) > 0 res = inputs[0] for x in inputs[1:]: @@ -106,20 +107,28 @@ def _elemwise_sum(inputs, _): def _binop_scalar(new_op): - def _impl(inputs, attrs): + def _impl(inputs, attrs, odtype='float32'): assert len(inputs) == 1 scalar = attrs.get_float("scalar") # Note: binary scalar only works for float op for now - scalar = _expr.const(scalar, dtype="float32") + scalar = _expr.const(scalar, dtype=odtype) return new_op(inputs[0], scalar) return _impl def _rbinop_scalar(new_op): - def _impl(inputs, attrs): + def _impl(inputs, attrs, odtype='float32'): assert len(inputs) == 1 scalar = attrs.get_float("scalar") # Note: binary scalar only works for float op for now - scalar = _expr.const(scalar, dtype="float32") + scalar = _expr.const(scalar, dtype=odtype) return new_op(scalar, inputs[0]) return _impl + + +def _compare(new_op): + """Compare ops like greater/less""" + def _impl(inputs, _, odtype='float32'): + assert len(inputs) == 2 + return new_op(inputs[0], inputs[1]).astype(odtype) + return _impl diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py index effe50e06981..d322da31fc19 100644 --- a/python/tvm/relay/frontend/onnx.py +++ b/python/tvm/relay/frontend/onnx.py @@ -18,8 +18,7 @@ def _impl(attr): kernel = attr['kernel_shape'] if len(kernel) == 2: return prefix + '2d' + surfix - else: - raise NotImplementedError("Only 2d kernel supported.") + raise NotImplementedError("Only 2d kernel supported.") return _impl diff --git a/python/tvm/relay/frontend/tensorflow.py b/python/tvm/relay/frontend/tensorflow.py index 82b4c5b9ca37..8d53b003da1e 100644 --- a/python/tvm/relay/frontend/tensorflow.py +++ b/python/tvm/relay/frontend/tensorflow.py @@ -4,6 +4,7 @@ from __future__ import print_function import logging +import warnings # Numpy support import numpy as np @@ -175,8 +176,7 @@ def _impl(attr): kernel = attr['kernel_shape'] if len(kernel) == 2: return prefix + '2d' + surfix - else: - raise NotImplementedError("Only 2d kernel supported.") + raise NotImplementedError("Only 2d kernel supported.") return _impl def _dimension_constraint(): @@ -411,7 +411,7 @@ def _impl(inputs, attr, params): def _decode_image(): def _impl(inputs, attr, params): # Image decode wrapper: Expecting user to feed decoded input to next layer drop this layer. - print("DecodeJpeg: It's a pass through, please handle preprocessing before input") + warnings.warn("DecodeJpeg: It's a pass through, please handle preprocessing before input") return inputs[0] return _impl @@ -522,8 +522,7 @@ def _impl(inputs, attr, params): op_name="reshape", extras={'newshape':tuple(params_new.asnumpy().flatten())}, ignores=['Tshape'])(inputs, attr) - else: - raise RuntimeError("Reshape with dynamic shape input not supported yet.") + raise RuntimeError("Reshape with dynamic shape input not supported yet.") return _impl def _bias_add(): @@ -851,6 +850,11 @@ def _impl(inputs, attr, params): transforms={'axis': ('axis', 1)})([inputs[0]], attr) return _impl +def _logical(name): + def _impl(inputs, attr, params): + return AttrCvt(op_name=name)(inputs, attr) + return _impl + # compatible operators that do NOT require any conversion. 
_identity_list = [] @@ -911,6 +915,9 @@ def _impl(inputs, attr, params): 'Transpose' : _transpose(), 'Tanh' : AttrCvt('tanh'), 'Mean' : _mean(), + 'LogicalAnd' : _logical('logical_and'), + 'LogicalOr' : _logical('logical_or'), + 'LogicalNot' : _logical('logical_not'), 'Less' : _broadcast('less'), 'Greater' : _broadcast('greater'), 'LessEqual' : _broadcast('less_equal'), @@ -1172,6 +1179,7 @@ class GraphProto(object): def __init__(self): self._nodes = {} self._params = {} + self._input_shapes = {} self._output_shapes = {} self._num_param = 0 self._num_rnn_layer = False @@ -1223,36 +1231,55 @@ def from_tensorflow(self, graph, layout="NHWC", shape=None, outputs=None): raise NotImplementedError( \ "The following operators are not implemented: {}".format(missing_operators)) + for node in graph.node: + if node.op == 'Placeholder': + if shape and node.name in shape: + self._input_shapes[node.name] = list(shape[node.name]) + continue + self._input_shapes[node.name] = \ + tensor_util.TensorShapeProtoToList(node.attr['shape'].shape) + for idx, dim in enumerate(self._input_shapes[node.name]): + if dim < 0: + self._input_shapes[node.name][idx] = 1 + warnings.warn("Use 1 instead of -1 in shape of operator %s." + % node.name) + + # Ignore user's input shape for Non placeholder + elif node.op == 'Const': + tensor_value = node.attr['value'].tensor + self._input_shapes[node.name] = \ + tensor_util.TensorShapeProtoToList(tensor_value.tensor_shape) + if shape and node.name in shape: + warnings.warn("Ignore the passed shape. Shape in graphdef " + "will be used for operator %s." % node.name) + # Parse the nodes to re-create TF graph using Relay operators. for node in graph.node: - # Tensorflow doesn't have seperate list for params extraction. + # Tensorflow doesn't have separate list for params extraction. # Operator name 'Const' is treated as a parameter to build params dict. input_shapes = {} attr = self._parse_attr(node.attr) - #Variable converted to Const will not have only value attr + # Variable converted to Const will not have only value attr if 'value' in attr and node.op == 'Const': - tensor_value = attr['value'] - self._output_shapes[node.name] = \ - [tensor_util.TensorShapeProtoToList( \ - tensor_value.tensor_shape)] + self._output_shapes[node.name] = [self._input_shapes[node.name]] + elif shape and node.name in shape: + # Give priority to user argument. + self._output_shapes[node.name] = [shape[node.name]] elif '_output_shapes' in attr: self._output_shapes[node.name] = \ [tensor_util.TensorShapeProtoToList(tshape) \ for tshape in attr['_output_shapes']] - elif shape: + else: # Keep the list indexable to avoid key error. # Actual value will be filled after node creation. 
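                    # The [None] placeholder set below is filled in after the op is
                    # converted, via ir_pass.infer_type (see the inference step below).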
self._output_shapes[node.name] = [None] - else: - raise NotImplementedError( \ - "Please freeze the graph with add_shapes=True") if node.op == "Placeholder": - self._output_shapes[node.name] = [shape[node.name]] + self._output_shapes[node.name] = [self._input_shapes[node.name]] self._nodes[node.name] = [_expr.var(node.name, - shape=self._output_shapes[node.name][0], + shape=self._input_shapes[node.name], dtype=attr['dtype'].name)] elif node.op == "Const": @@ -1268,7 +1295,7 @@ def from_tensorflow(self, graph, layout="NHWC", shape=None, outputs=None): else: # Pass the parsed shapes instead - attr["_output_shapes"] = self._output_shapes[node.name] + attr["_output_shapes"] = output_shapes = self._output_shapes[node.name] # Pass the node name too in attr attr["_node_name"] = node.name @@ -1295,7 +1322,7 @@ def from_tensorflow(self, graph, layout="NHWC", shape=None, outputs=None): op = self._convert_operator(node.op, inputs, attr, graph) - # Check is op is converted to param + # Check if op is converted to param if isinstance(op, np.ndarray): self._params[node.name] = tvm.nd.array(op) op = [_expr.var(node.name, @@ -1311,6 +1338,14 @@ def from_tensorflow(self, graph, layout="NHWC", shape=None, outputs=None): self._nodes[node.name] = op + # Infer shapes even without specifying "add_shapes=True" + if output_shapes == [None]: + out_type = ir_pass.infer_type(self._nodes[node.name][0]) + self._output_shapes[node.name] = [get_const_tuple(out_type.checked_type.shape)] + + if self._output_shapes[node.name] and shape and node.name in shape: + assert self._output_shapes[node.name] == list(shape[node.name]) + # Infer shapes if passed explicitely node_output = self._nodes[node.name] out_type = ir_pass.infer_type(node_output[0]) @@ -1385,7 +1420,7 @@ def _parse_param(self, key, value, name, shape): shape=self._params[name].shape, dtype=self._params[name].dtype)] else: - if key != 'dtype' and key != '_output_shapes' and key != '_class': + if key not in ('dtype', '_output_shapes', '_class'): raise NotImplementedError \ ("Other attributes for a Const(param) Node {} ? .".format(key)) diff --git a/nnvm/python/nnvm/frontend/util/tensorflow_parser.py b/python/tvm/relay/frontend/tensorflow_parser.py similarity index 93% rename from nnvm/python/nnvm/frontend/util/tensorflow_parser.py rename to python/tvm/relay/frontend/tensorflow_parser.py index 9b745c9d02c9..04c232b6d577 100644 --- a/nnvm/python/nnvm/frontend/util/tensorflow_parser.py +++ b/python/tvm/relay/frontend/tensorflow_parser.py @@ -7,16 +7,21 @@ class TFParser(object): - """A Wrapper to handle tensorflow models parsing - TensorFlow is needed - ``` - parser = TfParser(model_dir) - graph = parser.parse() - ``` + """ + A Wrapper to handle tensorflow models parsing, TensorFlow is needed + Parameters ---------- model_dir : tensorflow frozen pb file or a directory that contains saved model or checkpoints. + + Examples + -------- + .. code-block:: python + + parser = TfParser(model_dir) + graph = parser.parse() + # graph is related graphdef of the model """ def __init__(self, model_dir): @@ -117,9 +122,14 @@ def _load_ckpt(self): "not supported yet.") def parse(self): - """Parse tensorflow models: checkpoints, saved models, and single pb - file. """ + Parse tensorflow models: checkpoints, saved models, and single frozen pb file. 
+ + Returns + ------- + GraphDef of the passed model + """ + graph = None if os.path.isdir(self._model_dir): diff --git a/python/tvm/relay/frontend/tflite.py b/python/tvm/relay/frontend/tflite.py index d52c941e50f7..d45bb33859b2 100644 --- a/python/tvm/relay/frontend/tflite.py +++ b/python/tvm/relay/frontend/tflite.py @@ -35,6 +35,8 @@ def __init__(self, model, subgraph, exp_tab): self.builtin_op_code = build_str_map(BuiltinOperator()) self.activation_fn_type = build_str_map(ActivationFunctionType()) self.builtin_options = build_str_map(BuiltinOptions()) + + # Add more operators self.convert_map = { 'CONV_2D': self.convert_conv2d, 'DEPTHWISE_CONV_2D': self.convert_depthwise_conv2d, @@ -43,7 +45,7 @@ def __init__(self, model, subgraph, exp_tab): 'SOFTMAX': self.convert_softmax, 'SQUEEZE': self.convert_squeeze, 'MAX_POOL_2D': self.convert_max_pool2d, - # Add more operators + "CONCATENATION": self.convert_concatenation } def check_unsupported_ops(self): @@ -126,15 +128,14 @@ def get_tensor_value(self, tensor_wrapper): if tensor_wrapper.tensor.Type() == TensorType.UINT8: return np.frombuffer(tensor_wrapper.buffer.DataAsNumpy(), dtype=np.uint8).reshape( tensor_wrapper.tensor.ShapeAsNumpy()) - elif tensor_wrapper.tensor.Type() == TensorType.FLOAT32: + if tensor_wrapper.tensor.Type() == TensorType.FLOAT32: return np.frombuffer(tensor_wrapper.buffer.DataAsNumpy(), dtype=np.float32).reshape( tensor_wrapper.tensor.ShapeAsNumpy()) - elif tensor_wrapper.tensor.Type() == TensorType.INT32: + if tensor_wrapper.tensor.Type() == TensorType.INT32: return np.frombuffer(tensor_wrapper.buffer.DataAsNumpy(), dtype=np.int32).reshape( tensor_wrapper.tensor.ShapeAsNumpy()) - else: - raise NotImplementedError("Not support tensor type {}" - .format(str(tensor_wrapper.tensor.Type()))) + raise NotImplementedError("Not support tensor type {}" + .format(str(tensor_wrapper.tensor.Type()))) def get_tensor_type_str(self, tensor_type): """Get tensor type string representation when given TFLite tensor type""" @@ -145,12 +146,11 @@ def get_tensor_type_str(self, tensor_type): if tensor_type == TensorType.UINT8: return "uint8" - elif tensor_type == TensorType.FLOAT32: + if tensor_type == TensorType.FLOAT32: return "float32" - elif tensor_type == TensorType.INT32: + if tensor_type == TensorType.INT32: return "int32" - else: - raise NotImplementedError("Not support tensor type {}".format(str(tensor_type))) + raise NotImplementedError("Not support tensor type {}".format(str(tensor_type))) def convert_conv2d(self, op): """Convert TFLite conv2d""" @@ -192,7 +192,7 @@ def convert_reshape(self, op): in_expr = self.get_expr(input_tensor_idx) - if input_shape_length == 1 or input_shape_length == 2: + if input_shape_length in (1, 2): # The rule is channel first (after N but before H, W). # length of 1 means N*H*W*C, do nothing. # length of 2 means N*H*W, C, do nothing. 
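As a usage sketch for the TFLite converter wiring above (hedged: the model file name and input tensor name are hypothetical, and this assumes the module-level `from_tflite` entry point of this frontend):

```python
# Sketch: load a TFLite flatbuffer and convert it through the
# OperatorConverter above. Requires the `tflite` python package.
import tflite.Model
import tvm.relay as relay

buf = open("model.tflite", "rb").read()          # hypothetical model file
tflite_model = tflite.Model.Model.GetRootAsModel(buf, 0)

# Operators missing from convert_map (e.g. CONCATENATION before this
# change) are reported by check_unsupported_ops().
func, params = relay.frontend.from_tflite(
    tflite_model,
    shape_dict={"input": (1, 224, 224, 3)},      # hypothetical input name
    dtype_dict={"input": "float32"})
```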
@@ -247,6 +247,48 @@ def convert_softmax(self, op):
 
         return out
 
+    def convert_concatenation(self, op):
+        """Convert TFLite concatenation"""
+        try:
+            from tflite.Operator import Operator
+            from tflite.ConcatenationOptions import ConcatenationOptions
+            from tflite.BuiltinOptions import BuiltinOptions
+            from tflite.ActivationFunctionType import ActivationFunctionType
+        except ImportError:
+            raise ImportError("The tflite package must be installed")
+
+        assert isinstance(op, Operator)
+        input_tensors = self.get_input_tensors(op)
+        assert len(input_tensors) >= 1, "there should be at least one input tensor"
+        in_exprs = [self.get_expr(input_tensor.tensor_idx) for input_tensor in input_tensors]
+
+        output_tensors = self.get_output_tensors(op)
+        assert len(output_tensors) == 1, "there should be exactly one output tensor"
+
+        assert op.BuiltinOptionsType() == BuiltinOptions.ConcatenationOptions
+        op_options = op.BuiltinOptions()
+        concatenation_options = ConcatenationOptions()
+        concatenation_options.Init(op_options.Bytes, op_options.Pos)
+        concatenation_axis = concatenation_options.Axis()
+        fused_activation_fn = concatenation_options.FusedActivationFunction()
+        input_shape_length = len(input_tensors[0].tensor.ShapeAsNumpy())
+
+        # TFLite is N H W C, our layout is N C H W
+        if input_shape_length <= 4:
+            axis_convert_map = [0] + list(range(2, input_shape_length)) + [1]
+            concatenation_axis = axis_convert_map[concatenation_axis]
+        else:
+            raise NotImplementedError("Unsupported input shape length {} for concatenation"
+                                      .format(str(input_shape_length)))
+
+        # concatenate, with the axis converted to the N C H W layout
+        out = _op.concatenate(in_exprs, axis=concatenation_axis)
+
+        # apply the fused activation function, if any
+        if fused_activation_fn != ActivationFunctionType.NONE:
+            out = self.convert_fused_activation_function(out, fused_activation_fn)
+        return out
+
     def convert_squeeze(self, op):
         """Convert TFLite squeeze"""
         try:
@@ -275,7 +317,7 @@ def convert_squeeze(self, op):
 
         in_expr = self.get_expr(input_tensor_idx)
 
         # TFLite is N H W C, our layout is N C H W
-        if input_shape_length == 1 or input_shape_length == 2:
+        if input_shape_length in (1, 2):
             # The rule is channel first (after N but before H, W).
             # length of 1 means N*H*W*C, do nothing.
             # length of 2 means N*H*W, C, do nothing.
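A quick worked check of the NHWC-to-NCHW axis remap in `convert_concatenation` above, for the common 4-D case:

```python
# The remap built above, specialized to input_shape_length == 4:
axis_convert_map = [0] + list(range(2, 4)) + [1]
assert axis_convert_map == [0, 2, 3, 1]
# A TFLite concat over axis 3 (channels in N H W C) therefore becomes a
# Relay concat over axis 1 (channels in N C H W); batch stays at axis 0.
assert axis_convert_map[3] == 1 and axis_convert_map[0] == 0
```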
@@ -299,7 +341,7 @@ def convert_squeeze(self, op): # 3: N H W C, reshape to N H*W C, transpose to N C H*W # 4: N H W C, transpose to N C H W # add more if we need target shapes in future - if output_shape_length == 1 or output_shape_length == 2: + if output_shape_length in (1, 2): pass elif output_shape_length == 3: out = _op.transpose(out, axes=(0, 2, 1)) @@ -320,16 +362,15 @@ def convert_fused_activation_function(self, in_expr, fused_activation_fn): assert fused_activation_fn != ActivationFunctionType.NONE if fused_activation_fn == ActivationFunctionType.RELU6: return _op.clip(in_expr, a_min=0, a_max=6) - elif fused_activation_fn == ActivationFunctionType.RELU: + if fused_activation_fn == ActivationFunctionType.RELU: return _op.nn.relu(in_expr) - elif fused_activation_fn == ActivationFunctionType.RELU_N1_TO_1: + if fused_activation_fn == ActivationFunctionType.RELU_N1_TO_1: return _op.clip(in_expr, a_min=-1, a_max=1) - elif fused_activation_fn == ActivationFunctionType.TANH: + if fused_activation_fn == ActivationFunctionType.TANH: return _op.tanh(in_expr) - else: - fused_activation_fn_str = self.activation_fn_type[fused_activation_fn] - raise NotImplementedError("Unsupported fused activation fn {}" - .format(fused_activation_fn_str)) + fused_activation_fn_str = self.activation_fn_type[fused_activation_fn] + raise NotImplementedError("Unsupported fused activation fn {}" + .format(fused_activation_fn_str)) def convert_conv(self, op, conv_type): """convolution implementation.""" @@ -401,7 +442,7 @@ def convert_conv(self, op, conv_type): # weight tensor type should be UINT8 (quantization) or FLOAT32 weight_tensor_type = weight_tensor.tensor.Type() - assert weight_tensor_type == TensorType.UINT8 or weight_tensor_type == TensorType.FLOAT32 + assert weight_tensor_type in (TensorType.UINT8, TensorType.FLOAT32) weight_tensor_type_str = self.get_tensor_type_str(weight_tensor_type) in_expr = self.get_expr(input_tensor_idx) @@ -434,7 +475,7 @@ def convert_conv(self, op, conv_type): bias_tensor = input_tensors[2] bias_tensor_type = bias_tensor.tensor.Type() # bias tensor type should be INT32 (quantization) or FLOAT32 - assert bias_tensor_type == TensorType.INT32 or bias_tensor_type == TensorType.FLOAT32 + assert bias_tensor_type in (TensorType.INT32, TensorType.FLOAT32) bias_tensor_type_str = self.get_tensor_type_str(bias_tensor_type) bias_expr = self.exp_tab.new_const(self.get_tensor_value(bias_tensor), dtype=bias_tensor_type_str) diff --git a/python/tvm/relay/ir_pass.py b/python/tvm/relay/ir_pass.py index 561c5d388788..2d8e99ae8b25 100644 --- a/python/tvm/relay/ir_pass.py +++ b/python/tvm/relay/ir_pass.py @@ -490,7 +490,7 @@ def collect_device_annotation_ops(expr): return _ir_pass.CollectDeviceAnnotationOps(expr) -def to_anf(expr, mod=None): +def to_a_normal_form(expr, mod=None): """ Turn Graph Normal Form expression into A Normal Form Expression. @@ -513,12 +513,28 @@ def to_anf(expr, mod=None): expr: tvm.relay.Expr The output expression. """ - return _ir_pass.to_anf(expr, mod) + return _ir_pass.to_a_normal_form(expr, mod) -def gradient(expr, mod=None): +def to_graph_normal_form(expr): + """Turn A Normal Form expression into Graph Normal Form expression + Parameters + ---------- + expr : tvm.relay.Expr + The input expression + Returns + ------- + expr : tvm.relay.Expr + The output expression """ - Transform a function to return original result paired with gradient of input. 
+    return _ir_pass.to_graph_normal_form(expr)
+
+
+def gradient(expr, mod=None, mode='higher_order'):
+    """
+    Transform the input function,
+    returning a function that calculates the original result,
+    paired with the gradients of the inputs.
 
     Parameters
     ----------
@@ -527,12 +543,24 @@ def gradient(expr, mod=None):
 
     mod : Optional[tvm.relay.Module]
 
+    mode : Optional[String]
+        The mode of the automatic differentiation algorithm.
+        'first_order' only works on first-order code and produces neither
+        references nor closures.
+        'higher_order' works on all code, using references and closures.
+
     Returns
     -------
     expr : tvm.relay.Expr
-        The output expression.
+        The transformed expression.
     """
-    return _ir_pass.first_order_gradient(expr, mod)
+    if mode == 'first_order':
+        return _ir_pass.first_order_gradient(expr, mod)
+    elif mode == 'higher_order':
+        return _ir_pass.gradient(expr, mod)
+    else:
+        raise ValueError("unknown mode: %s" % mode)
+
+
 
 def get_total_mac_number(expr):
     """
@@ -549,3 +577,23 @@ def get_total_mac_number(expr):
         The number of MACs (multiply-accumulate) of a model
     """
     return _ir_pass.GetTotalMacNumber(expr)
+
+
+def eliminate_common_subexpr(expr, fskip=None):
+    """
+    Eliminate common subexpressions.
+
+    Parameters
+    ----------
+    expr : tvm.relay.Expr
+        The input expression.
+
+    fskip: function
+        The callback function that decides whether an expression should be skipped.
+
+    Returns
+    -------
+    expr : tvm.relay.Expr
+        The output expression.
+    """
+    return _ir_pass.eliminate_common_subexpr(expr, fskip)
diff --git a/python/tvm/relay/op/__init__.py b/python/tvm/relay/op/__init__.py
index 13f521dad660..9dd2bf88c934 100644
--- a/python/tvm/relay/op/__init__.py
+++ b/python/tvm/relay/op/__init__.py
@@ -1,9 +1,8 @@
 #pylint: disable=wildcard-import, redefined-builtin
 """Relay core operators."""
 # operator defs
-from .op import get, register, register_schedule, register_compute, register_alter_op_layout, \
-    Op
-from .op import debug
+from .op import get, register, register_schedule, register_compute, register_gradient, \
+    register_pattern, register_alter_op_layout, schedule_injective, Op, OpPattern, debug
 
 # Operators
 from .reduce import *
@@ -18,6 +17,7 @@
 
 # operator registry
 from . import _tensor
+from . import _tensor_grad
 from . import _transform
 from . 
import _reduce from ..expr import Expr diff --git a/python/tvm/relay/op/_tensor.py b/python/tvm/relay/op/_tensor.py index d9b5e2e89ce0..7f8da03008d2 100644 --- a/python/tvm/relay/op/_tensor.py +++ b/python/tvm/relay/op/_tensor.py @@ -3,25 +3,7 @@ from __future__ import absolute_import import topi from .op import register_compute, register_schedule, register_pattern -from .op import register_gradient from .op import schedule_injective, OpPattern -from .transform import collapse_sum_like -from .tensor import negative - - -def add_grad(orig, grad): - return [collapse_sum_like(grad, orig.args[0]), collapse_sum_like(grad, orig.args[1])] - - -register_gradient("add", add_grad) - - -def subtract_grad(orig, grad): - return [collapse_sum_like(grad, orig.args[0]), - collapse_sum_like(negative(grad), orig.args[1])] - - -register_gradient("subtract", subtract_grad) schedule_broadcast = schedule_injective schedule_elemwise = schedule_injective @@ -36,6 +18,7 @@ def subtract_grad(orig, grad): register_schedule("round", schedule_broadcast) register_schedule("abs", schedule_broadcast) register_schedule("tanh", schedule_broadcast) +register_schedule("logical_not", schedule_broadcast) register_schedule("negative", schedule_broadcast) register_schedule("copy", schedule_broadcast) @@ -45,6 +28,8 @@ def subtract_grad(orig, grad): register_schedule("divide", schedule_broadcast) register_schedule("power", schedule_injective) register_schedule("mod", schedule_broadcast) +register_schedule("logical_and", schedule_broadcast) +register_schedule("logical_or", schedule_broadcast) register_schedule("equal", schedule_broadcast) register_schedule("not_equal", schedule_broadcast) register_schedule("less", schedule_broadcast) diff --git a/python/tvm/relay/op/_tensor_grad.py b/python/tvm/relay/op/_tensor_grad.py new file mode 100644 index 000000000000..173e97a00496 --- /dev/null +++ b/python/tvm/relay/op/_tensor_grad.py @@ -0,0 +1,79 @@ +#pylint: disable=invalid-name, unused-argument +"""Backend compiler related feature registration""" +from __future__ import absolute_import +from ..expr import const +from .op import register_gradient +from .transform import collapse_sum_like, where +from .tensor import exp, negative, power, less +from .tensor import zeros_like, ones_like + + +@register_gradient("log") +def log_grad(orig, grad): + """Returns [grad * (1 / x)]""" + x = orig.args[0] + return [grad * ones_like(x) / x] + + +@register_gradient("exp") +def exp_grad(orig, grad): + """Returns [grad * exp(x)]""" + return [grad * exp(orig.args[0])] + + +@register_gradient("sqrt") +def sqrt_grad(orig, grad): + """Returns [grad * 0.5 * (x ^ -0.5)]""" + a = const(0.5) # (TODO) type? 
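+    # d/dx sqrt(x) = 0.5 * x**(-0.5); the constant above defaults to
+    # float32 (see the TODO), which assumes a float32 input tensor.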
+    return [grad * a * power(orig.args[0], negative(a))]
+
+
+@register_gradient("sigmoid")
+def sigmoid_grad(orig, grad):
+    """Returns [grad * sigmoid(x) * (1 - sigmoid(x))]."""
+    return [grad * orig * (ones_like(orig) - orig)]
+
+
+@register_gradient("tanh")
+def tanh_grad(orig, grad):
+    """Returns grad * (1 - tanh(x) * tanh(x))."""
+    return [grad * (ones_like(orig) - orig * orig)]
+
+
+@register_gradient("nn.relu")
+def relu_grad(orig, grad):
+    """Returns grad * (select(x < 0, 0, 1))."""
+    x = orig.args[0]
+    zeros = zeros_like(x)
+    ones = ones_like(x)
+    return [where(less(x, zeros), zeros, ones * grad)]
+
+
+@register_gradient("add")
+def add_grad(orig, grad):
+    """Returns [grad, grad]"""
+    return [collapse_sum_like(grad, orig.args[0]),
+            collapse_sum_like(grad, orig.args[1])]
+
+
+@register_gradient("subtract")
+def subtract_grad(orig, grad):
+    """Returns [grad, -grad]"""
+    return [collapse_sum_like(grad, orig.args[0]),
+            collapse_sum_like(negative(grad), orig.args[1])]
+
+
+@register_gradient("multiply")
+def multiply_grad(orig, grad):
+    """Returns [grad * y, grad * x]"""
+    x, y = orig.args
+    return [collapse_sum_like(grad * y, x),
+            collapse_sum_like(grad * x, y)]
+
+
+@register_gradient("divide")
+def divide_grad(orig, grad):
+    """Returns [grad / y, - grad * (x / y) / y]"""
+    x, y = orig.args
+    return [collapse_sum_like(grad / y, x),
+            collapse_sum_like(- (grad * orig / y), y)]
diff --git a/python/tvm/relay/op/_transform.py b/python/tvm/relay/op/_transform.py
index abf0b5317b48..2b43c21f8e10 100644
--- a/python/tvm/relay/op/_transform.py
+++ b/python/tvm/relay/op/_transform.py
@@ -1,7 +1,6 @@
 """Backend compiler related feature registration"""
 # pylint: disable=invalid-name,unused-argument
 from __future__ import absolute_import
-import topi
 from . 
import op as _reg from ._reduce import _schedule_reduce from .op import schedule_injective, OpPattern @@ -19,6 +18,9 @@ _reg.register_schedule("reshape_like", schedule_injective) _reg.register_schedule("full", schedule_injective) _reg.register_schedule("full_like", schedule_injective) +_reg.register_schedule("arange", schedule_injective) +_reg.register_schedule("repeat", schedule_broadcast) +_reg.register_schedule("tile", schedule_broadcast) _reg.register_schedule("cast", schedule_injective) _reg.register_schedule("strided_slice", schedule_injective) _reg.register_schedule("slice_like", schedule_injective) @@ -26,16 +28,10 @@ _reg.register_schedule("take", schedule_injective) _reg.register_schedule("transpose", schedule_injective) _reg.register_schedule("where", schedule_broadcast) +_reg.register_schedule("stack", schedule_injective) +_reg.register_schedule("concatenate", schedule_injective) _reg.register_schedule("_contrib_reverse_reshape", schedule_injective) # layout_transform _reg.register_schedule("layout_transform", schedule_injective) _reg.register_pattern("layout_transform", OpPattern.INJECTIVE) - -# concatenate -@_reg.register_compute("concatenate") -def concatenate_compute(attrs, inputs, output_type, target): - return [topi.concatenate(inputs, axis=attrs.axis)] - -_reg.register_schedule("concatenate", schedule_injective) -_reg.register_pattern("concatenate", OpPattern.INJECTIVE) diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py index 5d4cda162ee3..0c2733ecae92 100644 --- a/python/tvm/relay/op/nn/_nn.py +++ b/python/tvm/relay/op/nn/_nn.py @@ -46,6 +46,21 @@ def schedule_dense(attrs, outputs, target): reg.register_pattern("nn.dense", reg.OpPattern.OUT_ELEMWISE_FUSABLE) +# batch_matmul +@reg.register_compute("nn.batch_matmul") +def compute_batch_matmul(attrs, inputs, out_type, target): + """Compute definition of batch_matmul""" + return [topi.nn.batch_matmul(inputs[0], inputs[1])] + +@reg.register_schedule("nn.batch_matmul") +def schedule_batch_matmul(attrs, outputs, target): + """Schedule definition of batch_matmul""" + with target: + return topi.generic.schedule_batch_matmul(outputs) + +reg.register_pattern("nn.batch_matmul", reg.OpPattern.OUT_ELEMWISE_FUSABLE) + + # conv2d @reg.register_compute("nn.conv2d") def compute_conv2d(attrs, inputs, out_type, target): @@ -57,7 +72,7 @@ def compute_conv2d(attrs, inputs, out_type, target): layout = attrs.data_layout kernel_layout = attrs.kernel_layout out_dtype = attrs.out_dtype - out_dtype = (inputs[0].dtype if (out_dtype == "same" or out_dtype == "") + out_dtype = (inputs[0].dtype if out_dtype in ("same", "") else out_dtype) assert layout in ["NCHW", "NHWC", "NCHW4c"] @@ -95,15 +110,15 @@ def schedule_conv2d(attrs, outs, target): with target: if groups == 1 and layout == "NCHW": return topi.generic.schedule_conv2d_nchw(outs) - elif groups == 1 and layout == "NCHW4c": + if groups == 1 and layout == "NCHW4c": return topi.generic.schedule_conv2d_nchw(outs) - elif groups == 1 and layout == "NHWC": + if groups == 1 and layout == "NHWC": return topi.generic.schedule_conv2d_nhwc(outs) - elif groups != 1: + if groups != 1: if layout == "NCHW": # TODO(leyuan, merrymercy, Huyuwei): fold depthwise topi into conv2d. 
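             # groups != 1 on NCHW currently reuses the depthwise schedule
             # (see the TODO above); there is no dedicated grouped-conv schedule yet.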
return topi.generic.schedule_depthwise_conv2d_nchw(outs) - elif layout == "NHWC" and kernel_layout == "HWOI": + if layout == "NHWC" and kernel_layout == "HWOI": return topi.generic.schedule_depthwise_conv2d_nhwc(outs) raise ValueError("No compatible schedule") @@ -127,7 +142,7 @@ def compute_conv2d_transpose(attrs, inputs, out_dtype, target): groups = attrs.groups layout = attrs.data_layout out_dtype = attrs.out_dtype - out_dtype = (inputs[0].dtype if (out_dtype == "same" or out_dtype == "") + out_dtype = (inputs[0].dtype if out_dtype in ("same", "") else out_dtype) assert layout == "NCHW", "only support nchw for now" assert dilation == (1, 1), "not support dilate now" diff --git a/python/tvm/relay/op/nn/nn.py b/python/tvm/relay/op/nn/nn.py index 06cd79a8ff8b..41b2148ec390 100644 --- a/python/tvm/relay/op/nn/nn.py +++ b/python/tvm/relay/op/nn/nn.py @@ -767,6 +767,31 @@ def batch_norm(data, return TupleWrapper(result, 3) +def batch_matmul(x, y): + r""" + Computes batch matrix multiplication of `x` and `y` when `x` and `y` are data + in batch. + + .. math:: + + \mbox{batch_matmul}(x, y)[i, :, :] = \mbox{matmul}(x[i, :, :], y[i, :, :]^T) + + Parameters + ---------- + x : tvm.relay.Expr + The first input. + + y : tvm.relay.Expr + The second input. + + Returns + ------- + result: tvm.relay.Expr + The computed result. + """ + return _make.batch_matmul(x, y) + + def contrib_conv2d_winograd_without_weight_transform(data, weight, tile_size, diff --git a/python/tvm/relay/op/op.py b/python/tvm/relay/op/op.py index e751a4e5565e..37f1fc1ee2b5 100644 --- a/python/tvm/relay/op/op.py +++ b/python/tvm/relay/op/op.py @@ -168,7 +168,7 @@ def register_pattern(op_name, pattern, level=10): """ return register(op_name, "TOpPattern", pattern, level) -def register_gradient(op_name, fgradient, level=10): +def register_gradient(op_name, fgradient=None, level=10): """Register operator pattern for an op. Parameters diff --git a/python/tvm/relay/op/op_attrs.py b/python/tvm/relay/op/op_attrs.py index d6d73242bb96..5fa83bd96f30 100644 --- a/python/tvm/relay/op/op_attrs.py +++ b/python/tvm/relay/op/op_attrs.py @@ -6,19 +6,18 @@ @register_relay_attr_node class Conv2DAttrs(Attrs): """Attribute of nn.conv2d""" - pass + @register_relay_attr_node class Conv2DWinogradAttrs(Attrs): """Attribute of nn.contrib_conv2d_winograd_without_weight_transform""" - pass + @register_relay_attr_node class Conv2DWinogradWeightTransformAttrs(Attrs): """Attribute of nn.contrib_conv2d_winograd_weight_transform""" - pass + @register_relay_attr_node class GlobalPool2DAttrs(Attrs): """Attribute of nn.global_pool""" - pass diff --git a/python/tvm/relay/op/tensor.py b/python/tvm/relay/op/tensor.py index a6247dd971a8..e315f27dc593 100644 --- a/python/tvm/relay/op/tensor.py +++ b/python/tvm/relay/op/tensor.py @@ -191,6 +191,22 @@ def negative(data): return _make.negative(data) +def logical_not(data): + """Compute element-wise logical not of data. + + Parameters + ---------- + data : relay.Expr + The input data + + Returns + ------- + result : relay.Expr + The computed result. + """ + return _make.logical_not(data) + + def add(lhs, rhs): """Addition with numpy-style broadcasting. @@ -307,6 +323,42 @@ def mod(lhs, rhs): return _make.mod(lhs, rhs) +def logical_and(lhs, rhs): + """logical AND with numpy-style broadcasting. + + Parameters + ---------- + lhs : relay.Expr + The left hand side input data + rhs : relay.Expr + The right hand side input data + + Returns + ------- + result : relay.Expr + The computed result. 
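+
+    Examples
+    --------
+    A minimal sketch (shapes illustrative):
+
+    .. code-block:: python
+
+        x = relay.var("x", shape=(2, 2), dtype="bool")
+        y = relay.var("y", shape=(2,), dtype="bool")
+        z = relay.logical_and(x, y)  # y is broadcast across rows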
+ """ + return _make.logical_and(lhs, rhs) + + +def logical_or(lhs, rhs): + """logical OR with numpy-style broadcasting. + + Parameters + ---------- + lhs : relay.Expr + The left hand side input data + rhs : relay.Expr + The right hand side input data + + Returns + ------- + result : relay.Expr + The computed result. + """ + return _make.logical_or(lhs, rhs) + + def equal(lhs, rhs): """Broadcasted elementwise test for (lhs == rhs). diff --git a/python/tvm/relay/op/transform.py b/python/tvm/relay/op/transform.py index 78efd3cfd4d9..b77269843c91 100644 --- a/python/tvm/relay/op/transform.py +++ b/python/tvm/relay/op/transform.py @@ -166,8 +166,9 @@ def reshape_like(data, shape_like): """Reshapes the input array by the size of another array. For an input array with shape ``(d1, d2, ..., dk)``, `reshape_like` operation reshapes the input array into an output array with the same shape as the second input array. + .. note:: - Sizes for both array should be compatible. + Sizes for both array should be compatible. Parameters ---------- @@ -249,10 +250,148 @@ def full_like(data, fill_value): return _make.full_like(data, fill_value) +def arange(start, stop=None, step=1, dtype="float32"): + """Return evenly spaced values within a given interval. + + .. note:: + Similar to ``numpy.arange``, when only one argument is given, it is used + as `stop` instead of `start` while `start` takes default value 0. + + Warning: Undefined behavior when dtype is incompatible with start/stop/step. + It could lead to different results compared to numpy, MXNet, pytorch, etc. + + Parameters + ---------- + start : tvm.Expr, optional + Start of interval. The interval includes this value. The default start + value is 0. + + stop : tvm.Expr + Stop of interval. The interval does not include this value. + + step : tvm.Expr, optional + Spacing between values. The default step size is 1. + + dtype : str, optional + The target data type. + + Returns + ------- + result : relay.Expr + The resulting tensor. + + Examples + -------- + .. code-block:: python + + relay.arange(5) = [0, 1, 2, 3, 4] + relay.arange(1, 5) = [1, 2, 3, 4] + relay.arange(1, 5, 1.5) = [1, 2.5, 4] + """ + if stop is None: + stop = start + start = 0 + return _make.arange(start, stop, step, dtype) + + +def stack(data, axis): + """Join a sequence of arrays along a new axis. + + Parameters + ---------- + data : relay.Expr + The input data to the operator. + + axis : int + The axis in the result array along which the input arrays are stacked. + + .. note:: + Each array in the input array sequence must have the same shape. + + Returns + ------- + ret : relay.Expr + The computed result. + """ + return _make.stack(data, axis) + + +def repeat(data, repeats, axis): + """Repeats elements of an array. + By default, repeat flattens the input array into 1-D and then repeats the elements. + + repeats : int + The number of repetitions for each element. + + axis: int + The axis along which to repeat values. The negative numbers are interpreted + counting from the backward. By default, use the flattened input array, and + return a flat output array. + + Returns + ------- + ret : relay.Expr + The computed result. + + Examples + -------- + .. code-block:: python + + x = [[1, 2], [3, 4]] + relay.repeat(x, repeats=2) = [1., 1., 2., 2., 3., 3., 4., 4.] + + relay.repeat(x, repeats=2, axis=1) = [[1., 1., 2., 2.], + [3., 3., 4., 4.]] + """ + return _make.repeat(data, repeats, axis) + + +def tile(data, reps): + """Repeats the whole array multiple times. 
+ + Parameters + ---------- + data : relay.Expr + The input data to the operator. + + reps : tuple of int + The number of times repeating the tensor data. + + .. note:: + Each dim size of reps must be a positive integer. If reps has length d, + the result will have dimension of max(d, data.ndim); If data.ndim < d, + data is promoted to be d-dimensional by prepending new axes. + If data.ndim >= d, reps is promoted to a.ndim by pre-pending 1's to it. + + Returns + ------- + ret : relay.Expr + The computed result. + + Examples + -------- + .. code-block:: python + + x = [[1, 2], [3, 4]] + relay.tile(x, reps=(2,3)) = [[1., 2., 1., 2., 1., 2.], + [3., 4., 3., 4., 3., 4.], + [1., 2., 1., 2., 1., 2.], + [3., 4., 3., 4., 3., 4.]] + + relay.tile(x, reps=(2,)) = [[1., 2., 1., 2.], + [3., 4., 3., 4.]] + """ + + return _make.tile(data, reps) + + def where(condition, x, y): """Selecting elements from either x or y depending on the value of the condition. + .. note:: + The shape of condition, x, and y needs to be the same. + Parameters ---------- condition : relay.Expr @@ -282,8 +421,6 @@ def where(condition, x, y): condition = [1, 0] relay.where(conditon, x, y) = [[1, 2], [7, 8]] - - Note that the shape of condition, x, and y needs to be the same. """ return _make.where(condition, x, y) @@ -388,7 +525,7 @@ def strided_slice(data, begin, end, strides=None): The indices to begin with in the slicing. end: list of int - Indicies indicating end of the slice. + Indices indicating end of the slice. strides: list of int, optional Specifies the stride values, it can be negative in that case, diff --git a/python/tvm/relay/op/vision/__init__.py b/python/tvm/relay/op/vision/__init__.py index 710adfeb4955..0cee4e4faeec 100644 --- a/python/tvm/relay/op/vision/__init__.py +++ b/python/tvm/relay/op/vision/__init__.py @@ -5,5 +5,7 @@ from .multibox import * from .nms import * from .rcnn import * -from . import _multibox +from .yolo import * from . import _rcnn +from . import _yolo +from . import _vision diff --git a/python/tvm/relay/op/vision/_rcnn.py b/python/tvm/relay/op/vision/_rcnn.py index 2617bf8562b9..9606ee64c7be 100644 --- a/python/tvm/relay/op/vision/_rcnn.py +++ b/python/tvm/relay/op/vision/_rcnn.py @@ -1,7 +1,7 @@ # pylint: disable=invalid-name, unused-argument """Faster R-CNN and Mask R-CNN operations.""" import topi -from topi.util import get_const_tuple +from topi.util import get_const_tuple, get_float_tuple, get_const_int from .. 
import op as reg from ..op import OpPattern @@ -21,3 +21,29 @@ def schedule_roi_align(_, outs, target): return topi.generic.vision.schedule_roi_align(outs) reg.register_pattern("vision.roi_align", OpPattern.OUT_ELEMWISE_FUSABLE) + +@reg.register_compute("vision.proposal") +def compute_proposal(attrs, inputs, _, target): + """Compute definition of proposal""" + scales = get_float_tuple(attrs.scales) + ratios = get_float_tuple(attrs.ratios) + feature_stride = attrs.feature_stride + threshold = attrs.threshold + rpn_pre_nms_top_n = attrs.rpn_pre_nms_top_n + rpn_post_nms_top_n = attrs.rpn_post_nms_top_n + rpn_min_size = attrs.rpn_min_size + iou_loss = bool(get_const_int(attrs.iou_loss)) + with target: + return [ + topi.vision.rcnn.proposal(inputs[0], inputs[1], inputs[2], scales, ratios, + feature_stride, threshold, rpn_pre_nms_top_n, + rpn_post_nms_top_n, rpn_min_size, iou_loss) + ] + +@reg.register_schedule("vision.proposal") +def schedule_proposal(_, outs, target): + """Schedule definition of proposal""" + with target: + return topi.generic.schedule_proposal(outs) + +reg.register_pattern("vision.proposal", OpPattern.OPAQUE) diff --git a/python/tvm/relay/op/vision/_multibox.py b/python/tvm/relay/op/vision/_vision.py similarity index 62% rename from python/tvm/relay/op/vision/_multibox.py rename to python/tvm/relay/op/vision/_vision.py index e9ef43f7e06f..c887076e6af8 100644 --- a/python/tvm/relay/op/vision/_multibox.py +++ b/python/tvm/relay/op/vision/_vision.py @@ -54,24 +54,46 @@ def compute_multibox_transform_loc(attrs, inputs, _, target): reg.register_pattern("vision.multibox_detection", OpPattern.OPAQUE) +# Get counts of valid boxes +@reg.register_schedule("vision.get_valid_counts") +def schedule_get_valid_counts(_, outs, target): + """Schedule definition of get_valid_counts""" + with target: + return topi.generic.schedule_get_valid_counts(outs) + + +@reg.register_compute("vision.get_valid_counts") +def compute_get_valid_counts(attrs, inputs, _, target): + """Compute definition of get_valid_counts""" + score_threshold = get_const_float(attrs.score_threshold) + return topi.vision.get_valid_counts(inputs[0], score_threshold) + +reg.register_pattern("vision.get_valid_counts", OpPattern.OPAQUE) + + # non-maximum suppression -@reg.register_schedule("vision.nms") +@reg.register_schedule("vision.non_max_suppression") def schedule_nms(_, outs, target): """Schedule definition of nms""" with target: return topi.generic.schedule_nms(outs) -@reg.register_compute("vision.nms") +@reg.register_compute("vision.non_max_suppression") def compute_nms(attrs, inputs, _, target): """Compute definition of nms""" - overlap_threshold = get_const_float(attrs.overlap_threshold) + return_indices = bool(get_const_int(attrs.return_indices)) + max_output_size = get_const_int(attrs.max_output_size) + iou_threshold = get_const_float(attrs.iou_threshold) force_suppress = bool(get_const_int(attrs.force_suppress)) - topk = get_const_int(attrs.topk) + top_k = get_const_int(attrs.top_k) + id_index = get_const_int(attrs.id_index) + invalid_to_bottom = bool(get_const_int(attrs.invalid_to_bottom)) return [ - topi.vision.nms(inputs[0], inputs[1], overlap_threshold, - force_suppress, topk) + topi.vision.non_max_suppression(inputs[0], inputs[1], max_output_size, + iou_threshold, force_suppress, top_k, + id_index, return_indices, invalid_to_bottom) ] -reg.register_pattern("vision.nms", OpPattern.OPAQUE) +reg.register_pattern("vision.non_max_suppression", OpPattern.OPAQUE) diff --git a/python/tvm/relay/op/vision/_yolo.py 
b/python/tvm/relay/op/vision/_yolo.py new file mode 100644 index 000000000000..749ebfa26dd0 --- /dev/null +++ b/python/tvm/relay/op/vision/_yolo.py @@ -0,0 +1,9 @@ +#pylint: disable=invalid-name, unused-argument +"""Backend compiler related feature registration""" +from __future__ import absolute_import +from ..op import register_schedule, register_pattern +from ..op import schedule_injective, OpPattern + +# reorg +register_pattern("vision.yolo_reorg", OpPattern.INJECTIVE) +register_schedule("vision.yolo_reorg", schedule_injective) diff --git a/python/tvm/relay/op/vision/nms.py b/python/tvm/relay/op/vision/nms.py index 8035e3030b17..0124ee29ab9e 100644 --- a/python/tvm/relay/op/vision/nms.py +++ b/python/tvm/relay/op/vision/nms.py @@ -1,12 +1,41 @@ """Non-maximum suppression operations.""" from __future__ import absolute_import as _abs from . import _make +from ...expr import TupleWrapper -def nms(data, - valid_count, - overlap_threshold=0.5, - force_suppress=False, - topk=-1): +def get_valid_counts(data, + score_threshold): + """Get valid count of bounding boxes given a score threshold. + Also moves valid boxes to the top of input data. + + Parameters + ---------- + data : relay.Expr + Input data. 3-D tensor with shape [batch_size, num_anchors, 6]. + + score_threshold : optional, float + Lower limit of score for valid bounding boxes. + + Returns + ------- + valid_count : relay.Expr + 1-D tensor for valid number of boxes. + + out_tensor : relay.Expr + Rearranged data tensor. + """ + return TupleWrapper(_make.get_valid_counts(data, score_threshold), 2) + + +def non_max_suppression(data, + valid_count, + max_output_size=-1, + iou_threshold=0.5, + force_suppress=False, + top_k=-1, + id_index=0, + return_indices=True, + invalid_to_bottom=False): """Non-maximum suppression operator for object detection. Parameters @@ -19,18 +48,33 @@ def nms(data, valid_count : relay.Expr 1-D tensor for valid number of boxes. - overlap_threshold : float, optional + max_output_size : int, optional + Max number of output valid boxes for each instance. + By default all valid boxes are returned. + + iou_threshold : float, optional Non-maximum suppression threshold. force_suppress : bool, optional Suppress all detections regardless of class_id. - topk : int, optional + top_k : int, optional Keep maximum top k detections before nms, -1 for no limit. + id_index : int, optional + index of the class categories, -1 to disable. + + return_indices : bool, optional + Whether to return box indices in input data. + + invalid_to_bottom : bool, optional + Whether to move all valid bounding boxes to the top. + Returns ------- out : relay.Expr 3-D tensor with shape [batch_size, num_anchors, 6]. 
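+
+    Examples
+    --------
+    A sketch of the usual pipeline (mirroring the MXNet box_nms
+    conversion in this change): filter boxes by score, then run NMS.
+
+    .. code-block:: python
+
+        data = relay.var("data", shape=(1, 128, 6))
+        ret = relay.vision.get_valid_counts(data, score_threshold=0.0)
+        out = relay.vision.non_max_suppression(ret[1], ret[0],
+                                               iou_threshold=0.5)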
""" - return _make.nms(data, valid_count, overlap_threshold, force_suppress, topk) + return _make.non_max_suppression(data, valid_count, max_output_size, + iou_threshold, force_suppress, top_k, + id_index, return_indices, invalid_to_bottom) diff --git a/python/tvm/relay/op/vision/rcnn.py b/python/tvm/relay/op/vision/rcnn.py index 8bbafbe75c53..8e95435d0ecc 100644 --- a/python/tvm/relay/op/vision/rcnn.py +++ b/python/tvm/relay/op/vision/rcnn.py @@ -30,3 +30,63 @@ def roi_align(data, rois, pooled_size, spatial_scale, sample_ratio=-1, layout='N 4-D tensor with shape [num_roi, channel, pooled_size, pooled_size] """ return _make.roi_align(data, rois, pooled_size, spatial_scale, sample_ratio, layout) + + +def proposal(cls_prob, + bbox_pred, + im_info, + scales, + ratios, + feature_stride, + threshold, + rpn_pre_nms_top_n, + rpn_post_nms_top_n, + rpn_min_size, + iou_loss): + """Proposal operator. + + Parameters + ---------- + cls_prob : relay.Expr + 4-D tensor with shape [batch, 2 * num_anchors, height, width]. + + bbox_pred : relay.Expr + 4-D tensor with shape [batch, 4 * num_anchors, height, width]. + + im_info : relay.Expr + 2-D tensor with shape [batch, 3]. The last dimension should be in format of + [im_height, im_width, im_scale] + + scales : list/tuple of float + Scales of anchor windoes. + + ratios : list/tuple of float + Ratios of anchor windoes. + + feature_stride : int + The size of the receptive field each unit in the convolution layer of the rpn, for example + the product of all stride's prior to this layer. + + threshold : float + Non-maximum suppression threshold. + + rpn_pre_nms_top_n : int + Number of top scoring boxes to apply NMS. -1 to use all boxes. + + rpn_post_nms_top_n : int + Number of top scoring boxes to keep after applying NMS to RPN proposals. + + rpn_min_size : int + Minimum height or width in proposal. + + iou_loss : bool + Usage of IoU loss. + + Returns + ------- + output : relay.Expr + 2-D tensor with shape [batch * rpn_post_nms_top_n, 5]. The last dimension is in format of + [batch_index, w_start, h_start, w_end, h_end]. + """ + return _make.proposal(cls_prob, bbox_pred, im_info, scales, ratios, feature_stride, threshold, + rpn_pre_nms_top_n, rpn_post_nms_top_n, rpn_min_size, iou_loss) diff --git a/python/tvm/relay/op/vision/yolo.py b/python/tvm/relay/op/vision/yolo.py new file mode 100644 index 000000000000..71b7918dca0f --- /dev/null +++ b/python/tvm/relay/op/vision/yolo.py @@ -0,0 +1,34 @@ +"""Yolo operations.""" +from . import _make + +def yolo_reorg(data, stride): + """Yolo reorg operation used in darknet models. + This layer shuffles the input tensor values based on the stride value. + Along with the shuffling, it does the shape transform. + If '(n, c, h, w)' is the data shape and 's' is stride, output shape is '(n, c*s*s, h/s, w/s)' + Example: data(1, 4, 2, 2) = [[[[ 0 1] [ 2 3]] + [[ 4 5] [ 6 7]] + [[ 8 9] [10 11]] + [[12 13] [14 15]]]] + stride = 2 + ret(1, 16, 1, 1) = [[[[ 0]] [[ 2]] [[ 8]] [[10]] + [[ 1]] [[ 3]] [[ 9]] [[11]] + [[ 4]] [[ 6]] [[12]] [[14]] + [[ 5]] [[ 7]] [[13]] [[15]]]] + + Note: stride=1 has no significance for reorg operation. + + Parameters + ---------- + data : relay.Expr + The input data tensor. + + stride : int + The stride value for reorganisation. + + Returns + ------- + ret : relay.Expr + The computed result. 
+ """ + return _make.yolo_reorg(data, stride) diff --git a/python/tvm/relay/param_dict.py b/python/tvm/relay/param_dict.py new file mode 100644 index 000000000000..f7647beadeb2 --- /dev/null +++ b/python/tvm/relay/param_dict.py @@ -0,0 +1,60 @@ +# pylint: disable=invalid-name +"""Helper utility to save parameter dicts.""" +import tvm + +_save_param_dict = tvm.get_global_func("tvm.relay._save_param_dict") +_load_param_dict = tvm.get_global_func("tvm.relay._load_param_dict") + +def save_param_dict(params): + """Save parameter dictionary to binary bytes. + + The result binary bytes can be loaded by the + GraphModule with API "load_params". + + Parameters + ---------- + params : dict of str to NDArray + The parameter dictionary. + + Returns + ------- + param_bytes: bytearray + Serialized parameters. + + Examples + -------- + .. code-block:: python + + # compile and save the modules to file. + graph, lib, params = tvm.relay.build(func, target=target, params=params) + module = graph_runtime.create(graph, lib, tvm.gpu(0)) + # save the parameters as byte array + param_bytes = tvm.relay.save_param_dict(params) + # We can serialize the param_bytes and load it back later. + # Pass in byte array to module to directly set parameters + module.load_params(param_bytes) + """ + args = [] + for k, v in params.items(): + args.append(k) + args.append(tvm.nd.array(v)) + return _save_param_dict(*args) + + +def load_param_dict(param_bytes): + """Load parameter dictionary to binary bytes. + + Parameters + ---------- + param_bytes: bytearray + Serialized parameters. + + Returns + ------- + params : dict of str to NDArray + The parameter dictionary. + """ + if isinstance(param_bytes, (bytes, str)): + param_bytes = bytearray(param_bytes) + load_arr = _load_param_dict(param_bytes) + return {v.name : v.array for v in load_arr} diff --git a/python/tvm/relay/prelude.py b/python/tvm/relay/prelude.py index 99b6c8d1c766..41d1be284f8e 100644 --- a/python/tvm/relay/prelude.py +++ b/python/tvm/relay/prelude.py @@ -340,7 +340,10 @@ def define_tree_map(self): Match(t, [rose_case]), self.tree(b), [a, b]) def define_tree_size(self): - """Defines a function that computes the size of a tree as a nat.""" + """Defines a function that computes the size of a tree as a nat. + + Signature: fn(t : tree[a]) -> nat + """ self.size = GlobalVar("size") a = TypeVar("a") t = Var("t", self.tree(a)) @@ -351,6 +354,54 @@ def define_tree_size(self): self.mod[self.size] = Function([t], Match(t, [rose_case]), self.nat(), [a]) + def define_id(self): + """Defines a function that return it's argument. + + Signature: fn(x : a) -> a + """ + self.id = GlobalVar("id") + a = TypeVar("a") + x = Var("x", a) + self.mod[self.id] = Function([x], x, a, [a]) + + + def define_compose(self): + """Defines a function that compose two function. + + Signature: fn(f : fn(b) -> c, g : fn(a) -> b) -> fn(a) -> c + """ + self.compose = GlobalVar("compose") + a = TypeVar("a") + b = TypeVar("b") + c = TypeVar("c") + f = Var("f", FuncType([b], c)) + g = Var("g", FuncType([a], b)) + x = Var("x") + self.mod[self.compose] = Function([f, g], + Function([x], f(g(x))), + FuncType([a], c), + [a, b, c]) + + + def define_iterate(self): + """Define a function that take a number n, a function f, + and return a closure that apply f n time on it's argument. 
+ + Signature: fn(n : nat, f : fn(a) -> a) -> fn(a) -> a + """ + self.iterate = GlobalVar("iterate") + a = TypeVar("a") + f = Var("f", FuncType([a], a)) + x = Var("x", self.nat()) + y = Var("y", self.nat()) + z_case = Clause(PatternConstructor(self.z), self.id) + s_case = Clause(PatternConstructor(self.s, [PatternVar(y)]), + self.compose(f, self.iterate(f, y))) + self.mod[self.iterate] = Function([f, x], + Match(x, [z_case, s_case]), + FuncType([a], a), + [a]) + def __init__(self, mod): self.mod = mod self.define_list_adt() @@ -377,3 +428,7 @@ def __init__(self, mod): self.define_tree_adt() self.define_tree_map() self.define_tree_size() + + self.define_id() + self.define_compose() + self.define_iterate() diff --git a/python/tvm/relay/quantize/_annotate.py b/python/tvm/relay/quantize/_annotate.py index 7eb8af57a70b..5daf10284a9d 100644 --- a/python/tvm/relay/quantize/_annotate.py +++ b/python/tvm/relay/quantize/_annotate.py @@ -1,6 +1,7 @@ #pylint: disable=unused-argument """Internal module for registering attribute for annotation.""" from __future__ import absolute_import +import warnings import topi from . import _quantize @@ -118,6 +119,13 @@ def attach_simulated_quantize(data, kind, sign=True, rounding="round"): data, dom_scale, clip_min, clip_max, kind, sign, rounding) +@register_annotate_function("nn.contrib_conv2d_NCHWc") +def conv2d_nchwc_rewrite(ref_call, new_args, ctx): + warnings.warn("NCHWc layout Conv2D detected, please use a lower " + "optimization level before applying the quantization " + "pass as quantization will have no effect here...") + + @register_annotate_function("nn.conv2d") def conv2d_rewrite(ref_call, new_args, ctx): """Rewrite function for conv2d. Lhs of conv will be quantized to @@ -184,6 +192,9 @@ def add_rewrite(ref_call, new_args, ctx): else: # quantize rhs to INPUT field if it is not Constant rhs_expr = attach_simulated_quantize(rhs_expr, QAnnotateKind.INPUT) + if lhs_kind == QAnnotateKind.ACTIVATION and rhs_kind == QAnnotateKind.ACTIVATION: + # quantize rhs to INPUT field if both lhs and rhs are ACTIVATION + rhs_expr = attach_simulated_quantize(rhs_expr, QAnnotateKind.INPUT) expr = _forward_op(ref_call, [lhs_expr, rhs_expr]) return QAnnotateExpr(expr, QAnnotateKind.ACTIVATION) diff --git a/python/tvm/relay/quantize/quantize.py b/python/tvm/relay/quantize/quantize.py index 6756090f14a7..56e0f586fc1f 100644 --- a/python/tvm/relay/quantize/quantize.py +++ b/python/tvm/relay/quantize/quantize.py @@ -58,6 +58,7 @@ class QConfig(NodeBase): "round_for_shift": True, "store_lowbit_output": True, "debug_enabled_ops": None, + "use_stop_fusion": True } # pylint: disable=no-member @@ -129,6 +130,10 @@ def qconfig(**kwargs): Whether to store low-bit integer back as output before dequantizing. Some accelerators need this, e.g. VTA. + use_stop_fusion: boolean + Whether add stop_fusion when casting to dtype_activation. stop_fusion forces lowbit + results to be stored in memory. 
+ Returns ------- config: QConfig diff --git a/python/tvm/relay/testing/inception_v3.py b/python/tvm/relay/testing/inception_v3.py index 491b221fbe0a..7ac3ca35a0bd 100644 --- a/python/tvm/relay/testing/inception_v3.py +++ b/python/tvm/relay/testing/inception_v3.py @@ -29,11 +29,10 @@ def Conv(data, num_filter, kernel=(1, 1), stride=(1, 1), pad=(0, 0), name=None, def Pooling(data, kernel, stride, pad, pool_type, name): if pool_type == 'max': return relay.nn.max_pool2d(data=data, pool_size=kernel, strides=stride, padding=pad) - elif pool_type == 'avg': + if pool_type == 'avg': return relay.nn.avg_pool2d(data=data, pool_size=kernel, strides=stride, padding=pad, count_include_pad=True) - else: - raise ValueError("Invalid pooling type: " + pool_type) + raise ValueError("Invalid pooling type: " + pool_type) def Inception7A(data, num_1x1, diff --git a/python/tvm/relay/ty.py b/python/tvm/relay/ty.py index 1cfa96aa7213..96ade4124a00 100644 --- a/python/tvm/relay/ty.py +++ b/python/tvm/relay/ty.py @@ -172,7 +172,6 @@ def __init__(self, func, args): @register_relay_node class TypeConstraint(Type): """Abstract class representing a type constraint.""" - pass @register_relay_node diff --git a/python/tvm/rpc/client.py b/python/tvm/rpc/client.py index c975ec64aa76..cf2e118f06c4 100644 --- a/python/tvm/rpc/client.py +++ b/python/tvm/rpc/client.py @@ -9,7 +9,7 @@ from . import base from ..contrib import util from .._ffi.base import TVMError -from .._ffi import function as function +from .._ffi import function from .._ffi import ndarray as nd from ..module import load as _load_module diff --git a/python/tvm/rpc/proxy.py b/python/tvm/rpc/proxy.py index cefffbfa9668..7f01fb1b7b02 100644 --- a/python/tvm/rpc/proxy.py +++ b/python/tvm/rpc/proxy.py @@ -389,7 +389,7 @@ def _handler_ready_proxy_mode(self, handler): if key in pool_src: self._pair_up(pool_src.pop(key), handler) return - elif key not in pool_dst: + if key not in pool_dst: pool_dst[key] = handler def cleanup(): """Cleanup client connection if timeout""" diff --git a/python/tvm/rpc/server.py b/python/tvm/rpc/server.py index f0cdd14abce6..2bc1ae7fde92 100644 --- a/python/tvm/rpc/server.py +++ b/python/tvm/rpc/server.py @@ -331,6 +331,15 @@ def __init__(self, if silent: cmd += ["--silent"] + # prexec_fn is not thread safe and may result in deadlock. + # python 3.2 introduced the start_new_session parameter as + # an alternative to the common use case of + # prexec_fn=os.setsid. Once the minimum version of python + # supported by TVM reaches python 3.2 this code can be + # rewritten in favour of start_new_session. In the + # interim, stop the pylint diagnostic. 
+ # + # pylint: disable=subprocess-popen-preexec-fn self.proc = subprocess.Popen(cmd, preexec_fn=os.setsid) time.sleep(0.5) elif not is_proxy: diff --git a/python/tvm/rpc/tornado_util.py b/python/tvm/rpc/tornado_util.py index eafea2e85394..cc0398182a0e 100644 --- a/python/tvm/rpc/tornado_util.py +++ b/python/tvm/rpc/tornado_util.py @@ -95,9 +95,8 @@ def _update_read(self): if msg: self.on_message(msg) return True - else: - # normal close, remote is closed - self.close() + # normal close, remote is closed + self.close() except socket.error as err: if err.args[0] in (errno.EAGAIN, errno.EWOULDBLOCK): pass diff --git a/python/tvm/rpc/tracker.py b/python/tvm/rpc/tracker.py index 1a06ed81ae4f..5644775ca416 100644 --- a/python/tvm/rpc/tracker.py +++ b/python/tvm/rpc/tracker.py @@ -86,7 +86,7 @@ def remove(self, value): value: object The resource to remove """ - pass + def summary(self): """Get summary information of the scheduler.""" diff --git a/python/tvm/schedule.py b/python/tvm/schedule.py index 6c261a453457..e772735b5bfb 100644 --- a/python/tvm/schedule.py +++ b/python/tvm/schedule.py @@ -143,19 +143,16 @@ def vstore(self, begin, value): @register_node class Split(NodeBase): """Split operation on axis.""" - pass @register_node class Fuse(NodeBase): """Fuse operation on axis.""" - pass @register_node class Singleton(NodeBase): """Singleton axis.""" - pass @register_node diff --git a/python/tvm/stmt.py b/python/tvm/stmt.py index 48d91dfa8044..f06958ab78ee 100644 --- a/python/tvm/stmt.py +++ b/python/tvm/stmt.py @@ -381,7 +381,7 @@ def stmt_list(stmt): """ if isinstance(stmt, Block): return stmt_list(stmt.first) + stmt_list(stmt.rest) - elif isinstance(stmt, ProducerConsumer): + if isinstance(stmt, ProducerConsumer): return stmt_list(stmt.body) return [stmt] diff --git a/python/tvm/tensor.py b/python/tvm/tensor.py index e1345ad373bf..a9c862a268cf 100644 --- a/python/tvm/tensor.py +++ b/python/tvm/tensor.py @@ -33,7 +33,6 @@ def dtype(self): @register_node class TensorIntrinCall(NodeBase): """Intermediate structure for calling a tensor intrinsic.""" - pass itervar_cls = None @@ -144,11 +143,10 @@ def input_tensors(self): @register_node class PlaceholderOp(Operation): """Placeholder operation.""" - pass @register_node -class ComputeOp(Operation): +class BaseComputeOp(Operation): """Compute operation.""" @property def axis(self): @@ -162,11 +160,16 @@ def reduce_axis(self): @register_node -class TensorComputeOp(Operation): - """Tensor operation.""" +class ComputeOp(BaseComputeOp): + """Scalar operation.""" pass +@register_node +class TensorComputeOp(BaseComputeOp): + """Tensor operation.""" + + @register_node class ScanOp(Operation): """Scan operation.""" @@ -179,7 +182,7 @@ def scan_axis(self): @register_node class ExternOp(Operation): """Extern operation.""" - pass + @register_node class HybridOp(Operation): @@ -188,3 +191,142 @@ class HybridOp(Operation): def axis(self): """Represent axis of IterVar, also defined when it is a HybridOp""" return self.__getattr__("axis") + + +@register_node +class Layout(NodeBase): + """Layout is composed of upper cases, lower cases and numbers, + where upper case indicates a primal axis and + the corresponding lower case with factor size indicates the subordinate axis. + For example, NCHW16c can describe a 5-D tensor of + [batch_size, channel, height, width, channel_block]. + Here subordinate axis channel_block=16 is the factor size of the primal axis C (channel). + + Do not construct directly, use :any:`layout` instead. 
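+    For example, ``layout("NCHW16c")`` declares the 5-D layout above.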
+ See the documentation of :any:`layout` for more details. + + See Also + -------- + layout : Declare a layout + """ + def __str__(self): + return self.name + + def __repr__(self): + return "Layout(" + self.name + ")" + + def __len__(self): + return _api_internal._LayoutNdim(self) + + def __contains__(self, axis): + return len(axis) == 1 and axis[0].isalpha() and axis[0] in self.name + + def __getitem__(self, index): + if index >= len(self): + raise IndexError("Layout index out of range") + return _api_internal._LayoutGetItem(self, index) + + def index_of(self, axis): + """Get the index of an axis + + Parameters + ---------- + axis : str + The axis name, need to be [a-z,A-Z] + + Returns + ------- + index : int + The index of the axis, -1 if not found. + """ + return _api_internal._LayoutIndexOf(self, axis) + + def factor_of(self, axis): + """Get the factor size of the subordinate axis. + + Parameters + ---------- + axis : str + The axis name, need to be [a-z,A-Z] + + Returns + ------- + factor : int + the size of the subordinate-axis of axis (if axis is a primal-axis), + or the size of axis itself (if axis is a subordinate-axis). + Return -1 if axis is not in the layout. + """ + return _api_internal._LayoutFactorOf(self, axis) + + +@register_node +class BijectiveLayout(NodeBase): + """Bijective mapping for two layouts (src-layout and dst-layout). + It provides shape and index conversion between each other. + + Do not construct directly, use :any:`bijective_layout` instead. + See the documentation of :any:`bijective_layout` for more details. + + See Also + -------- + bijective_layout : Declare a bijective layout converter + """ + def forward_index(self, index): + """Given the indices of the src-layout, infer the dst index. + + Parameters + ---------- + index: Array of Expr + The indices in src-layout. + + Returns + ------- + dst_index: Array of Expr + The inferred indices in dst-layout. + """ + return _api_internal._BijectiveLayoutForwardIndex(self, index) + + def backward_index(self, index): + """Given the indices of the dst-layout, infer the src index. + + Parameters + ---------- + index: Array of Expr + The indices in dst-layout. + + Returns + ------- + src_index: Array of Expr + The inferred indices in src-layout. + """ + return _api_internal._BijectiveLayoutBackwardIndex(self, index) + + def forward_shape(self, shape): + """Given the shape of the src-layout, infer the dst shape. + + Parameters + ---------- + shape: Array of Expr + The shape in src-layout. + + Returns + ------- + dst_shape: Array of Expr + The inferred shape in dst-layout. + """ + return _api_internal._BijectiveLayoutForwardShape(self, shape) + + def backward_shape(self, shape): + """Given the shape of the dst-layout, infer the src shape. + + Parameters + ---------- + shape: Array of Expr + The shape in dst-layout. + + Returns + ------- + src_shape: Array of Expr + The inferred shape in src-layout. 
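+
+        For example, for a bijection between "NCHW" and "NCHW16c",
+        a dst shape of (1, 2, 224, 224, 16) maps back to the
+        src shape (1, 32, 224, 224).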
+ """ + return _api_internal._BijectiveLayoutBackwardShape(self, shape) diff --git a/rust/frontend/README.md b/rust/frontend/README.md index 5bd4362aefc4..9f46cf760c91 100644 --- a/rust/frontend/README.md +++ b/rust/frontend/README.md @@ -215,5 +215,5 @@ fn main() { .unwrap(); assert_eq!(ret, 14f64); - } +} ``` diff --git a/src/api/api_arith.cc b/src/api/api_arith.cc index 31ff5ccb3a15..cc7d814617a9 100644 --- a/src/api/api_arith.cc +++ b/src/api/api_arith.cc @@ -26,11 +26,6 @@ TVM_REGISTER_API("arith.intset_interval") *ret = IntSet::interval(args[0], args[1]); }); -TVM_REGISTER_API("arith.EvalModular") -.set_body([](TVMArgs args, TVMRetValue *ret) { - *ret = EvalModular(args[0], Map()); - }); - TVM_REGISTER_API("arith.DetectLinearEquation") .set_body([](TVMArgs args, TVMRetValue *ret) { *ret = DetectLinearEquation(args[0], args[1]); @@ -75,5 +70,63 @@ TVM_REGISTER_API("_IntSetIsEverything") *ret = args[0].operator IntSet().is_everything(); }); +TVM_REGISTER_API("arith._make_ConstIntBound") +.set_body([](TVMArgs args, TVMRetValue* ret) { + *ret = ConstIntBoundNode::make(args[0], args[1]); + }); + +TVM_REGISTER_API("arith._make_ModularSet") +.set_body([](TVMArgs args, TVMRetValue* ret) { + *ret = ModularSetNode::make(args[0], args[1]); + }); + +TVM_REGISTER_API("arith._CreateAnalyzer") +.set_body([](TVMArgs args, TVMRetValue* ret) { + using runtime::PackedFunc; + using runtime::TypedPackedFunc; + auto self = std::make_shared(); + auto f = [self](std::string name) -> PackedFunc { + if (name == "const_int_bound") { + return PackedFunc([self](TVMArgs args, TVMRetValue *ret) { + *ret = self->const_int_bound(args[0]); + }); + } else if (name == "modular_set") { + return PackedFunc([self](TVMArgs args, TVMRetValue *ret) { + *ret = self->modular_set(args[0]); + }); + } else if (name == "const_int_bound_update") { + return PackedFunc([self](TVMArgs args, TVMRetValue *ret) { + self->const_int_bound.Update(args[0], args[1], args[2]); + }); + } else if (name == "rewrite_simplify") { + return PackedFunc([self](TVMArgs args, TVMRetValue *ret) { + *ret = self->rewrite_simplify(args[0]); + }); + } else if (name == "bind") { + return PackedFunc([self](TVMArgs args, TVMRetValue *ret) { + auto& sptr = args[1].node_sptr(); + if (sptr->is_type()) { + self->Bind(args[0], args[1].operator Range()); + } else { + self->Bind(args[0], args[1].operator Expr()); + } + }); + } else if (name == "enter_constraint_context") { + return PackedFunc([self](TVMArgs args, TVMRetValue *ret) { + // can't use make_shared due to noexcept(false) decl in destructor, + // see https://stackoverflow.com/a/43907314 + auto ctx = + std::shared_ptr(new ConstraintContext(self.get(), args[0])); + auto fexit = [ctx](TVMArgs, TVMRetValue*) mutable { + ctx.reset(); + }; + *ret = PackedFunc(fexit); + }); + } + return PackedFunc(); + }; + *ret = TypedPackedFunc(f); +}); + } // namespace arith } // namespace tvm diff --git a/src/api/api_ir.cc b/src/api/api_ir.cc index fa2d52e9fe85..a4c7842ffe90 100644 --- a/src/api/api_ir.cc +++ b/src/api/api_ir.cc @@ -5,9 +5,8 @@ */ #include #include -#include #include -#include +#include namespace tvm { namespace ir { diff --git a/src/api/api_lang.cc b/src/api/api_lang.cc index e30111e938bd..50f81644b0b5 100644 --- a/src/api/api_lang.cc +++ b/src/api/api_lang.cc @@ -11,6 +11,7 @@ #include #include #include +#include namespace tvm { @@ -224,6 +225,63 @@ TVM_REGISTER_API("_BufferVStore") .vstore(args[1], args[2]); }); +TVM_REGISTER_API("_Layout") +.set_body([](TVMArgs args, TVMRetValue* ret) { + *ret = 
LayoutNode::make(args[0]); + }); + +TVM_REGISTER_API("_LayoutIndexOf") +.set_body([](TVMArgs args, TVMRetValue* ret) { + *ret = args[0].operator Layout() + .IndexOf(LayoutAxis::make(args[1])); +}); + +TVM_REGISTER_API("_LayoutFactorOf") +.set_body([](TVMArgs args, TVMRetValue* ret) { + *ret = args[0].operator Layout() + .FactorOf(LayoutAxis::make(args[1])); +}); + +TVM_REGISTER_API("_LayoutNdim") +.set_body([](TVMArgs args, TVMRetValue* ret) { + *ret = static_cast(args[0].operator Layout().ndim()); +}); + +TVM_REGISTER_API("_LayoutGetItem") +.set_body([](TVMArgs args, TVMRetValue* ret) { + const LayoutAxis& axis = args[0].operator Layout()[args[1]]; + *ret = axis.name(); +}); + +TVM_REGISTER_API("_BijectiveLayout") +.set_body([](TVMArgs args, TVMRetValue* ret) { + *ret = BijectiveLayoutNode::make(args[0], args[1]); + }); + +TVM_REGISTER_API("_BijectiveLayoutForwardIndex") +.set_body([](TVMArgs args, TVMRetValue* ret) { + *ret = args[0].operator BijectiveLayout() + .ForwardIndex(args[1]); + }); + +TVM_REGISTER_API("_BijectiveLayoutBackwardIndex") +.set_body([](TVMArgs args, TVMRetValue* ret) { + *ret = args[0].operator BijectiveLayout() + .BackwardIndex(args[1]); + }); + +TVM_REGISTER_API("_BijectiveLayoutForwardShape") +.set_body([](TVMArgs args, TVMRetValue* ret) { + *ret = args[0].operator BijectiveLayout() + .ForwardShape(args[1]); + }); + +TVM_REGISTER_API("_BijectiveLayoutBackwardShape") +.set_body([](TVMArgs args, TVMRetValue* ret) { + *ret = args[0].operator BijectiveLayout() + .BackwardShape(args[1]); + }); + TVM_REGISTER_API("_Tensor") .set_body([](TVMArgs args, TVMRetValue* ret) { *ret = TensorNode::make(args[0], diff --git a/src/arithmetic/analyzer.cc b/src/arithmetic/analyzer.cc new file mode 100644 index 000000000000..81195eba2747 --- /dev/null +++ b/src/arithmetic/analyzer.cc @@ -0,0 +1,51 @@ +/*! + * Copyright (c) 2019 by Contributors + * \file tvm/arithmetic/analyzer.cc + */ +#include +#include + +namespace tvm { +namespace arith { + +Analyzer::Analyzer() + : const_int_bound(this), + modular_set(this), + rewrite_simplify(this) { +} + +void Analyzer::Bind(const VarExpr& v, const Expr& expr) { + Var var(v.node_); + this->const_int_bound.Update(var, this->const_int_bound(expr)); + this->modular_set.Update(var, this->modular_set(expr)); + this->rewrite_simplify.Update(var, this->rewrite_simplify(expr)); +} + +void Analyzer::Bind(const VarExpr& v, const Range& range) { + Var var(v.node_); + this->const_int_bound.Bind(var, range); + // skip modular_set + // skip rewrite simplify +} + +ConstraintContext::ConstraintContext(Analyzer* analyzer, const Expr& constraint) { + // entering the scope. + auto f0 = analyzer->const_int_bound.EnterConstraint(constraint); + auto f1 = analyzer->modular_set.EnterConstraint(constraint); + // recovery function. + exit_ = [f0, f1]() { + if (f1 != nullptr) f1(); + if (f0 != nullptr) f0(); + }; +} + +bool Analyzer::CanProveGreaterEqual(const Expr& expr, int64_t lower_bound) { + if (const auto* ptr = expr.as()) { + return ptr->value > lower_bound; + } + auto bd = this->const_int_bound(this->rewrite_simplify(expr)); + if (bd->min_value >= lower_bound) return true; + return false; +} +} // namespace arith +} // namespace tvm diff --git a/src/arithmetic/const_fold.h b/src/arithmetic/const_fold.h new file mode 100644 index 000000000000..4c247c8a7b59 --- /dev/null +++ b/src/arithmetic/const_fold.h @@ -0,0 +1,291 @@ +/*! + * Copyright (c) 2019 by Contributors + * \file const_fold.h + * \brief Centralized location for constant folding. 
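+ *
+ * Illustrative sketch of the intended use (types as in this header):
+ * \code
+ *   Expr a = make_const(Int(32), 2);
+ *   Expr b = make_const(Int(32), 3);
+ *   Expr c = TryConstFold<ir::Add>(a, b);  // IntImm(5)
+ *   // a null Expr is returned when the operands cannot be folded
+ * \endcode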
+ */ +#ifndef TVM_ARITHMETIC_CONST_FOLD_H_ +#define TVM_ARITHMETIC_CONST_FOLD_H_ + +#include +#include + +namespace tvm { +namespace arith { + +/*! + * \brief Try to run binary compute with constant folding. + * + * \param a The left operand. + * \param b The right operand. + * \tparam Op The operator type. + * + * \note a and b Must already matched data types with each other. + * \return nullptr if constant fold fails, otherwise return folded result. + */ +template +inline Expr TryConstFold(Expr a, Expr b) { + return Expr(); +} + +/*! + * \brief Try to run unary compute with constant folding. + * + * \param a The left operand. + * \tparam Op The operator type. + * + * \note a and b Must already matched data types with each other. + * \return nullptr if constant fold fails, otherwise return folded result. + */ +template +inline Expr TryConstFold(Expr a); + +/*! + * \brief Check whether type is used to represent index. + * + * Index types are frequently used in shape computation + * and need to be aggressively constant-folded. + * + * \param type The type to represent index. + * \return the checked result. + */ +inline bool IsIndexType(const Type& type) { + return type.is_int() && type.lanes() == 1 && + (type.bits() == 32 || type.bits() == 64); +} + + +#define TVM_ARITH_CONST_PROPAGATION(BODY) \ + using ir::IntImm; \ + using ir::UIntImm; \ + using ir::FloatImm; \ + const IntImm* pa = a.as(); \ + const IntImm* pb = b.as(); \ + const FloatImm* fa = a.as(); \ + const FloatImm* fb = b.as(); \ + BODY; + + +#define TVM_INDEX_CONST_PROPAGATION(BODY) \ + using ir::IntImm; \ + using ir::UIntImm; \ + const IntImm* pa = a.as(); \ + const IntImm* pb = b.as(); \ + const Type& ta = a.type(); \ + const Type& tb = b.type(); \ + if (arith::IsIndexType(ta) && arith::IsIndexType(tb)) { \ + BODY; \ + } \ + + +// specialization of constant folders. +template<> +inline Expr TryConstFold(Expr a, Expr b) { + TVM_ARITH_CONST_PROPAGATION({ + const Type& rtype = a.type(); + if (pa && pb) return IntImm::make(rtype, pa->value + pb->value); + if (pa && pa->value == 0) return b; + if (pb && pb->value == 0) return a; + if (fa && fb) return FloatImm::make(rtype, fa->value + fb->value); + if (fa && fa->value == 0) return b; + if (fb && fb->value == 0) return a; + }); + return Expr(); +} + +template<> +inline Expr TryConstFold(Expr a, Expr b) { + TVM_ARITH_CONST_PROPAGATION({ + const Type& rtype = a.type(); + if (pa && pb) return IntImm::make(rtype, pa->value - pb->value); + if (pb && pb->value == 0) return a; + if (fa && fb) return FloatImm::make(rtype, fa->value - fb->value); + if (fb && fb->value == 0) return a; + }); + return Expr(); +} + +template<> +inline Expr TryConstFold(Expr a, Expr b) { + TVM_ARITH_CONST_PROPAGATION({ + const Type& rtype = a.type(); + if (pa && pb) return IntImm::make(rtype, pa->value * pb->value); + if (pa) { + if (pa->value == 1) return b; + if (pa->value == 0) return a; + } + if (pb) { + if (pb->value == 1) return a; + if (pb->value == 0) return b; + } + if (fa && fb) return FloatImm::make(rtype, fa->value * fb->value); + if (fa) { + if (fa->value == 1) return b; + if (fa->value == 0) return a; + } + if (fb) { + if (fb->value == 1) return a; + if (fb->value == 0) return b; + } + }); + return Expr(); +} + +template<> +inline Expr TryConstFold(Expr a, Expr b) { + TVM_ARITH_CONST_PROPAGATION({ + const Type& rtype = a.type(); + // due to division and mod can have different modes + // only constant fold positive number where rule is fixed. 
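+    // e.g. truncated division gives (-7) / 2 == -3 while floor division
+    // gives -4, so folding negative operands would bake in one rounding mode.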
+ if (pa && pb && pa->value >= 0 && pb->value > 0) { + return IntImm::make(rtype, pa->value / pb->value); + } + if (pa) { + if (pa->value == 0) return a; + } + if (pb) { + if (pb->value == 1) return a; + CHECK_NE(pb->value, 0) << "Divide by zero"; + } + if (fa && fb && fb->value != 0) { + return FloatImm::make(rtype, fa->value / fb->value); + } + if (fa && fa->value == 0) return a; + if (fb) { + if (fb->value == 1) return a; + CHECK_NE(fb->value, 0) << "Divide by zero"; + } + }); + return Expr(); +} + +template<> +inline Expr TryConstFold(Expr a, Expr b) { + TVM_INDEX_CONST_PROPAGATION({ + const Type& rtype = a.type(); + // due to division and mod can have different modes + // only constant fold positive number where rule is fixed. + if (pa && pb && pa->value >= 0 && pb->value > 0) { + return IntImm::make(rtype, pa->value % pb->value); + } + if (pa) { + if (pa->value == 0) return a; + } + if (pb) { + if (pb->value == 1) return make_zero(rtype); + CHECK_NE(pb->value, 0) << "Divide by zero"; + } + }); + return Expr(); +} + +template<> +inline Expr TryConstFold(Expr a, Expr b) { + TVM_ARITH_CONST_PROPAGATION({ + const Type& rtype = a.type(); + if (pa && pb) return IntImm::make(rtype, std::min(pa->value, pb->value)); + if (fa && fb) return FloatImm::make(rtype, std::min(fa->value, fb->value)); + }); + return Expr(); +} + +template<> +inline Expr TryConstFold(Expr a, Expr b) { + TVM_ARITH_CONST_PROPAGATION({ + const Type& rtype = a.type(); + if (pa && pb) return IntImm::make(rtype, std::max(pa->value, pb->value)); + if (fa && fb) return FloatImm::make(rtype, std::max(fa->value, fb->value)); + }); + return Expr(); +} + +template<> +inline Expr TryConstFold(Expr a, Expr b) { + TVM_ARITH_CONST_PROPAGATION({ + if (pa && pb) return UIntImm::make(UInt(1), pa->value > pb->value); + if (fa && fb) return UIntImm::make(UInt(1), fa->value > fb->value); + }); + return Expr(); +} + +template<> +inline Expr TryConstFold(Expr a, Expr b) { + TVM_ARITH_CONST_PROPAGATION({ + if (pa && pb) return UIntImm::make(UInt(1), pa->value >= pb->value); + if (fa && fb) return UIntImm::make(UInt(1), fa->value >= fb->value); + }); + return Expr(); +} + +template<> +inline Expr TryConstFold(Expr a, Expr b) { + TVM_ARITH_CONST_PROPAGATION({ + if (pa && pb) return UIntImm::make(UInt(1), pa->value < pb->value); + if (fa && fb) return UIntImm::make(UInt(1), fa->value < fb->value); + }); + return Expr(); +} + +template<> +inline Expr TryConstFold(Expr a, Expr b) { + TVM_ARITH_CONST_PROPAGATION({ + if (pa && pb) return UIntImm::make(UInt(1), pa->value <= pb->value); + if (fa && fb) return UIntImm::make(UInt(1), fa->value <= fb->value); + }); + return Expr(); +} + +template<> +inline Expr TryConstFold(Expr a, Expr b) { + TVM_ARITH_CONST_PROPAGATION({ + if (pa && pb) return UIntImm::make(UInt(1), pa->value == pb->value); + if (fa && fb) return UIntImm::make(UInt(1), fa->value == fb->value); + }); + return Expr(); +} + +template<> +inline Expr TryConstFold(Expr a, Expr b) { + TVM_ARITH_CONST_PROPAGATION({ + if (pa && pb) return UIntImm::make(UInt(1), pa->value != pb->value); + if (fa && fb) return UIntImm::make(UInt(1), fa->value != fb->value); + }); + return Expr(); +} + +template<> +inline Expr TryConstFold(Expr a, Expr b) { + using ir::UIntImm; + const UIntImm* pa = a.as(); + const UIntImm* pb = b.as(); + if (pa && pa->value) return b; + if (pa && !pa->value) return a; + if (pb && pb->value) return a; + if (pb && !pb->value) return b; + return Expr(); +} + +template<> +inline Expr TryConstFold(Expr a, Expr b) { + using ir::UIntImm; 
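+  // boolean immediates are UIntImm with 1 bit: a true operand dominates
+  // (x || true == true) and a false operand is the identity.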
+ const UIntImm* pa = a.as(); + const UIntImm* pb = b.as(); + if (pa && pa->value) return a; + if (pa && !pa->value) return b; + if (pb && pb->value) return b; + if (pb && !pb->value) return a; + return Expr(); +} + +template<> +inline Expr TryConstFold(Expr a) { + using ir::UIntImm; + const UIntImm* pa = a.as(); + if (pa) { + return UIntImm::make(UInt(1), !(pa->value)); + } + return Expr(); +} + +} // namespace arith +} // namespace tvm +#endif // TVM_ARITHMETIC_CONST_FOLD_H_ diff --git a/src/arithmetic/const_int_bound.cc b/src/arithmetic/const_int_bound.cc new file mode 100644 index 000000000000..c83be8933b55 --- /dev/null +++ b/src/arithmetic/const_int_bound.cc @@ -0,0 +1,393 @@ +/*! + * Copyright (c) 2019 by Contributors + * \file tvm/arithmetic/const_int_bound.cc + */ +#include +#include +#include +#include "int_op_overflow.h" + +namespace tvm { +namespace arith { + +using namespace ir; + +TVM_REGISTER_NODE_TYPE(ConstIntBoundNode); + +ConstIntBound ConstIntBoundNode::make( + int64_t min_value, int64_t max_value) { + auto node = make_node(); + node->min_value = min_value; + node->max_value = max_value; + return ConstIntBound(node); +} + +TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable) +.set_dispatch([](const ConstIntBoundNode *op, IRPrinter *p) { + p->stream << "ConstIntBound" + << "[" << op->min_value << ", " + << op->max_value << ']'; + }); + +// internal entry for const int bound +struct ConstIntBoundAnalyzer::Entry { + int64_t min_value; + int64_t max_value; + + bool is_const(int64_t value) const { + return min_value == max_value && min_value == value; + } +}; + +class ConstIntBoundAnalyzer::Impl : + public ExprFunctor { + public: + void Bind(const Var& var, const Range& range) { + Entry a = VisitExpr(range->min); + Entry b = VisitExpr(range->extent); + Entry ret; + ret.min_value = a.min_value; + ret.max_value = InfAwareAdd(a.max_value, InfAwareAdd(b.max_value, -1)); + Update(var, ret, false); + } + + void Update(const Var& var, + const Entry& info, + bool override) { + if (!override) { + CHECK(!var_map_.count(var)); + } + var_map_[var] = info; + } + + void Update(const Var& var, + const ConstIntBound& info, + bool override) { + Update(var, MakeBound(info->min_value, info->max_value), override); + } + + // Override visitor behaviors + Entry VisitExprDefault_(const Node* op) final { + return Everything( + static_cast(op)->type); + } + + Entry VisitExpr_(const Cast* op) final { + Entry a = VisitExpr(op->value); + Entry b = Everything(op->type); + return Intersect(a, b); + } + + Entry VisitExpr_(const IntImm* op) final { + return MakeBound(op->value, op->value); + } + + Entry VisitExpr_(const UIntImm* op) final { + if (op->value <= static_cast(kPosInf)) { + return MakeBound(op->value, op->value); + } else { + return Everything(op->type); + } + } + + Entry VisitExpr_(const Add* op) final { + Entry a = VisitExpr(op->a); + Entry b = VisitExpr(op->b); + Entry ret; + ret.min_value = InfAwareAdd(a.min_value, b.min_value); + ret.max_value = InfAwareAdd(a.max_value, b.max_value); + return ret; + } + + Entry VisitExpr_(const Sub* op) final { + Entry a = VisitExpr(op->a); + Entry b = VisitExpr(op->b); + Entry ret; + ret.min_value = InfAwareAdd(a.min_value, -b.max_value); + ret.max_value = InfAwareAdd(a.max_value, -b.min_value); + return ret; + } + + Entry VisitExpr_(const Mul* op) final { + Entry a = VisitExpr(op->a); + Entry b = VisitExpr(op->b); + return BinaryOpBoundry(a, b, InfAwareMul); + } + + Entry VisitExpr_(const Div* op) final { + Entry a = VisitExpr(op->a); + Entry b = VisitExpr(op->b); + 
CHECK(!b.is_const(0)) << "divide by zero";
+    // assume no division by 0
+    if (b.min_value == 0) b.min_value = 1;
+    if (b.max_value == 0) b.max_value = -1;
+    return BinaryOpBoundry(a, b, InfAwareDiv);
+  }
+
+  Entry VisitExpr_(const Mod* op) final {
+    Entry a = VisitExpr(op->a);
+    Entry b = VisitExpr(op->b);
+    if (b.min_value > 0) {
+      int64_t b_max_cap = InfAwareAdd(b.max_value, -1);
+      if (a.min_value >= 0) {
+        // 0 <= [a_min, a_max] < b_min
+        if (a.max_value < b.min_value) return a;
+        // otherwise the result can get arbitrarily close to 0
+        return MakeBound(0,
+                         std::min(a.max_value, b_max_cap));
+      } else {
+        return MakeBound(std::max(a.min_value, -b_max_cap),
+                         std::min(a.max_value, b_max_cap));
+      }
+    } else {
+      CHECK(!b.is_const(0)) << "mod by zero";
+      // mod by a negative value is rare,
+      // so we just use the simplest rule.
+      return Everything(op->type);
+    }
+  }
+
+  Entry VisitExpr_(const Min* op) final {
+    Entry a = VisitExpr(op->a);
+    Entry b = VisitExpr(op->b);
+    Entry ret;
+    ret.min_value = std::min(a.min_value, b.min_value);
+    ret.max_value = std::min(a.max_value, b.max_value);
+    return ret;
+  }
+
+  Entry VisitExpr_(const Max* op) final {
+    Entry a = VisitExpr(op->a);
+    Entry b = VisitExpr(op->b);
+    Entry ret;
+    ret.min_value = std::max(a.min_value, b.min_value);
+    ret.max_value = std::max(a.max_value, b.max_value);
+    return ret;
+  }
+
+  Entry VisitExpr_(const Select* op) final {
+    Entry a = VisitExpr(op->true_value);
+    Entry b = VisitExpr(op->false_value);
+    return Union(a, b);
+  }
+
+  Entry VisitExpr_(const Call* op) final {
+    // we only specially handle >> and &, which can be
+    // used for index calculation.
+    if (op->is_intrinsic(Call::shift_right)) {
+      return VisitRightShift(op);
+    } else if (op->is_intrinsic(Call::bitwise_and)) {
+      return VisitBitwiseAnd(op);
+    } else {
+      return Everything(op->type);
+    }
+  }
+
+  Entry VisitExpr_(const Variable* op) final {
+    Var v = GetRef<Var>(op);
+    auto it = var_map_.find(v);
+    if (it != var_map_.end()) {
+      return it->second;
+    } else {
+      return Everything(op->type);
+    }
+  }
+
+  Entry VisitRightShift(const Call* op) {
+    Entry a = VisitExpr(op->args[0]);
+    Entry b = VisitExpr(op->args[1]);
+    return BinaryOpBoundry(a, b, InfAwareRightShift);
+  }
+
+  Entry VisitBitwiseAnd(const Call* op) {
+    Entry a = VisitExpr(op->args[0]);
+    Entry b = VisitExpr(op->args[1]);
+    // handle the positive index case.
+    if (a.min_value >= 0 && b.min_value >= 0) {
+      return MakeBound(0, std::min(a.max_value, b.max_value));
+    } else {
+      if (b.min_value >= 0) {
+        return MakeBound(0, b.max_value);
+      }
+      if (a.min_value >= 0) {
+        return MakeBound(0, a.max_value);
+      }
+      return Everything(op->type);
+    }
+  }
+
+ private:
+  // internal variable map
+  std::unordered_map var_map_;
+  // constants: these limit values mean unlimited
+  // NOTE: kNegInf/kPosInf are used to represent infinity.
+  static const constexpr int64_t kNegInf = ConstIntBoundNode::kNegInf;
+  static const constexpr int64_t kPosInf = ConstIntBoundNode::kPosInf;
+  static_assert(-kNegInf == kPosInf, "invariant of inf");
+  // internal helper functions
+  /*!
+   * \brief Get the boundary of a binary op that is monotonic w.r.t. each argument.
+   * \param a The entry of the left operand.
+   * \param b The entry of the right operand.
+   * \param op The operator.
+   * \tparam F the operator function type.
+   * \return The result.
+   */
+  template<typename F>
+  static Entry BinaryOpBoundry(Entry a, Entry b, const F& op) {
+    Entry ret;
+    // The extreme values of the result must occur at a combination of
+    // the operands' boundary points.
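+    // e.g. for [-2, 3] * [4, 5] the four corner products are
+    // {-8, -10, 12, 15}, giving the result bound [-10, 15].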
+ int64_t v1 = op(a.min_value, b.min_value); + int64_t v2 = op(a.max_value, b.max_value); + int64_t v3 = op(a.min_value, b.max_value); + int64_t v4 = op(a.max_value, b.min_value); + ret.min_value = std::min(std::min(std::min(v1, v2), v3), v4); + ret.max_value = std::max(std::max(std::max(v1, v2), v3), v4); + return ret; + } + /*! + * \brief Compute x + y, aware of inf. + * \param x The left operand. + * \param y The right operand. + * \return the result. + */ + static int64_t InfAwareAdd(int64_t x, int64_t y) { + if (x == kPosInf) { + CHECK(y != kNegInf); + return kPosInf; + } + if (x == kNegInf) { + CHECK(y != kPosInf); + return kNegInf; + } + if (y == kPosInf || y == kNegInf) return y; + if (WillOverflow(x, y, kNegInf, kPosInf)) { + if (x > 0) return kPosInf; + return kNegInf; + } + return x + y; + } + /*! + * \brief Compute x * y, aware of inf. + * \param x The left operand. + * \param y The right operand. + * \return the result. + */ + static int64_t InfAwareMul(int64_t x, int64_t y) { + if (!WillOverflow(x, y, kNegInf, kPosInf)) return x * y; + if ((x > 0 && y > 0) || (x < 0 && y < 0)) return kPosInf; + return kNegInf; + } + /*! + * \brief Compute x / y, aware of inf. + * \param x The left operand. + * \param y The right operand. + * \return the result. + */ + static int64_t InfAwareDiv(int64_t x, int64_t y) { + CHECK_NE(y, 0); + if (x == kPosInf || x == kNegInf) { + if (y > 0) return x; + return -x; + } + return x / y; + } + /*! + * \brief Compute x / y, aware of inf. + * \param x The left operand. + * \param y The right operand. + * \return the result. + */ + static int64_t InfAwareRightShift(int64_t x, int64_t y) { + if (x == kPosInf || x == kNegInf) return x; + return x >> y; + } + /*! + * \brief Make a new bound entry. + */ + static Entry MakeBound(int64_t min_value, int64_t max_value) { + Entry e; + e.min_value = min_value; + e.max_value = max_value; + return e; + } + /*! + * \brief Create union of two sets. + * \param a The left operand. + * \param b the right operand. + */ + static Entry Union(Entry a, Entry b) { + Entry ret; + ret.min_value = std::min(a.min_value, b.min_value); + ret.max_value = std::max(a.max_value, b.max_value); + return ret; + } + /*! + * \brief Create intersect of two sets. + * \param a The left operand. + * \param b the right operand. + */ + static Entry Intersect(Entry a, Entry b) { + Entry ret; + ret.min_value = std::max(a.min_value, b.min_value); + ret.max_value = std::min(a.max_value, b.max_value); + return ret; + } + /*! + * \brief return everything dtype can represent. + * \param dtype The data type. + * \return Bound that represent everything dtype can represent. 
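+   * \note For example, UInt(8) yields [0, 255] and Int(8) yields [-128, 127].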
+ */ + static Entry Everything(Type dtype) { + if (!dtype.is_int() && !dtype.is_uint()) { + return MakeBound(kNegInf, kPosInf); + } + Entry ret; + int64_t vbits = dtype.bits() - static_cast(dtype.is_int()); + if (dtype.is_uint()) { + ret.min_value = 0; + } else { + if (vbits >= 63) { + ret.min_value = kNegInf; + } else { + ret.min_value = -(static_cast(1) << vbits); + } + } + if (vbits >= 63) { + ret.max_value = kPosInf; + } else { + ret.max_value = (static_cast(1) << vbits) - 1; + } + return ret; + } +}; + +ConstIntBound ConstIntBoundAnalyzer::operator()(const Expr& expr) { + Entry ret = impl_->VisitExpr(expr); + return ConstIntBoundNode::make(ret.min_value, ret.max_value); +} + +void ConstIntBoundAnalyzer::Update(const Var& var, + const ConstIntBound& info, + bool override) { + impl_->Update(var, info, override); +} + +void ConstIntBoundAnalyzer::Bind(const Var& var, const Range& range) { + impl_->Bind(var, range); +} + +std::function ConstIntBoundAnalyzer::EnterConstraint(const Expr& constraint) { + return nullptr; +} + +ConstIntBoundAnalyzer::ConstIntBoundAnalyzer(Analyzer* parent) + : impl_(new Impl()) { +} + +ConstIntBoundAnalyzer::~ConstIntBoundAnalyzer() { + delete impl_; +} + +} // namespace arith +} // namespace tvm diff --git a/src/arithmetic/int_op_overflow.h b/src/arithmetic/int_op_overflow.h new file mode 100644 index 000000000000..ef637b4b9521 --- /dev/null +++ b/src/arithmetic/int_op_overflow.h @@ -0,0 +1,78 @@ +/*! + * Copyright (c) 2019 by Contributors + * \file int_op_overflow.h + * \brief Utility functions to detect if an integer op will overflow. + */ +#ifndef TVM_ARITHMETIC_INT_OP_OVERFLOW_H_ +#define TVM_ARITHMETIC_INT_OP_OVERFLOW_H_ + +#include + +namespace tvm { +namespace arith { + +/*! + * \brief Check if an integer op with operand x, y will overflow. + * \param x The left operand. + * \param y The left operand. + * \param min_value The minimum value of the domain. + * \param max_value The maximum value of the domain. + * \return Whether overflow can happen. + * \tparam Op The integer operator. 
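+ *
+ * \code
+ *   // sketch: adding two large positive values overflows the int64 domain
+ *   bool ovf = WillOverflow<ir::Add>(
+ *       int64_t(1) << 62, int64_t(1) << 62,
+ *       std::numeric_limits<int64_t>::min(),
+ *       std::numeric_limits<int64_t>::max());  // true
+ * \endcode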
+ */ +template +inline bool WillOverflow(int64_t x, + int64_t y, + int64_t min_value, + int64_t max_value) { + return false; +} + +template<> +bool WillOverflow(int64_t x, + int64_t y, + int64_t min_value, + int64_t max_value) { + if ((y > 0) && (x > max_value - y)) return true; + if ((y < 0) && (x < min_value - y)) return true; + return false; +} + +template<> +bool WillOverflow(int64_t x, + int64_t y, + int64_t min_value, + int64_t max_value) { + if ((y > 0) && (x < min_value + y)) return true; + if ((y < 0) && (x > max_value + y)) return true; + return false; +} + +template<> +bool WillOverflow(int64_t x, + int64_t y, + int64_t min_value, + int64_t max_value) { + if (y == 0) return false; + if (y > 0) { + if (x < min_value / y) return true; + if (x > max_value / y) return true; + } else { + if (y == -1 && x == std::numeric_limits::min()) return true; + if (x > min_value / y) return true; + if (x < max_value / y) return true; + } + return false; +} + +template<> +bool WillOverflow(int64_t x, + int64_t y, + int64_t min_value, + int64_t max_value) { + return y == 0; +} + +} // namespace arith +} // namespace tvm +#endif // TVM_ARITHMETIC_INT_OP_OVERFLOW_H_ diff --git a/src/arithmetic/int_set.cc b/src/arithmetic/int_set.cc index 1136cf0b1206..ed6e55904cdd 100644 --- a/src/arithmetic/int_set.cc +++ b/src/arithmetic/int_set.cc @@ -531,6 +531,11 @@ class IntSetEvaluator : CHECK(eval_vec_); return Eval(op->value); } + IntSet VisitExpr_(const Select* op, const Expr& e) final { + IntSet true_set = this->Eval(op->true_value); + IntSet false_set = this->Eval(op->false_value); + return Union({false_set, true_set}); + } IntSet VisitExprDefault_(const Node* op, const Expr& e) final { LOG(WARNING) << "cannot evaluate set type " << e->type_key(); return IntSet::everything(); diff --git a/src/arithmetic/int_set_internal.h b/src/arithmetic/int_set_internal.h index e28fe2a9d958..cc2a4c307997 100644 --- a/src/arithmetic/int_set_internal.h +++ b/src/arithmetic/int_set_internal.h @@ -54,23 +54,6 @@ struct StrideSet : public IntSetNode { TVM_DECLARE_NODE_TYPE_INFO(StrideSet, IntSetNode); }; -/*! - * \brief Set represented by range of ModularEntry. - * Used for front-end modular analysis. - */ -struct ModularSet : public IntSetNode { - /*! \brief Internal modular entry */ - ModularEntry e; - - void VisitAttrs(AttrVisitor* v) final { - v->Visit("base", &(e.base)); - v->Visit("coeff", &(e.coeff)); - } - static constexpr const char* _type_key = "ModularSet"; - TVM_DECLARE_NODE_TYPE_INFO(ModularSet, IntSetNode); -}; - - } // namespace arith } // namespace tvm diff --git a/src/arithmetic/modular.cc b/src/arithmetic/modular.cc deleted file mode 100644 index d79300eb7782..000000000000 --- a/src/arithmetic/modular.cc +++ /dev/null @@ -1,168 +0,0 @@ -/*! - * Copyright (c) 2017 by Contributors - * \file modular.cc - * \brief Modular analysis - */ -#include -#include -#include -#include -#include -#include "int_set_internal.h" - -namespace tvm { -namespace arith { - -using namespace ir; - -class ModularEvaluator - : public ExprFunctor { - public: - explicit ModularEvaluator( - const std::unordered_map< - const Variable*, ModularEntry>& mod_map) - : mod_map_(mod_map) { - } - ModularEntry Eval(const Expr& e) { - return VisitExpr(e); - } - // default - ModularEntry VisitExprDefault_(const Node*) final { - return ModularEntry::everything(); - } - // override combination rules. 
- ModularEntry VisitExpr_(const IntImm* op) final { - if (op->value < std::numeric_limits::max()) { - ModularEntry ret; - ret.base = static_cast(op->value); - ret.coeff = 0; - return ret; - } else { - return ModularEntry::everything(); - } - } - ModularEntry VisitExpr_(const UIntImm* op) final { - if (op->value < static_cast( - std::numeric_limits::max())) { - ModularEntry ret; - ret.base = static_cast(op->value); - ret.coeff = 0; - return ret; - } else { - return ModularEntry::everything(); - } - } - ModularEntry VisitExpr_(const Variable* op) final { - auto it = mod_map_.find(op); - if (it != mod_map_.end()) { - return it->second; - } else { - return ModularEntry::everything(); - } - } - ModularEntry VisitExpr_(const Add* op) final { - ModularEntry a = Eval(op->a); - ModularEntry b = Eval(op->b); - ModularEntry ret; - ret.coeff = ZeroAwareGCD(a.coeff, b.coeff); - ret.base = BaseSimplify(a.base + b.base, ret.coeff); - return ret; - } - ModularEntry VisitExpr_(const Sub* op) final { - ModularEntry a = Eval(op->a); - ModularEntry b = Eval(op->b); - ModularEntry ret; - ret.coeff = ZeroAwareGCD(a.coeff, b.coeff); - ret.base = BaseSimplify(a.base - b.base, ret.coeff); - return ret; - } - ModularEntry VisitExpr_(const Mul* op) final { - ModularEntry a = Eval(op->a); - ModularEntry b = Eval(op->b); - // Simplification rule, x, y, z are in Z - // (p x + n) (q y + m) - // -> pq xy + pm x + qn y + mn - // -> pq z + pm x + qn y + mn - int pq = a.coeff * b.coeff; - int pm = a.coeff * b.base; - int qn = a.base * b.coeff; - ModularEntry ret; - ret.coeff = ZeroAwareGCD(pq, ZeroAwareGCD(pm, qn)); - ret.base = BaseSimplify(a.base * b.base, ret.coeff); - return ret; - } - ModularEntry VisitExpr_(const Div* op) final { - // a c x / c -> a x - // We cannot do cases where offset is non-zero - // because of different integer rounding in pos/neg - ModularEntry a = Eval(op->a); - ModularEntry b = Eval(op->b); - if (b.coeff == 0 && - a.base == 0) { - CHECK_NE(b.base, 0); - if (a.coeff % b.base == 0) { - ModularEntry ret; - ret.coeff = a.coeff / b.base; - ret.base = 0; - return ret; - } - } - return ModularEntry::everything(); - } - - private: - const std::unordered_map< - const Variable*, ModularEntry>& mod_map_; - friend struct ModularEntry; - // simplify the base by putting it in range. 
- static int BaseSimplify(int base, int coeff) { - if (coeff == 0) return base; - base = base % coeff; - if (base < 0) base += coeff; - return base; - } - static int ZeroAwareGCD(int a, int b) { - CHECK_GE(a, 0); - CHECK_GE(b, 0); - if (a < b) std::swap(a, b); - if (b == 0) return a; - // perform GCD (greatest common divisor) - // ax + by = gcd(a, b) z if a != 0, b != 0 - while (a % b != 0) { - a = a % b; - std::swap(a, b); - } - return b; - } -}; - -ModularEntry ModularEntry::Add(const ModularEntry& a, - const ModularEntry& b) { - ModularEntry ret; - ret.coeff = ModularEvaluator::ZeroAwareGCD(a.coeff, b.coeff); - ret.base = ModularEvaluator::BaseSimplify(a.base + b.base, ret.coeff); - return ret; -} - - -ModularEntry EvalModular( - const Expr& e, - const std::unordered_map& mod_map) { - return ModularEvaluator(mod_map)(e); -} - -IntSet EvalModular(const Expr& e, - const Map& mod_map) { - std::unordered_map mmap; - for (auto& kv : mod_map) { - const ModularSet* m = kv.second.as(); - CHECK(m) << "Need to pass ModularSet for Modular Analysis"; - mmap[kv.first.get()] = m->e; - } - NodePtr n = make_node(); - n->e = ModularEvaluator(mmap)(e); - return IntSet(n); -} - -} // namespace arith -} // namespace tvm diff --git a/src/arithmetic/modular_set.cc b/src/arithmetic/modular_set.cc new file mode 100644 index 000000000000..8112beef7551 --- /dev/null +++ b/src/arithmetic/modular_set.cc @@ -0,0 +1,344 @@ +/*! + * Copyright (c) 2019 by Contributors + * \file modular_set.cc + * \brief Modular set analysis + */ +#include +#include +#include +#include +#include "pattern_match.h" + +namespace tvm { +namespace arith { + +using namespace ir; + +TVM_REGISTER_NODE_TYPE(ModularSetNode); + +ModularSet ModularSetNode::make(int64_t coeff, int64_t base) { + auto node = make_node(); + node->coeff = coeff; + node->base = base; + return ModularSet(node); +} + +TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable) +.set_dispatch([](const ModularSetNode *op, IRPrinter *p) { + p->stream << "ModularSet(" + << "coeff=" << op->coeff << ", base=" + << op->base << ')'; + }); + + +// internal entry for const int bound +struct ModularSetAnalyzer::Entry { + int64_t coeff{1}; + int64_t base{0}; + + bool is_const() const { + return coeff == 0; + } +}; + +class ModularSetAnalyzer::Impl : + public ExprFunctor { + public: + explicit Impl(Analyzer* parent) + : parent_(parent) {} + + void Update(const Var& var, + const ModularSet& info, + bool override) { + if (!override) { + CHECK(!var_map_.count(var)); + } + Entry e; + e.coeff = info->coeff; + e.base = info->base; + var_map_[var] = e; + } + + // Detect useful constraints and use them in the analysis scope. 
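+  // e.g. entering the constraint (x % 8 == 4) narrows x to the set
+  // {8 * k + 4}, i.e. Entry{coeff = 8, base = 4}, until the scope exits.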
+ std::function EnterConstraint(const Expr& constraint) { + PVar var; + PVar coeff, base; + // pattern match interesting constraints + if (((var % coeff) == base).Match(constraint)) { + Entry entry; + entry.coeff = coeff.Eval()->value; + entry.base = base.Eval()->value; + return UpdateByIntersect(var.Eval(), entry); + } + return nullptr; + } + + // Override visitor behaviors + Entry VisitExprDefault_(const Node* op) final { + return Everything(); + } + + Entry VisitExpr_(const Cast* op) final { + return VisitExpr(op->value); + } + + Entry VisitExpr_(const IntImm* op) final { + Entry ret; + ret.base = op->value; + ret.coeff = 0; + return ret; + } + + Entry VisitExpr_(const UIntImm* op) final { + if (op->value < std::numeric_limits::max()) { + Entry ret; + ret.base = static_cast(op->value); + ret.coeff = 0; + return ret; + } else { + return Everything(); + } + } + + Entry VisitExpr_(const Add* op) final { + Entry a = VisitExpr(op->a); + Entry b = VisitExpr(op->b); + Entry ret; + ret.coeff = ZeroAwareGCD(a.coeff, b.coeff); + ret.base = BaseSimplify(a.base + b.base, ret.coeff); + return ret; + } + + Entry VisitExpr_(const Sub* op) final { + Entry a = VisitExpr(op->a); + Entry b = VisitExpr(op->b); + Entry ret; + ret.coeff = ZeroAwareGCD(a.coeff, b.coeff); + ret.base = BaseSimplify(a.base - b.base, ret.coeff); + return ret; + } + + Entry VisitExpr_(const Mul* op) final { + Entry a = VisitExpr(op->a); + Entry b = VisitExpr(op->b); + // Simplification rule, x, y, z are in Z + // (p x + n) (q y + m) + // -> pq xy + pm x + qn y + mn + // -> pq z + pm x + qn y + mn + int64_t pq = a.coeff * b.coeff; + int64_t pm = a.coeff * b.base; + int64_t qn = a.base * b.coeff; + Entry ret; + ret.coeff = ZeroAwareGCD(pq, ZeroAwareGCD(pm, qn)); + ret.base = BaseSimplify(a.base * b.base, ret.coeff); + return ret; + } + + Entry DivByConst(const Expr& lhs, + int64_t val, + bool round_down) { + Entry a = VisitExpr(lhs); + CHECK_NE(val, 0); + if (a.coeff % val == 0) { + Entry ret; + if (a.base == 0) { + // a c x / c -> a x + ret.coeff = std::abs(a.coeff / val); + ret.base = 0; + return ret; + } + // positive division have a clear rounding mode. + // Only handle case where we clearly know we need to round down. + if (a.base > 0 && val > 0 && + (round_down || parent_->CanProveGreaterEqual(lhs, 0))) { + ret.coeff = a.coeff / val; + ret.base = a.base / val; + return ret; + } + } + return Everything(); + } + + Entry VisitExpr_(const Div* op) final { + Entry b = VisitExpr(op->b); + if (b.is_const()) { + return DivByConst(op->a, b.base, false); + } + return Everything(); + } + + Entry VisitExpr_(const Min* op) final { + Entry a = VisitExpr(op->a); + Entry b = VisitExpr(op->b); + return Union(a, b); + } + + Entry VisitExpr_(const Max* op) final { + Entry a = VisitExpr(op->a); + Entry b = VisitExpr(op->b); + return Union(a, b); + } + + Entry VisitExpr_(const Select* op) final { + Entry a = VisitExpr(op->true_value); + Entry b = VisitExpr(op->false_value); + return Union(a, b); + } + + Entry VisitExpr_(const Call* op) final { + // only special handle >> which can be + // used for index calculation. 
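+    // (x >> c) is floor division by 2^c, so it is routed through
+    // DivByConst with round_down = true.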
+    if (op->is_intrinsic(Call::shift_right)) {
+      return VisitRightShift(op);
+    } else {
+      return Everything();
+    }
+  }
+
+  Entry VisitExpr_(const Variable* op) final {
+    Var v = GetRef<Var>(op);
+    auto it = var_map_.find(v);
+    if (it != var_map_.end()) {
+      return it->second;
+    } else {
+      return Everything();
+    }
+  }
+
+  Entry VisitRightShift(const Call* op) {
+    Entry b = VisitExpr(op->args[1]);
+    // a c x / c -> a x
+    if (b.is_const()) {
+      return DivByConst(op->args[0], static_cast<int64_t>(1) << b.base, true);
+    }
+    return Everything();
+  }
+
+ private:
+  /*! \brief pointer to parent. */
+  Analyzer* parent_{nullptr};
+  // internal variable map
+  std::unordered_map var_map_;
+  /*!
+   * \brief Update var by intersecting entry with var's current set.
+   * \param var The variable.
+   * \param entry The entry to be updated.
+   * \return The recovery function of the scope.
+   */
+  std::function<void()> UpdateByIntersect(const Var& var, Entry entry) {
+    Entry old = Everything();
+    auto it = var_map_.find(var);
+    if (it != var_map_.end()) {
+      old = it->second;
+    }
+    var_map_[var] = Intersect(old, entry);
+    // recovery function.
+    return [this, old, var]() {
+      var_map_[var] = old;
+    };
+  }
+  /*!
+   * \brief Create the union of two sets.
+   * \param a The left operand.
+   * \param b The right operand.
+   */
+  static Entry Union(Entry a, Entry b) {
+    // {ax + y} \cup {bz + h} => {gcd(a, b) x + {y or h}}
+    int64_t coeff = ZeroAwareGCD(a.coeff, b.coeff);
+    if (coeff == 0) {
+      if (a.base == b.base) return a;
+      return Everything();
+    }
+    int64_t base0 = a.base % coeff;
+    int64_t base1 = b.base % coeff;
+    Entry ret;
+    if (base0 == base1) {
+      ret.coeff = coeff;
+      ret.base = base0;
+      return ret;
+    } else {
+      ret.coeff = ZeroAwareGCD(ZeroAwareGCD(base0, base1), coeff);
+      ret.base = 0;
+      return ret;
+    }
+  }
+  /*!
+   * \brief Create the intersection of two sets.
+   * \param a The left operand.
+   * \param b The right operand.
+   */
+  static Entry Intersect(Entry a, Entry b) {
+    // simple rule for now: pick the constraint with the larger coefficient.
+    // TODO(team-team): Use extended euclidean algorithm.
+    if (a.coeff == 0) return a;
+    if (b.coeff == 0) return b;
+    if (a.coeff >= b.coeff) return a;
+    return b;
+  }
+  /*!
+   * \brief Simplify base so that it is in [0, coeff) when coeff != 0.
+   * \param base The base value.
+   * \param coeff The coeff value.
+   * \return The simplified base.
+   */
+  static int64_t BaseSimplify(int64_t base, int64_t coeff) {
+    if (coeff == 0) return base;
+    base = base % coeff;
+    if (base < 0) base += coeff;
+    return base;
+  }
+  /*!
+   * \brief Take the GCD of a and b.
+   * \param a The first operand.
+   * \param b The second operand.
+   * \return The result.
+   */
+  static int64_t ZeroAwareGCD(int64_t a, int64_t b) {
+    if (a < 0) a = -a;
+    if (b < 0) b = -b;
+    if (a < b) std::swap(a, b);
+    if (b == 0) return a;
+    // perform GCD (greatest common divisor)
+    // ax + by = gcd(a, b) z if a != 0, b != 0
+    while (a % b != 0) {
+      a = a % b;
+      std::swap(a, b);
+    }
+    return b;
+  }
+  /*!
+   * \brief Return the entry that carries no modular information.
+   * \return The unconstrained set, i.e. {1 * k + 0} = all integers.
+ */ + static Entry Everything() { + Entry ret; + ret.coeff = 1; ret.base = 0; + return ret; + } +}; + +ModularSet ModularSetAnalyzer::operator()(const Expr& expr) { + Entry ret = impl_->VisitExpr(expr); + return ModularSetNode::make(ret.coeff, ret.base); +} + +void ModularSetAnalyzer::Update(const Var& var, + const ModularSet& info, + bool override) { + impl_->Update(var, info, override); +} + +std::function ModularSetAnalyzer::EnterConstraint(const Expr& constraint) { + return impl_->EnterConstraint(constraint); +} + +ModularSetAnalyzer::ModularSetAnalyzer(Analyzer* parent) + : impl_(new Impl(parent)) { +} + +ModularSetAnalyzer::~ModularSetAnalyzer() { + delete impl_; +} + +} // namespace arith +} // namespace tvm diff --git a/src/arithmetic/pattern_match.h b/src/arithmetic/pattern_match.h index b4140d959759..20c24b330cbd 100644 --- a/src/arithmetic/pattern_match.h +++ b/src/arithmetic/pattern_match.h @@ -25,6 +25,17 @@ * // The filled value is valid until the next call to Match. * return (max(x, y) + z).Eval(); * } + * + * tvm::Var tx, ty; + * arith::PVar c; + * arith::PVar v; + * // We can match integer and Var, both of which are + * // special case container of Expr + * CHECK((v * c).Match(tx * 3)); + * CHECK_EQ(c.Eval()->value, 3); + * // cannot match c to ty + * CHECK(!(v * c).Match(tx * ty)); + * * \endcode * * \note The pattern matcher is not threadsafe, @@ -38,6 +49,7 @@ #include #include +#include "const_fold.h" namespace tvm { namespace arith { @@ -109,6 +121,22 @@ class PEqualChecker { } }; +template<> +class PEqualChecker { + public: + bool operator()(const Integer& lhs, const Integer& rhs) const { + return lhs->value == rhs->value; + } +}; + +template<> +class PEqualChecker { + public: + bool operator()(const Var& lhs, const Var& rhs) const { + return lhs.same_as(rhs); + } +}; + /*! * \brief Pattern variable container. * @@ -123,7 +151,7 @@ template class PVar : public Pattern > { public: // Store PVars by reference in the expression. - using Nested = const PVar&; + using Nested = const PVar&; void InitMatch_() const { filled_ = false; @@ -139,12 +167,23 @@ class PVar : public Pattern > { } } + template::value>::type> + bool Match_(const NodeRefType& value) const { + if (const auto* ptr = value.template as()) { + return Match_(GetRef(ptr)); + } else { + return false; + } + } + T Eval() const { CHECK(filled_); return value_; } - private: + protected: /*! \brief The matched value */ mutable T value_; /*! 
\brief whether the variable has been filled */ @@ -171,6 +210,7 @@ class PConst : public Pattern > { T Eval() const { return value_; } + private: const T value_; }; @@ -203,7 +243,11 @@ class PBinaryExpr : } Expr Eval() const { - return NodeType::make(a_.Eval(), b_.Eval()); + Expr lhs = a_.Eval(); + Expr rhs = b_.Eval(); + Expr ret = TryConstFold(lhs, rhs); + if (ret.defined()) return ret; + return NodeType::make(lhs, rhs); } private: @@ -211,12 +255,48 @@ class PBinaryExpr : typename TB::Nested b_; }; +template +class PConstWithTypeLike : + public Pattern > { + public: + PConstWithTypeLike(const TA& ref, int64_t value) + : ref_(ref), value_(value) {} + + void InitMatch_() const {} + + bool Match_(const NodeRef& node) const { + if (const ir::IntImm* ptr = node.as()) { + return ptr->value == value_; + } else { + return false; + } + } + + Expr Eval() const { + return make_const(ref_.Eval().type(), value_); + } + + private: + typename TA::Nested ref_; + int64_t value_; +}; + -#define TVM_PATTERN_BINARY_OP(FuncName, NodeName) \ - template \ - inline PBinaryExpr \ - FuncName(const Pattern& a, const Pattern& b) { \ +#define TVM_PATTERN_BINARY_OP(FuncName, NodeName) \ + template \ + inline PBinaryExpr \ + FuncName(const Pattern& a, const Pattern& b) { \ return PBinaryExpr(a.derived(), b.derived()); \ + } \ + template \ + inline PBinaryExpr > \ + FuncName(const Pattern& a, int64_t b) { \ + return FuncName(a, PConstWithTypeLike(a.derived(), b)); \ + } \ + template \ + inline PBinaryExpr, TA> \ + FuncName(int64_t b, const Pattern& a) { \ + return FuncName(PConstWithTypeLike(a.derived(), b), a); \ } // arithmetic expressions diff --git a/src/arithmetic/rewrite_simplify.cc b/src/arithmetic/rewrite_simplify.cc new file mode 100644 index 000000000000..b304a8dc4bf2 --- /dev/null +++ b/src/arithmetic/rewrite_simplify.cc @@ -0,0 +1,650 @@ +/*! + * Copyright (c) 2019 by Contributors + * \file rewrite_simplify.cc + * \brief Rewrite-rule based simplification. + */ +// Acknowledgement: Most rewrite-rules are from Halide. +#include +#include +#include +#include "const_fold.h" +#include "pattern_match.h" + +namespace tvm { +namespace arith { + +using namespace ir; + +// macro for doing simple rewrite +#define TVM_TRY_REWRITE(SrcExpr, ResExpr) \ + if ((SrcExpr).Match(ret)) { \ + return (ResExpr).Eval(); \ + } + +// macro for rewrite + recursively rewrite ResExpr +#define TVM_TRY_RECURSIVE_REWRITE(SrcExpr, ResExpr) \ + if ((SrcExpr).Match(ret)) { \ + return RecursiveRewrite((ResExpr).Eval()); \ + } + +// macro rewrite only if CondExor is true after match. +#define TVM_TRY_REWRITE_IF(SrcExpr, ResExpr, CondExpr) \ + if ((SrcExpr).Match(ret) && (CondExpr)) { \ + return (ResExpr).Eval(); \ + } + +// macro rewrite + recursive_rewrite only if CondExor is true after match. +#define TVM_TRY_RECURSIVE_REWRITE_IF(SrcExpr, ResExpr, CondExpr) \ + if ((SrcExpr).Match(ret) && (CondExpr)) { \ + return RecursiveRewrite((ResExpr).Eval()); \ + } + + +// NOTE for developers: +// +// We mainly focus on index expression simplification. +// Besides the RewriteSimplifier, some cases can be better +// handled by CanonicalSimplifier. 
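+// For example, re-associating a long chain such as ((x + 1) + y) + (x + 2)
+// into 2 * x + y + 3 is a canonicalization job rather than a local rewrite.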
+// +class RewriteSimplifier::Impl : public IRMutator { + public: + explicit Impl(Analyzer* parent) + : parent_(parent) {} + + void Update(const Var& var, + const Expr& info, + bool override) { + if (!override) { + CHECK(!var_map_.count(var)); + } + var_map_[var] = info; + } + + // Run simplification in post order + Expr PostOrderSimplify(Expr expr, int max_iter = 2) { + for (int i = 0; i < max_iter; ++i) { + Expr new_expr = this->Mutate(expr); + if (new_expr.same_as(expr)) return expr; + expr = new_expr; + } + return expr; + } + + Expr Mutate_(const Add* op, const Expr& self) final; + Expr Mutate_(const Sub* op, const Expr& self) final; + Expr Mutate_(const Mul* op, const Expr& self) final; + Expr Mutate_(const Div* op, const Expr& self) final; + Expr Mutate_(const Mod* op, const Expr& self) final; + + private: + // reference to the main analyzer + Analyzer* parent_; + // counter to record recursive rewrite depth. + int recur_depth_{0}; + // internal variable map + std::unordered_map var_map_; + // maximum number of recursion allowed during a single pass. + static const constexpr int kMaxRecurDepth = 5; + // Whether x >= val + bool CanProveGreaterEqual(const Expr& x, int64_t val) { + return parent_->CanProveGreaterEqual(x, val); + } + // Whether x == val + bool CanProveEqual(const Expr& x, int64_t val) { + // TODO(tqchen) refer back to super-analyzer. + Expr res = Mutate(x); + if (const auto* ptr = res.as()) { + return ptr->value == val; + } + return false; + } + // Recursive rewrite x + // we limit maximum depth of recursive rewrite allowed to + // avoid infinite loop + Expr RecursiveRewrite(const Expr& x) { + if (recur_depth_ >= kMaxRecurDepth) return x; + ++recur_depth_; + Expr res = Mutate(x); + --recur_depth_; + return res; + } + + template + PConstWithTypeLike ZeroWithTypeLike(const Pattern& pattern) { + return PConstWithTypeLike(pattern.derived(), 0); + } +}; + +Expr RewriteSimplifier::Impl:: +Mutate_(const Add* op, const Expr& self) { + Expr ret = IRMutator::Mutate_(op, self); + op = ret.as(); + Expr const_res = TryConstFold(op->a, op->b); + if (const_res.defined()) return const_res; + // Pattern var to match any expression + PVar x, y, z, b1, b2, s1, s2; + // Pattern var match IntImm + PVar c1, c2, c3; + // Pattern var for lanes in broadcast and ramp + PVar lanes; + // Vector rules + if (op->type.lanes() != 1) { + TVM_TRY_REWRITE(ramp(b1, s1, lanes) + ramp(b2, s2, lanes), + ramp(b1 + b2, s1 + s2, lanes)); + TVM_TRY_REWRITE(ramp(b1, s1, lanes) + broadcast(x, lanes), + ramp(b1 + x, s1, lanes)); + TVM_TRY_REWRITE(broadcast(x, lanes) + ramp(b1, s1, lanes), + ramp(x + b1, s1, lanes)); + TVM_TRY_REWRITE(broadcast(x, lanes) + broadcast(y, lanes), + broadcast(x + y, lanes)); + } + + if (IsIndexType(op->type)) { + // Index rules + // cancelation rules + TVM_TRY_REWRITE((x - y) + y, x); + TVM_TRY_REWRITE(x + (y - x), y); + + TVM_TRY_REWRITE((x - y) + (y - z), x - z); + TVM_TRY_REWRITE((x - y) + (z - x), z - y); + + TVM_TRY_REWRITE(min(x, y - z) + z, min(x + z, y)); + TVM_TRY_REWRITE(min(x - z, y) + z, min(x, y + z)); + TVM_TRY_REWRITE(max(x, y - z) + z, max(x + z, y)); + TVM_TRY_REWRITE(max(x - z, y) + z, max(x, y + z)); + TVM_TRY_REWRITE(max(x, y) + min(x, y), x + y); + TVM_TRY_REWRITE(min(x, y) + max(x, y), x + y); + TVM_TRY_REWRITE(max(x, y) + min(y, x), x + y); + TVM_TRY_REWRITE(min(x, y) + max(y, x), x + y); + + TVM_TRY_REWRITE_IF(min(x, y + c1) + c2, min(x + c2, y), + c1.Eval()->value == -c2.Eval()->value); + TVM_TRY_REWRITE_IF(min(x + c1, y) + c2, min(x, y + c2), + c1.Eval()->value 
== -c2.Eval()->value); + TVM_TRY_REWRITE_IF(max(x, y + c1) + c2, max(x + c2, y), + c1.Eval()->value == -c2.Eval()->value); + TVM_TRY_REWRITE_IF(max(x + c1, y) + c2, max(x, y + c2), + c1.Eval()->value == -c2.Eval()->value); + + // constant folding + // NOTE: canonicalization might better at this. + TVM_TRY_REWRITE((x + c1) + c2, x + (c1 + c2)); + + // mul co-efficient folding + TVM_TRY_REWRITE(x + x, x * 2); + TVM_TRY_REWRITE(x * y + x, x * (y + 1)); + TVM_TRY_REWRITE(y * x + x, x * (y + 1)); + TVM_TRY_REWRITE(x + y * x, x * (1 + y)); + TVM_TRY_REWRITE(x + x * y, x * (1 + y)); + TVM_TRY_REWRITE(x * y + x * z, x * (y + z)); + TVM_TRY_REWRITE(y * x + x * z, x * (y + z)); + TVM_TRY_REWRITE(x * y + z * x, x * (y + z)); + TVM_TRY_REWRITE(y * x + z * x, x * (y + z)); + + // modular-div simplification + // Always pre-condition on positive integer domain + TVM_TRY_REWRITE_IF( + (x / c1) * c1 + x % c1, x, + CanProveGreaterEqual(x.Eval(), 0) && c1.Eval()->value > 0); + + // canonicalization rule + // will try rewrite again after canonicalization. + TVM_TRY_RECURSIVE_REWRITE(x + (c1 - y), (x - y) + c1); + TVM_TRY_RECURSIVE_REWRITE(x + c1 + y, (x + y) + c1); + TVM_TRY_RECURSIVE_REWRITE(x + (c1 + y), (x + y) + c1); + TVM_TRY_RECURSIVE_REWRITE((y % c1) + x * c1, x * c1 + (y % c1)); + } + + // condition rules. + TVM_TRY_REWRITE(select(x, b1, b2) + select(x, s1, s2), + select(x, b1 + s1, b2 + s2)); + // default value + return ret; +} + +Expr RewriteSimplifier::Impl:: +Mutate_(const Sub* op, const Expr& self) { + Expr ret = IRMutator::Mutate_(op, self); + op = ret.as(); + Expr const_res = TryConstFold(op->a, op->b); + if (const_res.defined()) return const_res; + // Pattern var to match any expression + PVar x, y, z, b1, b2, s1, s2; + // Pattern var match IntImm + PVar c1, c2, c3; + // Pattern var for lanes in broadcast and ramp + PVar lanes; + // Vector rules + if (op->type.lanes() != 1) { + TVM_TRY_REWRITE(ramp(b1, s1, lanes) - ramp(b2, s2, lanes), + ramp(b1 - b2, s1 - s2, lanes)); + TVM_TRY_REWRITE(ramp(b1, s1, lanes) - broadcast(x, lanes), + ramp(b1 - x, s1, lanes)); + TVM_TRY_REWRITE(broadcast(x, lanes) - ramp(b1, s1, lanes), + ramp(x - b1, 0 - s1, lanes)); + TVM_TRY_REWRITE(broadcast(x, lanes) - broadcast(y, lanes), + broadcast(x - y, lanes)); + } + + if (IsIndexType(op->type)) { + // Index rules + // cancelation rules + TVM_TRY_REWRITE((x + y) - y, x); + TVM_TRY_REWRITE((x + y) - x, y); + TVM_TRY_REWRITE(x - (y + x), 0 - y); + TVM_TRY_REWRITE(x - (x + y), 0 - y); + + TVM_TRY_REWRITE(min(x, y) - x, min(0, y - x)); + TVM_TRY_REWRITE(min(x, y) - y, min(x - y, 0)); + TVM_TRY_REWRITE(max(x, y) - x, max(0, y - x)); + TVM_TRY_REWRITE(max(x, y) - y, max(x - y, 0)); + + TVM_TRY_REWRITE(x - max(x, y), min(0, x - y)); + TVM_TRY_REWRITE(y - max(x, y), min(y - x, 0)); + TVM_TRY_REWRITE(x - min(x, y), max(0, x - y)); + TVM_TRY_REWRITE(y - min(x, y), max(y - x, 0)); + + // mul co-efficient folding + TVM_TRY_REWRITE(x - x, ZeroWithTypeLike(x)); + TVM_TRY_REWRITE(x * y - x, x * (y - 1)); + TVM_TRY_REWRITE(y * x - x, x * (y - 1)); + TVM_TRY_REWRITE(x - y * x, x * (1 - y)); + TVM_TRY_REWRITE(x - x * y, x * (1 - y)); + TVM_TRY_REWRITE(x * y - x * z, x * (y - z)); + TVM_TRY_REWRITE(y * x - x * z, x * (y - z)); + TVM_TRY_REWRITE(x * y - z * x, x * (y - z)); + TVM_TRY_REWRITE(y * x - z * x, x * (y - z)); + + // constant cancelation + TVM_TRY_REWRITE((x + c1) - c2, x + (c1 - c2)); + TVM_TRY_REWRITE((c1 - x) - (c2 - y), (y - x) + (c1 - c2)); + + // cancelization rule involving 4 operands + TVM_TRY_REWRITE((x + y) - (x + z), y 
+ +Expr RewriteSimplifier::Impl:: +Mutate_(const Sub* op, const Expr& self) { + Expr ret = IRMutator::Mutate_(op, self); + op = ret.as<Sub>(); + Expr const_res = TryConstFold<Sub>(op->a, op->b); + if (const_res.defined()) return const_res; + // Pattern vars to match any expression + PVar<Expr> x, y, z, b1, b2, s1, s2; + // Pattern vars to match IntImm + PVar<Integer> c1, c2, c3; + // Pattern var for lanes in broadcast and ramp + PVar<int> lanes; + // Vector rules + if (op->type.lanes() != 1) { + TVM_TRY_REWRITE(ramp(b1, s1, lanes) - ramp(b2, s2, lanes), + ramp(b1 - b2, s1 - s2, lanes)); + TVM_TRY_REWRITE(ramp(b1, s1, lanes) - broadcast(x, lanes), + ramp(b1 - x, s1, lanes)); + TVM_TRY_REWRITE(broadcast(x, lanes) - ramp(b1, s1, lanes), + ramp(x - b1, 0 - s1, lanes)); + TVM_TRY_REWRITE(broadcast(x, lanes) - broadcast(y, lanes), + broadcast(x - y, lanes)); + } + + if (IsIndexType(op->type)) { + // Index rules + // cancellation rules + TVM_TRY_REWRITE((x + y) - y, x); + TVM_TRY_REWRITE((x + y) - x, y); + TVM_TRY_REWRITE(x - (y + x), 0 - y); + TVM_TRY_REWRITE(x - (x + y), 0 - y); + + TVM_TRY_REWRITE(min(x, y) - x, min(0, y - x)); + TVM_TRY_REWRITE(min(x, y) - y, min(x - y, 0)); + TVM_TRY_REWRITE(max(x, y) - x, max(0, y - x)); + TVM_TRY_REWRITE(max(x, y) - y, max(x - y, 0)); + + TVM_TRY_REWRITE(x - max(x, y), min(0, x - y)); + TVM_TRY_REWRITE(y - max(x, y), min(y - x, 0)); + TVM_TRY_REWRITE(x - min(x, y), max(0, x - y)); + TVM_TRY_REWRITE(y - min(x, y), max(y - x, 0)); + + // mul coefficient folding + TVM_TRY_REWRITE(x - x, ZeroWithTypeLike(x)); + TVM_TRY_REWRITE(x * y - x, x * (y - 1)); + TVM_TRY_REWRITE(y * x - x, x * (y - 1)); + TVM_TRY_REWRITE(x - y * x, x * (1 - y)); + TVM_TRY_REWRITE(x - x * y, x * (1 - y)); + TVM_TRY_REWRITE(x * y - x * z, x * (y - z)); + TVM_TRY_REWRITE(y * x - x * z, x * (y - z)); + TVM_TRY_REWRITE(x * y - z * x, x * (y - z)); + TVM_TRY_REWRITE(y * x - z * x, x * (y - z)); + + // constant cancellation + TVM_TRY_REWRITE((x + c1) - c2, x + (c1 - c2)); + TVM_TRY_REWRITE((c1 - x) - (c2 - y), (y - x) + (c1 - c2)); + + // cancellation rules involving 4 operands + TVM_TRY_REWRITE((x + y) - (x + z), y - z); + TVM_TRY_REWRITE((x + y) - (z + x), y - z); + TVM_TRY_REWRITE((y + x) - (z + x), y - z); + TVM_TRY_REWRITE((y + x) - (x + z), y - z); + + TVM_TRY_REWRITE(min(x + y, z) - x, min(y, z - x)); + TVM_TRY_REWRITE(min(y + x, z) - x, min(y, z - x)); + TVM_TRY_REWRITE(min(z, x + y) - x, min(z - x, y)); + TVM_TRY_REWRITE(min(z, y + x) - x, min(z - x, y)); + + TVM_TRY_REWRITE(x - min(x + y, z), max(0 - y, x - z)); + TVM_TRY_REWRITE(x - min(y + x, z), max(0 - y, x - z)); + TVM_TRY_REWRITE(x - min(z, x + y), max(x - z, 0 - y)); + TVM_TRY_REWRITE(x - min(z, y + x), max(x - z, 0 - y)); + + TVM_TRY_REWRITE(min(x, y) - min(y, x), ZeroWithTypeLike(x)); + TVM_TRY_REWRITE(max(x, y) - max(y, x), ZeroWithTypeLike(x)); + + TVM_TRY_REWRITE_IF(min(b1, b2) - min(s1, s2), b1 - s1, + CanProveEqual(((b1 - s1) - (b2 - s2)).Eval(), 0)); + + TVM_TRY_REWRITE_IF(min(b1, b2) - min(s1, s2), b1 - s2, + CanProveEqual(((b1 - s2) - (b2 - s1)).Eval(), 0)); + TVM_TRY_REWRITE_IF(max(b1, b2) - max(s1, s2), b1 - s1, + CanProveEqual(((b1 - s1) - (b2 - s2)).Eval(), 0)); + TVM_TRY_REWRITE_IF(max(b1, b2) - max(s1, s2), b1 - s2, + CanProveEqual(((b1 - s2) - (b2 - s1)).Eval(), 0)); + + // modular-div simplification + // Always precondition on the positive integer domain. + TVM_TRY_REWRITE_IF(x - (x / c1) * c1, x % c1, + CanProveGreaterEqual(x.Eval(), 0) && c1.Eval()->value > 0); + TVM_TRY_REWRITE_IF((x / c1) * c1 - x, 0 - (x % c1), + CanProveGreaterEqual(x.Eval(), 0) && c1.Eval()->value > 0); + TVM_TRY_REWRITE_IF((x + c1) / c3 - (x + c2) / c3, + ((x + (c1 % c3)) % c3 + (c1 - c2)) / c3, + CanProveGreaterEqual(x.Eval(), -c2.Eval()->value) && + c1.Eval()->value >= c2.Eval()->value && + c3.Eval()->value > 0); + TVM_TRY_REWRITE_IF((x + c1) / c3 - x / c3, + ((x + (c1 % c3)) % c3 + c1) / c3, + CanProveGreaterEqual(x.Eval(), 0) && + c1.Eval()->value >= 0 && + c3.Eval()->value > 0); + // canonicalization rules + // will try to rewrite again after canonicalization. + TVM_TRY_REWRITE(x - c1, x + (0 - c1)); + TVM_TRY_RECURSIVE_REWRITE((x + c1) - y, (x - y) + c1); + TVM_TRY_RECURSIVE_REWRITE(x - (y - z), (x + z) - y); + TVM_TRY_RECURSIVE_REWRITE(x - y * c1, x + y * (0 - c1)); + }
+ + // condition rules. + TVM_TRY_REWRITE(select(x, b1, b2) - select(x, s1, s2), + select(x, b1 - s1, b2 - s2)); + TVM_TRY_REWRITE(select(x, y, z) - z, + select(x, y - z, ZeroWithTypeLike(z))); + TVM_TRY_REWRITE(select(x, y, z) - y, + select(x, ZeroWithTypeLike(y), z - y)); + return ret; +} + +Expr RewriteSimplifier::Impl:: +Mutate_(const Mul* op, const Expr& self) { + Expr ret = IRMutator::Mutate_(op, self); + op = ret.as<Mul>(); + Expr const_res = TryConstFold<Mul>(op->a, op->b); + if (const_res.defined()) return const_res; + // Pattern vars to match any expression + PVar<Expr> x, y, z, b1, b2, s1, s2; + // Pattern vars to match IntImm + PVar<Integer> c1, c2; + // Pattern var for lanes in broadcast and ramp + PVar<int> lanes; + // Vector rules + if (op->type.lanes() != 1) { + TVM_TRY_REWRITE(broadcast(x, lanes) * broadcast(y, lanes), + broadcast(x * y, lanes)); + TVM_TRY_REWRITE(ramp(b1, s1, lanes) * broadcast(x, lanes), + ramp(b1 * x, s1 * x, lanes)); + TVM_TRY_REWRITE(broadcast(x, lanes) * ramp(b1, s1, lanes), + ramp(b1 * x, s1 * x, lanes)); + } + + if (IsIndexType(op->type)) { + // constant simplification rules + TVM_TRY_REWRITE((x + c1) * c2, x * c2 + c1 * c2); + TVM_TRY_REWRITE((x * c1) * c2, x * (c1 * c2)); + TVM_TRY_REWRITE(min(x, y) * max(x, y), x * y); + TVM_TRY_REWRITE(max(x, y) * min(x, y), x * y); + + // canonicalization + TVM_TRY_RECURSIVE_REWRITE(x * (c1 * y), (x * y) * c1); + TVM_TRY_RECURSIVE_REWRITE_IF( + (x - y) * c1, (y - x) * (0 - c1), + c1.Eval()->value < 0); + } + return ret; +}
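Editor's note (not part of the patch): unlike the division rules below, the Mul identities above need no `CanProveGreaterEqual` preconditions because they hold for all integers. A tiny standalone check:

```cpp
#include <algorithm>
#include <cassert>

int main() {
  const int c1 = 3, c2 = -4;
  for (int x = -5; x <= 5; ++x) {
    for (int y = -5; y <= 5; ++y) {
      // min(x, y) * max(x, y) == x * y: the rewrite only reorders operands.
      assert(std::min(x, y) * std::max(x, y) == x * y);
      // (x + c1) * c2 == x * c2 + c1 * c2: distributivity is unconditional.
      assert((x + c1) * c2 == x * c2 + c1 * c2);
    }
  }
  return 0;
}
```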
+ +Expr RewriteSimplifier::Impl:: +Mutate_(const Div* op, const Expr& self) { + Expr ret = IRMutator::Mutate_(op, self); + op = ret.as<Div>(); + Expr const_res = TryConstFold<Div>
(op->a, op->b); + if (const_res.defined()) return const_res; + // Pattern vars to match any expression + PVar<Expr> x, y, z, b1; + // Pattern vars to match IntImm + PVar<Integer> c1, c2, c3; + // Pattern var for lanes in broadcast and ramp + PVar<int> lanes; + + // Vector rules + if (op->type.lanes() != 1) { + TVM_TRY_REWRITE(broadcast(x, lanes) / broadcast(y, lanes), + broadcast(x / y, lanes)); + // ramp / bcast + if ((ramp(b1, c1, lanes) / broadcast(c2, lanes)).Match(ret)) { + int64_t c1val = c1.Eval()->value; + int64_t c2val = c2.Eval()->value; + if (c1val % c2val == 0) { + return ramp(b1 / c2, c1 / c2, lanes).Eval(); + } + // If all possible indices in the ramp are the same. + if (CanProveGreaterEqual(b1.Eval(), 0)) { + ModularSet bmod = parent_->modular_set(b1.Eval()); + int64_t ramp_min = bmod->base / c2val; + int64_t ramp_max = (bmod->base + (lanes.Eval() - 1) * c1val) / c2val; + if (bmod->coeff % c2val == 0 && ramp_min == ramp_max) { + return broadcast(b1 / c2, lanes).Eval(); + } + } + } + } + + if (IsIndexType(op->type)) { + // Be aware of the division rules: + // we adopt C's default division, which truncates instead of taking the floor. + // This means most rules need to check that the operands are non-negative. + + // While this is always true for truncated division, + // restrict to the common case (positive divisors). + TVM_TRY_REWRITE_IF((x / c1) / c2, x / (c1 * c2), + c1.Eval()->value > 0 && c2.Eval()->value > 0); + + TVM_TRY_REWRITE_IF((x / c1 + c2) / c3, (x + c1 * c2) / (c1 * c3), + c1.Eval()->value > 0 && + c2.Eval()->value >= 0 && + c3.Eval()->value > 0 && + CanProveGreaterEqual(x.Eval(), 0)); + + if (((x * c1) / c2).Match(ret)) { + int64_t c1val = c1.Eval()->value; + int64_t c2val = c2.Eval()->value; + if (c1val > 0 && c2val > 0) { + if (c1val % c2val == 0) return (x * (c1 / c2)).Eval(); + if (c2val % c1val == 0) return (x / (c2 / c1)).Eval(); + } + } + + // Rules involving two operands. + TVM_TRY_REWRITE_IF((x * c1 + y) / c2, x * (c1 / c2) + y / c2, + c1.Eval()->value >= 0 && + c2.Eval()->value > 0 && + c1.Eval()->value % c2.Eval()->value == 0 && + CanProveGreaterEqual(x.Eval(), 0) && + CanProveGreaterEqual(y.Eval(), 0)); + + TVM_TRY_REWRITE_IF(min(x * c1, y) / c2, min(x * (c1 / c2), y / c2), + c1.Eval()->value >= 0 && + c2.Eval()->value > 0 && + c1.Eval()->value % c2.Eval()->value == 0 && + CanProveGreaterEqual(x.Eval(), 0) && + CanProveGreaterEqual(y.Eval(), 0)); + + TVM_TRY_REWRITE_IF(max(x * c1, y) / c2, max(x * (c1 / c2), y / c2), + c1.Eval()->value >= 0 && + c2.Eval()->value > 0 && + c1.Eval()->value % c2.Eval()->value == 0 && + CanProveGreaterEqual(x.Eval(), 0) && + CanProveGreaterEqual(y.Eval(), 0)); + + TVM_TRY_REWRITE_IF((y + x * c1) / c2, y / c2 + x * (c1 / c2), + c1.Eval()->value >= 0 && + c2.Eval()->value > 0 && + c1.Eval()->value % c2.Eval()->value == 0 && + CanProveGreaterEqual(x.Eval(), 0) && + CanProveGreaterEqual(y.Eval(), 0)); + + TVM_TRY_REWRITE_IF(min(y, x * c1) / c2, min(y / c2, x * (c1 / c2)), + c1.Eval()->value >= 0 && + c2.Eval()->value > 0 && + c1.Eval()->value % c2.Eval()->value == 0 && + CanProveGreaterEqual(x.Eval(), 0) && + CanProveGreaterEqual(y.Eval(), 0)); + + TVM_TRY_REWRITE_IF(max(y, x * c1) / c2, max(y / c2, x * (c1 / c2)), + c1.Eval()->value >= 0 && + c2.Eval()->value > 0 && + c1.Eval()->value % c2.Eval()->value == 0 && + CanProveGreaterEqual(x.Eval(), 0) && + CanProveGreaterEqual(y.Eval(), 0));
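Editor's note (not part of the patch; the three-operand rules continue below): a standalone counterexample showing why the two-operand rules above guard every term with `CanProveGreaterEqual`. Under truncated division, `(x * c1 + y) / c2 --> x * (c1 / c2) + y / c2` (with `c1 % c2 == 0`) breaks as soon as one term can be negative:

```cpp
#include <cassert>
#include <cstdio>

int main() {
  const int c1 = 4, c2 = 2;
  // Valid on the non-negative domain, which is what the guards establish.
  for (int x = 0; x <= 8; ++x)
    for (int y = 0; y <= 8; ++y)
      assert((x * c1 + y) / c2 == x * (c1 / c2) + y / c2);

  // Counterexample once y may be negative: x = 1, y = -1.
  int x = 1, y = -1;
  std::printf("lhs = %d, rhs = %d\n",
              (x * c1 + y) / c2,        // (4 - 1) / 2 == 1
              x * (c1 / c2) + y / c2);  // 2 + 0       == 2
  return 0;
}
```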
+ + // Rules involving three operands. + TVM_TRY_REWRITE_IF((x * c1 + y + z) / c2, x * (c1 / c2) + (y + z) / c2, + c1.Eval()->value >= 0 && + c2.Eval()->value > 0 && + c1.Eval()->value % c2.Eval()->value == 0 && + CanProveGreaterEqual(x.Eval(), 0) && + CanProveGreaterEqual((y + z).Eval(), 0)); + + TVM_TRY_REWRITE_IF((x * c1 - y + z) / c2, x * (c1 / c2) + (z - y) / c2, + c1.Eval()->value >= 0 && + c2.Eval()->value > 0 && + c1.Eval()->value % c2.Eval()->value == 0 && + CanProveGreaterEqual(x.Eval(), 0) && + CanProveGreaterEqual((z - y).Eval(), 0)); + + TVM_TRY_REWRITE_IF((x * c1 + y - z) / c2, x * (c1 / c2) + (y - z) / c2, + c1.Eval()->value >= 0 && + c2.Eval()->value > 0 && + c1.Eval()->value % c2.Eval()->value == 0 && + CanProveGreaterEqual(x.Eval(), 0) && + CanProveGreaterEqual((y - z).Eval(), 0)); + + TVM_TRY_REWRITE_IF((y + x * c1 + z) / c2, x * (c1 / c2) + (y + z) / c2, + c1.Eval()->value > 0 && + c2.Eval()->value > 0 && + c1.Eval()->value % c2.Eval()->value == 0 && + CanProveGreaterEqual(x.Eval(), 0) && + CanProveGreaterEqual((y + z).Eval(), 0)); + + TVM_TRY_REWRITE_IF((x + c1) / c2, x / c2 + c1 / c2, + c1.Eval()->value > 0 && + c2.Eval()->value > 0 && + c1.Eval()->value % c2.Eval()->value == 0 && + CanProveGreaterEqual(x.Eval(), 0)); + + TVM_TRY_REWRITE_IF((x + y) / x, y / x + 1, + CanProveGreaterEqual(x.Eval(), 0) && + CanProveGreaterEqual(y.Eval(), 0)); + TVM_TRY_REWRITE_IF((y + x) / x, y / x + 1, + CanProveGreaterEqual(x.Eval(), 0) && + CanProveGreaterEqual(y.Eval(), 0)); + + TVM_TRY_REWRITE_IF(((x + y) + z) / x, (y + z) / x + 1, + CanProveGreaterEqual(x.Eval(), 0) && + CanProveGreaterEqual((y + z).Eval(), 0)); + TVM_TRY_REWRITE_IF(((y + x) + z) / x, (y + z) / x + 1, + CanProveGreaterEqual(x.Eval(), 0) && + CanProveGreaterEqual((y + z).Eval(), 0)); + TVM_TRY_REWRITE_IF((y + (z + x)) / x, (y + z) / x + 1, + CanProveGreaterEqual(x.Eval(), 0) && + CanProveGreaterEqual((y + z).Eval(), 0)); + TVM_TRY_REWRITE_IF((y + (x + z)) / x, (y + z) / x + 1, + CanProveGreaterEqual(x.Eval(), 0) && + CanProveGreaterEqual((y + z).Eval(), 0)); + + TVM_TRY_REWRITE_IF((x * y) / y, x, + CanProveGreaterEqual(x.Eval(), 0) && + CanProveGreaterEqual(y.Eval(), 0)); + TVM_TRY_REWRITE_IF((y * x) / y, x, + CanProveGreaterEqual(x.Eval(), 0) && + CanProveGreaterEqual(y.Eval(), 0)); + + TVM_TRY_REWRITE_IF((x * z + y) / z, x + y / z, + CanProveGreaterEqual(x.Eval(), 0) && + CanProveGreaterEqual(y.Eval(), 0) && + CanProveGreaterEqual(z.Eval(), 0)); + TVM_TRY_REWRITE_IF((z * x + y) / z, x + y / z, + CanProveGreaterEqual(x.Eval(), 0) && + CanProveGreaterEqual(y.Eval(), 0) && + CanProveGreaterEqual(z.Eval(), 0)); + TVM_TRY_REWRITE_IF((y + x * z) / z, y / z + x, + CanProveGreaterEqual(x.Eval(), 0) && + CanProveGreaterEqual(y.Eval(), 0) && + CanProveGreaterEqual(z.Eval(), 0)); + TVM_TRY_REWRITE_IF((y + z * x) / z, y / z + x, + CanProveGreaterEqual(x.Eval(), 0) && + CanProveGreaterEqual(y.Eval(), 0) && + CanProveGreaterEqual(z.Eval(), 0)); + } + return ret; +}
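Editor's note (not part of the patch): the Mod rules that follow, like the `ramp / bcast` rule above, lean on modular-set analysis: every expression is tracked as `coeff * t + base`. A standalone sketch of the reasoning, with illustrative constants, shows that when `coeff % c == 0` every lane of a ramp lands in the same block of size `c`, so the division broadcasts and the remainder becomes a ramp:

```cpp
#include <cassert>

int main() {
  // Modular set: base_expr = coeff * t + b0, with coeff % c == 0 and b0 + lanes <= c.
  const int c = 8, coeff = 16, b0 = 5, stride = 1, lanes = 2;
  for (int t = 0; t < 10; ++t) {
    int base = coeff * t + b0;
    for (int i = 0; i < lanes; ++i) {
      // All lanes share one quotient: x / c folds to a broadcast.
      assert((base + i * stride) / c == base / c);
      // The remainder is lane-dependent but t-invariant: x % c folds to a ramp.
      assert((base + i * stride) % c == b0 % c + i * stride);
    }
  }
  return 0;
}
```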
+ + +Expr RewriteSimplifier::Impl:: +Mutate_(const Mod* op, const Expr& self) { + Expr ret = IRMutator::Mutate_(op, self); + op = ret.as<Mod>(); + Expr const_res = TryConstFold<Mod>(op->a, op->b); + if (const_res.defined()) return const_res; + + // Pattern vars to match any expression + PVar<Expr> x, y, z, b1; + // Pattern vars to match IntImm + PVar<Integer> c1, c2, c3; + // Pattern var for lanes in broadcast and ramp + PVar<int> lanes; + + // Vector rules + if (op->type.lanes() != 1) { + TVM_TRY_REWRITE(broadcast(x, lanes) % broadcast(y, lanes), + broadcast(x % y, lanes)); + + // ramp % bcast + if ((ramp(b1, c1, lanes) % broadcast(c2, lanes)).Match(ret)) { + int64_t c1val = c1.Eval()->value; + int64_t c2val = c2.Eval()->value; + if (c1val % c2val == 0) { + return broadcast(b1 % c2, lanes).Eval(); + } + // If all possible indices in the ramp are the same. + if (CanProveGreaterEqual(b1.Eval(), 0)) { + ModularSet bmod = parent_->modular_set(b1.Eval()); + int64_t ramp_min = bmod->base / c2val; + int64_t ramp_max = (bmod->base + (lanes.Eval() - 1) * c1val) / c2val; + if (bmod->coeff % c2val == 0) { + if (ramp_min == ramp_max) { + return ramp(bmod->base % c2, c1, lanes).Eval(); + } else { + return (ramp(bmod->base % c2, c1, lanes) % broadcast(c2, lanes)).Eval(); + } + } + } + } + } + + if (IsIndexType(op->type)) { + // Be aware of the division rules: + // we adopt C's default division, which truncates instead of taking the floor. + // This means most rules need to check that the operands are non-negative. + TVM_TRY_REWRITE_IF((x * c1) % c2, ZeroWithTypeLike(x), + c2.Eval()->value != 0 && + c1.Eval()->value % c2.Eval()->value == 0); + + TVM_TRY_REWRITE_IF((x * c1 + y) % c2, y % c2, + c2.Eval()->value > 0 && + c1.Eval()->value % c2.Eval()->value == 0 && + CanProveGreaterEqual(y.Eval(), 0)); + + TVM_TRY_REWRITE_IF((x + c1) % c2, x % c2, + c2.Eval()->value > 0 && + c1.Eval()->value % c2.Eval()->value == 0 && + CanProveGreaterEqual(x.Eval(), 0)); + + TVM_TRY_REWRITE_IF((x + y * c1) % c2, x % c2, + c2.Eval()->value > 0 && + c1.Eval()->value % c2.Eval()->value == 0 && + CanProveGreaterEqual(x.Eval(), 0) && + CanProveGreaterEqual(y.Eval(), 0)); + + // try modular analysis + if ((x % c1).Match(ret)) { + ModularSet mod = parent_->modular_set(x.Eval()); + int64_t c1val = c1.Eval()->value; + if (mod->coeff % c1val == 0 && + CanProveGreaterEqual(x.Eval(), 0)) { + return (mod->base % c1).Eval(); + } + } + } + return ret; +} + + +Expr RewriteSimplifier::operator()(const Expr& expr) { + return impl_->PostOrderSimplify(expr); +} + +void RewriteSimplifier::Update(const Var& var, + const Expr& info, + bool override) { + impl_->Update(var, info, override); +} + + +RewriteSimplifier::RewriteSimplifier(Analyzer* parent) + : impl_(new Impl(parent)) { +} + +RewriteSimplifier::~RewriteSimplifier() { + delete impl_; +} + +} // namespace arith +} // namespace tvm diff --git a/src/codegen/codegen_common.h b/src/codegen/codegen_common.h deleted file mode 100644 index 5e76af12e583..000000000000 --- a/src/codegen/codegen_common.h +++ /dev/null @@ -1,59 +0,0 @@ -/*! - * Copyright (c) 2018 by Contributors - * \file codegen_common.h - * \brief Common utility for codegen. - */ -#ifndef TVM_CODEGEN_CODEGEN_COMMON_H_ -#define TVM_CODEGEN_CODEGEN_COMMON_H_ - -#include <tvm/arithmetic.h> -#include "../arithmetic/compute_expr.h" - -namespace tvm { -namespace codegen { - -/*! - * \brief Visit AssertStmt recursively, update align_map from condition. - * \param op The AssertStmt - * \param align_map The align map - * \param fvisit The recursive visitor - * \tparam FVisit the recursive visitor - */ -template<typename FVisit> -inline void VisitAssert( - const ir::AssertStmt* op, - std::unordered_map<const Variable*, arith::ModularEntry>* align_map, - FVisit fvisit) { - using namespace ir; - auto& align_map_ = *align_map; - // Detect useful invariant pattern and use them to visit child. - // Pattern: Var % const == 0 - // TODO(tqchen) merge these pattern to a generic scope info visitor.
- if (const EQ* eq = op->condition.as()) { - const Mod* mod = eq->a.as(); - int64_t factor = 0, offset = 0; - if (mod && arith::GetConst(eq->b, &offset)) { - const Variable *var = mod->a.as(); - if (var && arith::GetConst(mod->b, &factor)) { - arith::ModularEntry old = align_map_[var]; - if (factor > old.coeff) { - arith::ModularEntry e; - e.coeff = static_cast(factor); - e.base = static_cast(offset); - // new alignment info, - align_map_[var] = e; - fvisit(op->body); - // restore old info - align_map_[var] = old; - return; - } - } - } - } - fvisit(op->body); -} - -} // namespace codegen -} // namespace tvm - -#endif // TVM_CODEGEN_CODEGEN_COMMON_H_ diff --git a/src/codegen/llvm/codegen_amdgpu.cc b/src/codegen/llvm/codegen_amdgpu.cc index d1a0716bc1d9..205d99f1ab65 100644 --- a/src/codegen/llvm/codegen_amdgpu.cc +++ b/src/codegen/llvm/codegen_amdgpu.cc @@ -156,6 +156,7 @@ inline int DetectROCMComputeVersion(const std::string& target) { } runtime::Module BuildAMDGPU(Array funcs, std::string target) { + InitializeLLVM(); CHECK(target.length() >= 4 && target.substr(0, 4) == "rocm"); std::ostringstream config; diff --git a/src/codegen/llvm/codegen_llvm.cc b/src/codegen/llvm/codegen_llvm.cc index f80bd9e8d436..6b69f97a66fe 100644 --- a/src/codegen/llvm/codegen_llvm.cc +++ b/src/codegen/llvm/codegen_llvm.cc @@ -9,7 +9,6 @@ #include #include "codegen_llvm.h" #include "codegen_cpu.h" -#include "../codegen_common.h" #include "../../pass/ir_util.h" #include "../../arithmetic/compute_expr.h" @@ -84,9 +83,9 @@ void CodeGenLLVM::AddFunction(const LoweredFunc& f) { void CodeGenLLVM::InitFuncState() { var_map_.clear(); alias_var_set_.clear(); - align_map_.clear(); alloc_storage_info_.clear(); volatile_buf_.clear(); + analyzer_.reset(new arith::Analyzer()); } void CodeGenLLVM::AddFunctionInternal(const LoweredFunc& f, bool ret_void) { @@ -381,14 +380,16 @@ void CodeGenLLVM::GetAlignment(Type t, *p_native_bits = native_vector_bits_; } - arith::ModularEntry me = arith::EvalModular(index, align_map_); + arith::ModularSet me = analyzer_->modular_set(index); + int64_t base = me->base; + int64_t coeff = me->coeff; int align_bits = t.bits(); while (align_bits < max_align_bits && - me.base % 2 == 0 && - me.coeff % 2 == 0) { - me.base = me.base / 2; - me.coeff = me.coeff / 2; + base % 2 == 0 && + coeff % 2 == 0) { + base = base / 2; + coeff = coeff / 2; align_bits *= 2; } if (align_bits < 8) { @@ -874,7 +875,7 @@ llvm::Value* CodeGenLLVM::VisitExpr_(const Select* op) { llvm::Value* CodeGenLLVM::VisitExpr_(const Let* op) { CHECK(!var_map_.count(op->var.get())); var_map_[op->var.get()] = MakeValue(op->value); - align_map_[op->var.get()] = EvalModular(op->value, align_map_); + analyzer_->Bind(op->var, op->value); return MakeValue(op->body); } @@ -998,6 +999,7 @@ void CodeGenLLVM::VisitStmt_(const Store* op) { void CodeGenLLVM::VisitStmt_(const For* op) { CHECK(is_zero(op->min)); + analyzer_->Bind(op->loop_var, Range::make_by_min_extent(op->min, op->extent)); if (op->for_type == ForType::Unrolled) { LOG(WARNING) << "Unroll hint get ignore at CodeGenLLVM backend, " << " consider set unroll_explicit=True"; @@ -1078,6 +1080,7 @@ void CodeGenLLVM::VisitStmt_(const AttrStmt* op) { if (iv->thread_tag.length() != 0) { if (!var_map_.count(iv->var.get())) { var_map_[iv->var.get()] = GetThreadIndex(iv); + analyzer_->Bind(iv->var, Range::make_by_min_extent(0, op->value)); } } } else if (op->attr_key == ir::attr::storage_scope) { @@ -1099,21 +1102,19 @@ void CodeGenLLVM::VisitStmt_(const AttrStmt* op) { } void 
CodeGenLLVM::VisitStmt_(const AssertStmt* op) { - VisitAssert(op, &align_map_, [this](const Stmt& body) { - this->VisitStmt(body); - }); + arith::ConstraintContext cctx(analyzer_.get(), op->condition); + this->VisitStmt(op->body); } void CodeGenLLVM::VisitStmt_(const LetStmt* op) { CHECK(!var_map_.count(op->var.get())); - CHECK(!align_map_.count(op->var.get())); if (op->var.type().is_handle()) { if (!is_restricted_) { alias_var_set_.insert(op->var.get()); } } var_map_[op->var.get()] = MakeValue(op->value); - align_map_[op->var.get()] = EvalModular(op->value, align_map_); + analyzer_->Bind(op->var, op->value); this->VisitStmt(op->body); } diff --git a/src/codegen/llvm/codegen_llvm.h b/src/codegen/llvm/codegen_llvm.h index 080306310370..ead1af883166 100644 --- a/src/codegen/llvm/codegen_llvm.h +++ b/src/codegen/llvm/codegen_llvm.h @@ -23,7 +23,6 @@ namespace codegen { using namespace ir; - /*! * \brief A base class to generate a LLVM. */ @@ -267,8 +266,8 @@ class CodeGenLLVM : std::unordered_map str_map_; // Whether current function is restricted bool is_restricted_{true}; - // The alignment information - std::unordered_map align_map_; + // The analyzer information + std::unique_ptr analyzer_; // set of var that are not restricted(can alias) std::unordered_set alias_var_set_; // set of volatile buffer. diff --git a/src/codegen/llvm/codegen_nvptx.cc b/src/codegen/llvm/codegen_nvptx.cc index 2d416d34ea0c..0a9361c57de7 100644 --- a/src/codegen/llvm/codegen_nvptx.cc +++ b/src/codegen/llvm/codegen_nvptx.cc @@ -166,6 +166,7 @@ inline int DetectCUDAComputeVersion() { } runtime::Module BuildNVPTX(Array funcs, std::string target) { + InitializeLLVM(); CHECK(target.length() >= 5 && target.substr(0, 5) == "nvptx"); int compute_ver = DetectCUDAComputeVersion(); diff --git a/src/codegen/spirv/codegen_spirv.cc b/src/codegen/spirv/codegen_spirv.cc index 812fee4a114e..8b1cabd9e386 100644 --- a/src/codegen/spirv/codegen_spirv.cc +++ b/src/codegen/spirv/codegen_spirv.cc @@ -6,7 +6,7 @@ #include #include #include -#include "../codegen_common.h" +#include "../../arithmetic/compute_expr.h" #include "codegen_spirv.h" namespace tvm { @@ -66,7 +66,7 @@ void CodeGenSPIRV::InitFuncState() { std::fill(workgroup_size_, workgroup_size_ + 3, 1); var_map_.clear(); storage_info_.clear(); - align_map_.clear(); + analyzer_.reset(new arith::Analyzer()); builder_.reset(new spirv::IRBuilder()); builder_->InitHeader(); } @@ -217,7 +217,7 @@ spirv::Value CodeGenSPIRV::VisitExpr_(const Select* op) { spirv::Value CodeGenSPIRV::VisitExpr_(const Let* op) { CHECK(!var_map_.count(op->var.get())); var_map_[op->var.get()] = MakeValue(op->value); - align_map_[op->var.get()] = EvalModular(op->value, align_map_); + analyzer_->Bind(op->var, op->value); return MakeValue(op->body); } @@ -378,9 +378,9 @@ spirv::Value CodeGenSPIRV::VisitExpr_(const Load* op) { if (const Ramp* ramp = op->index.as()) { if (is_one(ramp->stride)) { CHECK_EQ(ramp->lanes, op->type.lanes()); - arith::ModularEntry me = arith::EvalModular(ramp->base, align_map_); - CHECK((me.coeff % ramp->lanes) == 0 && - (me.base % ramp->lanes) == 0) + arith::ModularSet me = analyzer_->modular_set(ramp->base); + CHECK((me->coeff % ramp->lanes) == 0 && + (me->base % ramp->lanes) == 0) << "Only aligned vector access is allowed in SPIRV"; Expr vec_index = ir::Simplify( ramp->base / make_const(ramp->base.type(), ramp->lanes)); @@ -458,9 +458,9 @@ void CodeGenSPIRV::VisitStmt_(const Store* op) { if (const Ramp* ramp = op->index.as()) { if (is_one(ramp->stride)) { CHECK_EQ(ramp->lanes, 
op->value.type().lanes()); - arith::ModularEntry me = arith::EvalModular(ramp->base, align_map_); - CHECK((me.coeff % ramp->lanes) == 0 && - (me.base % ramp->lanes) == 0) + arith::ModularSet me = analyzer_->modular_set(ramp->base); + CHECK((me->coeff % ramp->lanes) == 0 && + (me->base % ramp->lanes) == 0) << "Only aligned vector access is allowed in SPIRV"; Expr vec_index = ir::Simplify( ramp->base / make_const(ramp->base.type(), ramp->lanes)); @@ -477,6 +477,7 @@ void CodeGenSPIRV::VisitStmt_(const Store* op) { void CodeGenSPIRV::VisitStmt_(const For* op) { CHECK(is_zero(op->min)); + analyzer_->Bind(op->loop_var, Range::make_by_min_extent(op->min, op->extent)); spirv::Value init_value = MakeValue(op->min); spirv::Value extent_value = MakeValue(op->extent); // Must get init label after making value(to make sure they are correct) @@ -589,6 +590,7 @@ void CodeGenSPIRV::VisitStmt_(const AttrStmt* op) { if (iv->thread_tag.length() != 0) { if (!var_map_.count(iv->var.get())) { var_map_[iv->var.get()] = GetThreadIndex(iv, op->value); + analyzer_->Bind(iv->var, Range::make_by_min_extent(0, op->value)); } } } else if (op->attr_key == ir::attr::storage_scope) { @@ -605,17 +607,15 @@ void CodeGenSPIRV::VisitStmt_(const AttrStmt* op) { } void CodeGenSPIRV::VisitStmt_(const AssertStmt* op) { - VisitAssert(op, &align_map_, [this](const Stmt& body) { - this->VisitStmt(body); - }); + arith::ConstraintContext cctx(analyzer_.get(), op->condition); + this->VisitStmt(op->body); } void CodeGenSPIRV::VisitStmt_(const LetStmt* op) { CHECK(!var_map_.count(op->var.get())); - CHECK(!align_map_.count(op->var.get())); CHECK(!op->var.type().is_handle()); var_map_[op->var.get()] = MakeValue(op->value); - align_map_[op->var.get()] = EvalModular(op->value, align_map_); + analyzer_->Bind(op->var, op->value); this->VisitStmt(op->body); } diff --git a/src/codegen/spirv/codegen_spirv.h b/src/codegen/spirv/codegen_spirv.h index 6a43182f7f2e..94cf761b9f84 100644 --- a/src/codegen/spirv/codegen_spirv.h +++ b/src/codegen/spirv/codegen_spirv.h @@ -122,8 +122,8 @@ class CodeGenSPIRV: std::unordered_map storage_info_; // The definition of local variable. std::unordered_map var_map_; - // The alignment information - std::unordered_map align_map_; + // The analyzer. + std::unique_ptr analyzer_; }; } // namespace codegen diff --git a/src/codegen/verilog/verilog_ir.cc b/src/codegen/verilog/verilog_ir.cc index 0cc4b9cf3c21..e3be4c8c8b59 100644 --- a/src/codegen/verilog/verilog_ir.cc +++ b/src/codegen/verilog/verilog_ir.cc @@ -46,7 +46,7 @@ class StageInputReplacer : public IRMutator { Var new_var(it->second->var->name_hint + ".sync", op->type); inputs_.Set(new_var, it->second); replace_[op] = new_var; - return new_var; + return std::move(new_var); } Expr Mutate_(const Load* op, const Expr& e) final { CHECK(is_zero(op->index)) @@ -60,7 +60,7 @@ class StageInputReplacer : public IRMutator { Var data(it->second->var->name_hint + ".load.sync", op->type); inputs_.Set(data, it->second); replace_[op->buffer_var.get()] = data; - return data; + return std::move(data); } // inputs that get replaced. 
Map inputs_; diff --git a/src/contrib/hybrid/codegen_hybrid.cc b/src/contrib/hybrid/codegen_hybrid.cc index 2117d471eeee..56564d668001 100644 --- a/src/contrib/hybrid/codegen_hybrid.cc +++ b/src/contrib/hybrid/codegen_hybrid.cc @@ -400,6 +400,8 @@ void CodeGenHybrid::ReserveKeywords() { GetUniqueName("for"); GetUniqueName("in"); GetUniqueName("range"); + GetUniqueName("True"); + GetUniqueName("False"); GetUniqueName("unroll"); GetUniqueName("const_range"); GetUniqueName("parallel"); @@ -434,6 +436,7 @@ void CodeGenHybrid::ReserveKeywords() { GetUniqueName("float32"); GetUniqueName("float64"); GetUniqueName("ceil_div"); + GetUniqueName("max_num_threads"); } void CodeGenHybrid::DumpStmt(const Stmt &stmt, diff --git a/src/lang/data_layout.cc b/src/lang/data_layout.cc new file mode 100644 index 000000000000..900a58029901 --- /dev/null +++ b/src/lang/data_layout.cc @@ -0,0 +1,322 @@ +/*! + * Copyright (c) 2019 by Contributors + * \file src/lang/data_layout.cc + * \brief Data Layout expression. + */ +#include +#include + +namespace tvm { + +TVM_REGISTER_NODE_TYPE(LayoutNode); +TVM_REGISTER_NODE_TYPE(BijectiveLayoutNode); + +const LayoutAxis LayoutAxis::UPPER_CASE[] = { + LayoutAxis('A'), LayoutAxis('B'), LayoutAxis('C'), LayoutAxis('D'), LayoutAxis('E'), + LayoutAxis('F'), LayoutAxis('G'), LayoutAxis('H'), LayoutAxis('I'), LayoutAxis('J'), + LayoutAxis('K'), LayoutAxis('L'), LayoutAxis('M'), LayoutAxis('N'), LayoutAxis('O'), + LayoutAxis('P'), LayoutAxis('Q'), LayoutAxis('R'), LayoutAxis('S'), LayoutAxis('T'), + LayoutAxis('U'), LayoutAxis('V'), LayoutAxis('W'), LayoutAxis('X'), LayoutAxis('Y'), + LayoutAxis('Z') +}; + +const LayoutAxis LayoutAxis::LOWER_CASE[] = { + LayoutAxis('a'), LayoutAxis('b'), LayoutAxis('c'), LayoutAxis('d'), LayoutAxis('e'), + LayoutAxis('f'), LayoutAxis('g'), LayoutAxis('h'), LayoutAxis('i'), LayoutAxis('j'), + LayoutAxis('k'), LayoutAxis('l'), LayoutAxis('m'), LayoutAxis('n'), LayoutAxis('o'), + LayoutAxis('p'), LayoutAxis('q'), LayoutAxis('r'), LayoutAxis('s'), LayoutAxis('t'), + LayoutAxis('u'), LayoutAxis('v'), LayoutAxis('w'), LayoutAxis('x'), LayoutAxis('y'), + LayoutAxis('z') +}; + +const LayoutAxis& LayoutAxis::Get(const char name) { + CHECK((name >= 'A' && name <= 'Z') || (name >= 'a' && name <= 'z')) + << "Invalid layout axis name: " << name << ". Has to be A-Z or a-z."; + return (name >= 'A' && name <= 'Z') ? 
+ LayoutAxis::UPPER_CASE[name-'A'] : + LayoutAxis::LOWER_CASE[name-'a']; +} + +const LayoutAxis& LayoutAxis::Get(const IterVar& itvar) { + const std::string axis = itvar->var.get()->name_hint; + CHECK_EQ(axis.size(), 1) << "Invalid layout axis " << axis; + return LayoutAxis::Get(axis[0]); +} + +const LayoutAxis& LayoutAxis::make(const std::string& name) { + CHECK_EQ(name.length(), 1) << "Invalid axis " << name; + return LayoutAxis::Get(name[0]); +} + +Layout::Layout(const Array& axes) { + node_ = make_node(); + LayoutNode *node = operator->(); + node->axes = axes; + std::ostringstream repr; + for (const IterVar& axis : axes) { + if (const auto* factor = axis->dom->extent.as()) { + CHECK_GT(factor->value, 0); + repr << factor->value; + } + CHECK_EQ(axis->var.get()->name_hint.size(), 1) << "Invalid layout axis " + << axis->var.get()->name_hint; + char c = axis->var.get()->name_hint[0]; + CHECK((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) << "Invalid layout axis " << c; + repr << axis->var.get()->name_hint; + } + node->name = repr.str(); +} + +Layout::Layout(const std::string& name) { // NOLINT(*) + if (name.empty() || name == "__undef__") return; + + node_ = make_node(); + LayoutNode *node = operator->(); + node->name = name; + + // parse layout string + int32_t factor = 0; + for (char c : name) { + if (c >= 'A' && c <= 'Z') { + CHECK_EQ(factor, 0) << "Invalid layout " << name + << ": invalid factor size " << factor + << " before dimension " << c; + std::string shape_name("_shape"); + shape_name.insert(0, 1, c); + IterVar axis = IterVarNode::make(Range(Expr(0), Var(shape_name)), + Var(std::string(1, c)), kDataPar); + node->axes.push_back(axis); + } else if (c >= 'a' && c <= 'z') { + CHECK_GT(factor, 0) << "Invalid layout " << name << ": invalid factor size " + << factor << " for dimension " << c; + IterVar axis = IterVarNode::make(Range(Expr(0), Expr(factor)), + Var(std::string(1, c)), kDataPar); + node->axes.push_back(axis); + factor = 0; + } else if (c >= '0' && c <= '9') { + CHECK(factor >= 0) << "Invalid layout " << name << ": _ is adjacent to a number."; + factor = factor * 10 + c - '0'; + } else { + LOG(FATAL) << "Invalid layout " << name; + } + } + + // validate layout + std::vector exist_axis(256, false); + for (const IterVar& v : node->axes) { + auto axis_str = v->var.get()->name_hint; + CHECK_EQ(axis_str.size(), 1); + char axis = axis_str[0]; + CHECK((axis >= 'a' && axis <= 'z') || (axis >= 'A' && axis <= 'Z')); + CHECK(!exist_axis[axis]) << "Invalid layout " << name << ": duplicate axis " << axis; + exist_axis[axis] = true; + } + for (const IterVar& v : node->axes) { + char axis = v->var.get()->name_hint[0]; + if (axis >= 'a' && axis <= 'z') { + CHECK(exist_axis[axis-'a'+'A']) << "Invalid layout " << name << ": missing axis " + << axis - 'a' + 'A'; + } + } +} + +Layout LayoutNode::make(const std::string& layout) { + return Layout(layout); +} + +Layout Layout::SubLayout(size_t pos, size_t len) const { + if (!defined() || pos > ndim()) return Layout::Undef(); + if (pos + len > ndim()) len = ndim() - pos; + Array new_layout; + const auto axes = operator->()->axes; + for (size_t i = pos; i < pos + len; ++i) { + new_layout.push_back(axes[i]); + } + return Layout(new_layout); +} + +Layout Layout::Split(const LayoutAxis &axis, size_t target_pos, int32_t factor) const { + if (!defined()) return Layout::Undef(); + const std::string& name = operator->()->name; + const auto axes = operator->()->axes; + CHECK(target_pos <= this->ndim()) << "Invalid split position " + << target_pos << " 
for layout " << name; + CHECK(axis.IsPrimal()) << "Cannot split a subordinate axis " << axis; + CHECK(this->Contains(axis)) << "Axis " << axis << " does not exist in " << name; + CHECK(!this->Contains(axis.ToSubordinate())) << "Axis " << axis + << " has already been split in " << name; + CHECK(factor > 0) << "Invalid split size " << factor; + Array<IterVar> new_layout; + for (size_t i = 0; i <= this->ndim(); ++i) { + if (i == target_pos) { + new_layout.push_back(IterVarNode::make(Range(Expr(0), Expr(factor)), + Var(axis.ToSubordinate().name()), kDataPar)); + } + if (i == this->ndim()) break; + new_layout.push_back(axes[i]); + } + return Layout(new_layout); +} + +int32_t Layout::FactorOf(const LayoutAxis& axis) const { + if (!defined()) return -1; + const LayoutAxis& sub = axis.ToSubordinate(); + if (!this->defined()) return -1; + for (const IterVar& itvar : operator->()->axes) { + if (sub == LayoutAxis::Get(itvar)) { + const auto* factor = itvar->dom->extent.as<IntImm>(); + CHECK(factor); + return factor->value; + } + } + return -1; +} + +inline bool GetStoreRule(Array<Expr>* rule, + const Layout& src_layout, + const Layout& dst_layout) { + for (size_t i = 0; i < dst_layout.ndim(); ++i) { + const auto& store_axis = dst_layout[i]; + const IterVar& store_axis_impl = dst_layout->axes[i]; + Expr store(0); + + for (size_t j = 0; j < src_layout.ndim(); ++j) { + const auto& orig_axis = src_layout[j]; + const IterVar& orig_axis_impl = src_layout->axes[j]; + if (store_axis.ToPrimal() == orig_axis.ToPrimal()) { + if (orig_axis.IsPrimal()) { + Expr orig_var = orig_axis_impl->var; + const int32_t factor = src_layout.FactorOf(orig_axis); + if (factor > 0) { + orig_var = orig_var * Expr(factor); + } + store = store + orig_var; + } else { + store = store + orig_axis_impl->var; + } + } + } + if (is_zero(store)) { + // Not convertible + return false; + } + + if (store_axis.IsPrimal()) { + const int32_t factor = dst_layout.FactorOf(store_axis); + if (factor > 0) { + store = store / Expr(factor); + } + } else { + store = store % store_axis_impl->dom->extent; + } + + rule->push_back(store); + } + return true; +}
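Editor's note (not part of the patch): a worked instance of the store rule above, for the common `src = "NCHW"`, `dst = "NCHW16c"` case. Each destination axis gathers every source term sharing its primal axis (here only `C` itself), then a primal destination axis divides by its subordinate factor while a subordinate axis takes the extent modulo. The layout names and factor are illustrative:

```cpp
#include <array>
#include <cassert>

int main() {
  // Forward rule for "NCHW" -> "NCHW16c":
  //   N' = N, C' = C / 16, H' = H, W' = W, c' = C % 16.
  const int N = 0, C = 37, H = 5, W = 3;
  std::array<int, 5> dst = {N, C / 16, H, W, C % 16};
  assert(dst[1] == 2 && dst[4] == 5);   // 37 == 2 * 16 + 5
  // The backward rule reassembles the primal axis: C == C' * 16 + c'.
  assert(dst[1] * 16 + dst[4] == C);
  return 0;
}
```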
+ +inline Array<Expr> TransformIndex(const Array<Expr>& src_index, + const Array<IterVar>& src_axis, + const Array<Expr>& transform_rule) { + Array<Expr> result; + std::unordered_map<const Variable*, Expr> bind_map; + for (size_t i = 0; i < src_index.size(); ++i) { + bind_map[src_axis[i]->var.get()] = src_index[i]; + } + for (Expr rule : transform_rule) { + result.push_back(ir::Simplify(ir::Substitute(rule, bind_map))); + } + return result; +} + +Array<Expr> BijectiveLayout::ForwardIndex(const Array<Expr>& src_index) const { + CHECK(defined()) << "Cannot operate on an undefined bijective layout."; + const BijectiveLayoutNode* self = operator->(); + CHECK_EQ(src_index.size(), self->src_layout->axes.size()) + << "Input mismatch with layout " << self->src_layout; + return TransformIndex(src_index, self->src_layout->axes, self->forward_rule); +} + + +Array<Expr> BijectiveLayout::BackwardIndex(const Array<Expr>& dst_index) const { + CHECK(defined()) << "Cannot operate on an undefined bijective layout."; + const BijectiveLayoutNode* self = operator->(); + CHECK_EQ(dst_index.size(), self->dst_layout->axes.size()) + << "Output mismatch with layout " << self->dst_layout; + return TransformIndex(dst_index, self->dst_layout->axes, self->backward_rule); +} + +inline Array<Expr> TransformShape(const Array<Expr>& src_shape, + const Array<IterVar>& src_axis, + const Array<IterVar>& target_axis, + const Array<Expr>& transform_rule) { + CHECK_EQ(src_shape.size(), src_axis.size()); + // bind variables for the original axes + // for a major axis, bind the corresponding size + // for a minor axis, simply bind it as 0, so that we can reuse forward/backward_rule, + // e.g., (C * 16 + c) / 32 + std::unordered_map<const Variable*, Expr> bind_map; + for (size_t i = 0; i < src_shape.size(); ++i) { + Expr orig_shape = src_shape[i]; + IterVar orig_axis = src_axis[i]; + if (!LayoutAxis::Get(orig_axis).IsPrimal()) { + if (orig_shape.defined()) { + const auto* orig_shape_const = orig_shape.as<IntImm>(); + const auto* orig_axis_extent = orig_axis->dom->extent.as<IntImm>(); + CHECK_EQ(orig_shape_const->value, orig_axis_extent->value) + << "Input shape mismatch at index " << i << ". Expected " + << orig_axis->dom->extent << ", got " << orig_shape; + } + bind_map[orig_axis->var.get()] = Expr(0); + } else { + bind_map[orig_axis->var.get()] = orig_shape; + } + } + // infer the target shape, + // for a major axis, use the forward/backward_rule directly, + // for a minor axis, simply use the extent. + Array<Expr> result; + CHECK_EQ(transform_rule.size(), target_axis.size()); + for (size_t i = 0; i < transform_rule.size(); ++i) { + Expr rule = transform_rule[i]; + IterVar axis = target_axis[i]; + if (!LayoutAxis::Get(axis).IsPrimal()) { + result.push_back(axis->dom->extent); + } else { + result.push_back(ir::Simplify(ir::Substitute(rule, bind_map))); + } + } + return result; +} + +Array<Expr> BijectiveLayout::ForwardShape(const Array<Expr>& shape) const { + CHECK(defined()) << "Cannot operate on an undefined bijective layout."; + const BijectiveLayoutNode* self = operator->(); + return TransformShape(shape, self->src_layout->axes, + self->dst_layout->axes, self->forward_rule); +} + +Array<Expr> BijectiveLayout::BackwardShape(const Array<Expr>& shape) const { + CHECK(defined()) << "Cannot operate on an undefined bijective layout."; + const BijectiveLayoutNode* self = operator->(); + return TransformShape(shape, self->dst_layout->axes, + self->src_layout->axes, self->backward_rule); +} + +BijectiveLayout BijectiveLayoutNode::make(const Layout& src_layout, + const Layout& dst_layout) { + auto n = make_node<BijectiveLayoutNode>(); + + n->src_layout = src_layout; + n->dst_layout = dst_layout; + + if (!GetStoreRule(&n->forward_rule, n->src_layout, n->dst_layout)) { + // not convertible + return BijectiveLayout(); + } + CHECK(GetStoreRule(&n->backward_rule, n->dst_layout, n->src_layout)); + + return BijectiveLayout(n); +} + +} // namespace tvm diff --git a/src/lang/expr.cc b/src/lang/expr.cc index 7ac0e372371c..3bf8fc9191fb 100644 --- a/src/lang/expr.cc +++ b/src/lang/expr.cc @@ -5,7 +5,7 @@ #include #include #include -#include <tvm/ir_operator.h> +#include <tvm/expr_operator.h> #include #include diff --git a/src/lang/ir_operator.cc b/src/lang/expr_operator.cc similarity index 58% rename from src/lang/ir_operator.cc rename to src/lang/expr_operator.cc index beceb094c620..edbe0be3d5c5 100644 --- a/src/lang/ir_operator.cc +++ b/src/lang/expr_operator.cc @@ -1,28 +1,16 @@ /*! * Copyright (c) 2017 by Contributors - * \file ir_operator.cc + * \file expr_operator.cc */ #include #include -#include <tvm/ir_operator.h> +#include <tvm/expr_operator.h> #include +// Centralized header for constant folders. +#include "../arithmetic/const_fold.h" namespace tvm { -/*! - * \brief Check whether type is used to represent index. - * - * Index types are frequently used in shape computation - * and need to be aggressively constant-folded. - * - * \param type The type to represent index. - * \return the checked result.
- */ -inline bool IsIndexType(const Type& type) { - return type.is_int() && type.lanes() == 1 && - (type.bits() == 32 || type.bits() == 64); -} - // simple cast that only checks if type matches and cast inline Expr SimpleCast(const Type& t, Expr value) { if (value.type() == t) return value; @@ -135,45 +123,14 @@ Expr reinterpret(const Type& t, Expr value) { return ir::Call::make(t, ir::Call::reinterpret, { value }, ir::Call::PureIntrinsic); } -#define TVM_INDEX_CONST_PROPAGATION(BODY) \ - using ir::IntImm; \ - using ir::UIntImm; \ - const IntImm* pa = a.as(); \ - const IntImm* pb = b.as(); \ - const Type& ta = a.type(); \ - const Type& tb = b.type(); \ - if (IsIndexType(ta) && IsIndexType(tb)) { \ - BODY; \ - } \ - BinaryOpMatchTypes(a, b); - -#define TVM_ARITH_CONST_PROPAGATION(BODY) \ - using ir::IntImm; \ - using ir::UIntImm; \ - using ir::FloatImm; \ - BinaryOpMatchTypes(a, b); \ - const IntImm* pa = a.as(); \ - const IntImm* pb = b.as(); \ - const FloatImm* fa = a.as(); \ - const FloatImm* fb = b.as(); \ - BODY; - - Expr operator+(Expr a, Expr b) { - TVM_ARITH_CONST_PROPAGATION({ - const Type& ta = a.type(); - const Type& tb = b.type(); - Type rtype = ta.bits() >= tb.bits() ? ta : tb; - if (pa && pb) return IntImm::make(rtype, pa->value + pb->value); - if (pa && pa->value == 0) return SimpleCast(rtype, b); - if (pb && pb->value == 0) return SimpleCast(rtype, a); - if (fa && fb) return FloatImm::make(rtype, fa->value + fb->value); - if (fa && fa->value == 0) return SimpleCast(rtype, b); - if (fb && fb->value == 0) return SimpleCast(rtype, a); - }); + BinaryOpMatchTypes(a, b); + Expr ret = arith::TryConstFold(a, b); + if (ret.defined()) return ret; return ir::Add::make(a, b); } +// negation Expr operator-(Expr a) { using ir::IntImm; using ir::FloatImm; @@ -185,114 +142,44 @@ Expr operator-(Expr a) { } Expr operator-(Expr a, Expr b) { - TVM_ARITH_CONST_PROPAGATION({ - const Type& ta = a.type(); - const Type& tb = b.type(); - Type rtype = ta.bits() >= tb.bits() ? ta : tb; - if (pa && pb) return IntImm::make(rtype, pa->value - pb->value); - if (pb && pb->value == 0) return SimpleCast(rtype, a); - if (fa && fb) return FloatImm::make(rtype, fa->value - fb->value); - if (fb && fb->value == 0) return SimpleCast(rtype, a); - }); + BinaryOpMatchTypes(a, b); + Expr ret = arith::TryConstFold(a, b); + if (ret.defined()) return ret; return ir::Sub::make(a, b); } Expr operator*(Expr a, Expr b) { - TVM_ARITH_CONST_PROPAGATION({ - const Type& ta = a.type(); - const Type& tb = b.type(); - Type rtype = ta.bits() >= tb.bits() ? ta : tb; - if (pa && pb) return IntImm::make(rtype, pa->value * pb->value); - if (pa) { - if (pa->value == 1) return SimpleCast(rtype, b); - if (pa->value == 0) return SimpleCast(rtype, a); - } - if (pb) { - if (pb->value == 1) return SimpleCast(rtype, a); - if (pb->value == 0) return SimpleCast(rtype, b); - } - if (fa && fb) return FloatImm::make(rtype, fa->value * fb->value); - if (fa) { - if (fa->value == 1) return SimpleCast(rtype, b); - if (fa->value == 0) return SimpleCast(rtype, a); - } - if (fb) { - if (fb->value == 1) return SimpleCast(rtype, a); - if (fb->value == 0) return SimpleCast(rtype, b); - } - }); + BinaryOpMatchTypes(a, b); + Expr ret = arith::TryConstFold(a, b); + if (ret.defined()) return ret; return ir::Mul::make(a, b); } Expr operator/(Expr a, Expr b) { - TVM_ARITH_CONST_PROPAGATION({ - const Type& ta = a.type(); - const Type& tb = b.type(); - Type rtype = ta.bits() >= tb.bits() ? 
ta : tb; - // due to division and mod can have different modes - // only constant fold positive number where rule is fixed. - if (pa && pb && pa->value >= 0 && pb->value > 0) { - return IntImm::make(rtype, pa->value / pb->value); - } - if (pa) { - if (pa->value == 0) return SimpleCast(rtype, a); - } - if (pb) { - if (pb->value == 1) return SimpleCast(rtype, a); - CHECK_NE(pb->value, 0) << "Divide by zero"; - } - if (fa && fb && fb->value != 0) { - return FloatImm::make(rtype, fa->value / fb->value); - } - if (fa && fa->value == 0) { - return SimpleCast(rtype, a); - } - if (fb) { - if (fb->value == 1) return SimpleCast(rtype, a); - CHECK_NE(fb->value, 0) << "Divide by zero"; - } - }); + BinaryOpMatchTypes(a, b); + Expr ret = arith::TryConstFold(a, b); + if (ret.defined()) return ret; return ir::Div::make(a, b); } Expr operator%(Expr a, Expr b) { - TVM_INDEX_CONST_PROPAGATION({ - Type rtype = ta.bits() >= tb.bits() ? ta : tb; - // due to division and mod can have different modes - // only constant fold positive number where rule is fixed. - if (pa && pb && pa->value >= 0 && pb->value > 0) { - return IntImm::make(rtype, pa->value % pb->value); - } - if (pa) { - if (pa->value == 0) return SimpleCast(rtype, a); - } - if (pb) { - if (pb->value == 1) return make_zero(rtype); - CHECK_NE(pb->value, 0) << "Divide by zero"; - } - }); + BinaryOpMatchTypes(a, b); + Expr ret = arith::TryConstFold(a, b); + if (ret.defined()) return ret; return ir::Mod::make(a, b); } Expr min(Expr a, Expr b) { - TVM_ARITH_CONST_PROPAGATION({ - const Type& ta = a.type(); - const Type& tb = b.type(); - Type rtype = ta.bits() >= tb.bits() ? ta : tb; - if (pa && pb) return IntImm::make(rtype, std::min(pa->value, pb->value)); - if (fa && fb) return FloatImm::make(rtype, std::min(fa->value, fb->value)); - }); + BinaryOpMatchTypes(a, b); + Expr ret = arith::TryConstFold(a, b); + if (ret.defined()) return ret; return ir::Min::make(a, b); } Expr max(Expr a, Expr b) { - TVM_ARITH_CONST_PROPAGATION({ - const Type& ta = a.type(); - const Type& tb = b.type(); - Type rtype = ta.bits() >= tb.bits() ? 
ta : tb; - if (pa && pb) return IntImm::make(rtype, std::max(pa->value, pb->value)); - if (fa && fb) return FloatImm::make(rtype, std::max(fa->value, fb->value)); - }); + BinaryOpMatchTypes(a, b); + Expr ret = arith::TryConstFold(a, b); + if (ret.defined()) return ret; return ir::Max::make(a, b); } @@ -328,129 +215,116 @@ Expr likely(Expr cond) { } Expr operator>(Expr a, Expr b) { - TVM_ARITH_CONST_PROPAGATION({ - if (pa && pb) return UIntImm::make(UInt(1), pa->value > pb->value); - if (fa && fb) return UIntImm::make(UInt(1), fa->value > fb->value); - }); + BinaryOpMatchTypes(a, b); + Expr ret = arith::TryConstFold(a, b); + if (ret.defined()) return ret; return ir::GT::make(a, b); } Expr operator>=(Expr a, Expr b) { - TVM_ARITH_CONST_PROPAGATION({ - if (pa && pb) return UIntImm::make(UInt(1), pa->value >= pb->value); - if (fa && fb) return UIntImm::make(UInt(1), fa->value >= fb->value); - }); + BinaryOpMatchTypes(a, b); + Expr ret = arith::TryConstFold(a, b); + if (ret.defined()) return ret; return ir::GE::make(a, b); } Expr operator<(Expr a, Expr b) { - TVM_ARITH_CONST_PROPAGATION({ - if (pa && pb) return UIntImm::make(UInt(1), pa->value < pb->value); - if (fa && fb) return UIntImm::make(UInt(1), fa->value < fb->value); - }); + BinaryOpMatchTypes(a, b); + Expr ret = arith::TryConstFold(a, b); + if (ret.defined()) return ret; return ir::LT::make(a, b); } Expr operator<=(Expr a, Expr b) { - TVM_ARITH_CONST_PROPAGATION({ - if (pa && pb) return UIntImm::make(UInt(1), pa->value <= pb->value); - if (fa && fb) return UIntImm::make(UInt(1), fa->value <= fb->value); - }); + BinaryOpMatchTypes(a, b); + Expr ret = arith::TryConstFold(a, b); + if (ret.defined()) return ret; return ir::LE::make(a, b); } Expr operator==(Expr a, Expr b) { - TVM_ARITH_CONST_PROPAGATION({ - if (pa && pb) return UIntImm::make(UInt(1), pa->value == pb->value); - if (fa && fb) return UIntImm::make(UInt(1), fa->value == fb->value); - }); + BinaryOpMatchTypes(a, b); + Expr ret = arith::TryConstFold(a, b); + if (ret.defined()) return ret; return ir::EQ::make(a, b); } Expr operator!=(Expr a, Expr b) { - TVM_ARITH_CONST_PROPAGATION({ - if (pa && pb) return UIntImm::make(UInt(1), pa->value != pb->value); - if (fa && fb) return UIntImm::make(UInt(1), fa->value != fb->value); - }); + BinaryOpMatchTypes(a, b); + Expr ret = arith::TryConstFold(a, b); + if (ret.defined()) return ret; return ir::NE::make(a, b); } Expr operator&&(Expr a, Expr b) { - using ir::UIntImm; - if (a.type().is_bool() && b.type().is_bool()) { - const UIntImm* pa = a.as(); - const UIntImm* pb = b.as(); - if (pa && pa->value) return b; - if (pa && !pa->value) return a; - if (pb && pb->value) return a; - if (pb && !pb->value) return b; - } + CHECK(a.type().is_bool()); + CHECK(b.type().is_bool()); + Expr ret = arith::TryConstFold(a, b); + if (ret.defined()) return ret; return ir::And::make(a, b); } Expr operator||(Expr a, Expr b) { - using ir::UIntImm; - if (a.type().is_bool() && b.type().is_bool()) { - const UIntImm* pa = a.as(); - const UIntImm* pb = b.as(); - if (pa && pa->value) return a; - if (pa && !pa->value) return b; - if (pb && pb->value) return b; - if (pb && !pb->value) return a; - } + CHECK(a.type().is_bool()); + CHECK(b.type().is_bool()); + Expr ret = arith::TryConstFold(a, b); + if (ret.defined()) return ret; return ir::Or::make(a, b); } Expr operator!(Expr a) { - using ir::UIntImm; - const UIntImm* pa = a.as(); - if (pa) { - return UIntImm::make(UInt(1), !(pa->value)); - } + CHECK(a.type().is_bool()); + Expr ret = arith::TryConstFold(a); + if 
(ret.defined()) return ret; return ir::Not::make(a); } Expr operator>>(Expr a, Expr b) { + BinaryOpMatchTypes(a, b); TVM_INDEX_CONST_PROPAGATION({ - Type rtype = ta.bits() >= tb.bits() ? ta : tb; + const Type& rtype = a.type(); if (pa && pb) return IntImm::make(rtype, (pa->value >> pb->value)); if (pb) { - if (pb->value == 0) return SimpleCast(rtype, a); + if (pb->value == 0) return a; } }); return ir::Call::make(a.type(), ir::Call::shift_right, { a, b }, ir::Call::PureIntrinsic); } Expr operator<<(Expr a, Expr b) { + BinaryOpMatchTypes(a, b); TVM_INDEX_CONST_PROPAGATION({ - Type rtype = ta.bits() >= tb.bits() ? ta : tb; + const Type& rtype = a.type(); if (pa && pb) return IntImm::make(rtype, (pa->value << pb->value)); if (pb) { - if (pb->value == 0) return SimpleCast(rtype, a); + if (pb->value == 0) return a; } }); return ir::Call::make(a.type(), ir::Call::shift_left, { a, b }, ir::Call::PureIntrinsic); } Expr operator&(Expr a, Expr b) { + BinaryOpMatchTypes(a, b); TVM_INDEX_CONST_PROPAGATION({ - Type rtype = ta.bits() >= tb.bits() ? ta : tb; + const Type& rtype = a.type(); if (pa && pb) return IntImm::make(rtype, (pa->value & pb->value)); }); return ir::Call::make(a.type(), ir::Call::bitwise_and, { a, b }, ir::Call::PureIntrinsic); } Expr operator|(Expr a, Expr b) { + BinaryOpMatchTypes(a, b); TVM_INDEX_CONST_PROPAGATION({ - Type rtype = ta.bits() >= tb.bits() ? ta : tb; + const Type& rtype = a.type(); if (pa && pb) return IntImm::make(rtype, (pa->value | pb->value)); }); return ir::Call::make(a.type(), ir::Call::bitwise_or, { a, b }, ir::Call::PureIntrinsic); } Expr operator^(Expr a, Expr b) { + BinaryOpMatchTypes(a, b); TVM_INDEX_CONST_PROPAGATION({ - Type rtype = ta.bits() >= tb.bits() ? ta : tb; + const Type& rtype = a.type(); if (pa && pb) return IntImm::make(rtype, (pa->value ^ pb->value)); }); return ir::Call::make(a.type(), ir::Call::bitwise_xor, { a, b }, ir::Call::PureIntrinsic); diff --git a/src/op/compute_op.cc b/src/op/compute_op.cc index a6dd39f79b1f..d5fc32ca0ff4 100644 --- a/src/op/compute_op.cc +++ b/src/op/compute_op.cc @@ -40,7 +40,7 @@ int ComputeOpNode::num_outputs() const { return body.size(); } -Array ComputeOpNode::root_iter_vars() const { +Array BaseComputeOpNode::root_iter_vars() const { if (reduce_axis.size() == 0) return axis; Array ret = axis; for (IterVar iv : reduce_axis) { @@ -54,15 +54,15 @@ Type ComputeOpNode::output_dtype(size_t idx) const { return body[idx].type(); } -Array ComputeOpNode::output_shape(size_t idx) const { +Array BaseComputeOpNode::output_shape(size_t idx) const { CHECK_LT(idx, num_outputs()); - // for now, all outputs of ComputeOp have the same shape - std::vector shape; - for (size_t i = 0; i < axis.size(); ++i) { - const Range& r = axis[i]->dom; + // for now, all outputs of a BaseComputeOp have the same shape + Array shape; + for (const auto& ivar : this->axis) { + const Range& r = ivar->dom; shape.push_back(r->extent); } - return Array(shape); + return shape; } Tensor compute(Array shape, @@ -208,7 +208,7 @@ void ComputeOpNode::PropBoundToInputs( for (auto& e : body) ir::PostOrderVisit(e, fvisit); } -void ComputeOpNode::GatherBound( +void BaseComputeOpNode::GatherBound( const Operation& self, const std::unordered_map& tensor_dom, std::unordered_map* out_dom_map) const { @@ -225,22 +225,22 @@ void ComputeOpNode::GatherBound( } } -Stmt ComputeOpNode::BuildRealize( +Stmt BaseComputeOpNode::BuildRealize( const Stage& stage, const std::unordered_map& realize_map, - const Stmt& realize_body) const { + const Stmt& body) const { 
CHECK_EQ(stage->op.get(), this); HalideIR::Internal::Region bounds; for (IterVar iv : this->axis) { bounds.push_back(realize_map.at(iv)); } - Stmt realize = realize_body; + Stmt realize = body; for (int i = this->num_outputs(); i > 0; --i) { Tensor t = stage->op.output(i-1); realize = ir::Realize::make(t->op, t->value_index, t->dtype, bounds, const_true(), realize); // alignment requirement, only useful for compute - for (size_t i = 0; i < this->axis.size(); ++i) { + for (size_t i = 0; i < num_schedulable_dims(); ++i) { auto it = stage->iter_var_attrs.find(this->axis[i]); if (it != stage->iter_var_attrs.end()) { IterVarAttr attr = (*it).second; @@ -259,6 +259,10 @@ Stmt ComputeOpNode::BuildRealize( return realize; } +size_t ComputeOpNode::num_schedulable_dims() const { + return axis.size(); +} + // Build a reduction body. void MakeReduction(const ComputeOpNode* op, const Array& tensors, @@ -414,7 +418,7 @@ Stmt ComputeOpNode::BuildProvide( } ComputeLoopNest ComputeLoopNest::make( - const ComputeOpNode* self, + const BaseComputeOpNode* self, const Stage& stage, const std::unordered_map& dom_map, bool debug_keep_trivial_loop) { @@ -440,8 +444,8 @@ ComputeLoopNest ComputeLoopNest::make( for (IterVar iv : self->reduce_axis) { update_state[iv] = 2; } - for (IterVar iv : self->axis) { - update_state[iv] = 1; + for (size_t i = 0; i < self->num_schedulable_dims(); ++i) { + update_state[self->axis[i]] = 1; } // find which iter var is related to reduction and which is related to axis. schedule::PassDownBitMaskOr(stage, &update_state); diff --git a/src/op/compute_op.h b/src/op/compute_op.h index 87b0814c1ad9..b0264835da5f 100644 --- a/src/op/compute_op.h +++ b/src/op/compute_op.h @@ -41,7 +41,7 @@ struct ComputeLoopNest { * \return The constructed loop nest */ static ComputeLoopNest make( - const ComputeOpNode* self, + const BaseComputeOpNode* self, const Stage& stage, const std::unordered_map& dom_map, bool debug_keep_trivial_loop); diff --git a/src/op/hybrid_op.cc b/src/op/hybrid_op.cc index 0268498c7db2..31c45258abc8 100644 --- a/src/op/hybrid_op.cc +++ b/src/op/hybrid_op.cc @@ -7,8 +7,8 @@ #include #include #include -#include #include +#include #include #include #include diff --git a/src/op/tensor_compute_op.cc b/src/op/tensor_compute_op.cc index 0262db7d8fc5..3ccce0c5d38a 100644 --- a/src/op/tensor_compute_op.cc +++ b/src/op/tensor_compute_op.cc @@ -28,27 +28,10 @@ int TensorComputeOpNode::num_outputs() const { return static_cast(this->intrin->buffers.size() - this->inputs.size()); } -Array TensorComputeOpNode::root_iter_vars() const { - Array ret = axis; - for (IterVar iv : reduce_axis) { - ret.push_back(iv); - } - return ret; -} - Type TensorComputeOpNode::output_dtype(size_t i) const { return this->intrin->buffers[this->inputs.size() + i]->dtype; } -Array TensorComputeOpNode::output_shape(size_t i) const { - Array shape; - for (const auto& ivar : this->axis) { - shape.push_back(ivar->dom->extent); - } - return shape; -} - - Operation TensorComputeOpNode::make(std::string name, std::string tag, Array axis, @@ -121,123 +104,10 @@ void TensorComputeOpNode::PropBoundToInputs( } } -void TensorComputeOpNode::GatherBound( - const Operation& self, - const std::unordered_map& tensor_dom, - std::unordered_map* out_dom_map) const { - const TensorDom& tdom = tensor_dom.at(self.output(0)); - for (size_t i = 0; i < this->axis.size(); ++i) { - Range r = arith::Union(tdom.data.at(i)).cover_range(this->axis[i]->dom); - CHECK(!out_dom_map->count(this->axis[i])); - (*out_dom_map)[this->axis[i]] = r; - } - for 
(size_t i = 0; i < this->reduce_axis.size(); ++i) { - CHECK(!out_dom_map->count(this->reduce_axis[i])); - (*out_dom_map)[this->reduce_axis[i]] = this->reduce_axis[i]->dom; - } -} - -Stmt TensorComputeOpNode::BuildRealize( - const Stage& stage, - const std::unordered_map& realize_map, - const Stmt& body) const { - CHECK_EQ(stage->op.get(), this); - HalideIR::Internal::Region bounds; - for (IterVar iv : this->axis) { - bounds.push_back(realize_map.at(iv)); - } - Stmt realize = body; - for (int i = this->num_outputs(); i > 0; --i) { - Tensor t = stage->op.output(i-1); - realize = ir::Realize::make(t->op, t->value_index, - t->dtype, bounds, const_true(), realize); - // alignment requirement, only useful for compute - for (int i = 0; i < schedulable_ndim; ++i) { - auto it = stage->iter_var_attrs.find(this->axis[i]); - if (it != stage->iter_var_attrs.end()) { - IterVarAttr attr = (*it).second; - if (attr->dim_align_factor != 0) { - Array tuple = {static_cast(i), - attr->dim_align_factor, - attr->dim_align_offset}; - realize = ir::AttrStmt::make( - t, ir::attr::buffer_dim_align, - Call::make(Handle(), ir::intrinsic::tvm_tuple, tuple, Call::Intrinsic), - realize); - } - } - } - } - return realize; -} - -ComputeLoopNest MakeLoopNest( - const TensorComputeOpNode* self, - const Stage& stage, - const std::unordered_map& dom_map, - bool debug_keep_trivial_loop) { - CHECK_EQ(stage->op.operator->(), self); - ComputeLoopNest ret; - // make main loop nest - ret.main_nest = op::MakeLoopNest( - stage, dom_map, 0, false, std::unordered_set(), &ret.main_vmap, - debug_keep_trivial_loop); - ret.main_predicates = schedule::MakeBoundCheck( - stage, dom_map, ret.main_vmap, false, - std::unordered_set()); - for (auto& e : ret.main_predicates) { - e = likely(e); - } - if (stage->store_predicate.defined()) { - ret.main_predicates.push_back(stage->store_predicate); - } - if (self->reduce_axis.size() != 0) { - // try to find the location to insert the initialization. - // Fuse the initialization and provide loop when possible. - std::unordered_map update_state; - for (IterVar iv : self->reduce_axis) { - update_state[iv] = 2; - } - for (int i = 0; i < self->schedulable_ndim; ++i) { - update_state[self->axis[i]] = 1; - } - // find which iter var is related to reduction and which is related to axis. - schedule::PassDownBitMaskOr(stage, &update_state); - auto leaf_iter_vars = stage->leaf_iter_vars; - // first first loop that is related to reduction. - size_t begin_loop = leaf_iter_vars.size(); - for (size_t i = 0; i < leaf_iter_vars.size(); ++i) { - auto iv = leaf_iter_vars[i]; - int flag = update_state.at(iv); - if ((flag & 2) != 0) { - begin_loop = i; break; - } - ret.init_vmap[iv] = ret.main_vmap.at(iv); - } - ret.num_common_loop = begin_loop; - // skip loops that are related to reduction and are unrelated to axis. - std::unordered_set skip_iter; - for (auto kv : update_state) { - int flag = kv.second; - if (flag == 2) skip_iter.insert(kv.first); - } - ret.init_nest = op::MakeLoopNest( - stage, dom_map, begin_loop, true, - skip_iter, &(ret.init_vmap), debug_keep_trivial_loop); - ret.init_predicates = schedule::MakeBoundCheck( - stage, dom_map, ret.init_vmap, true, skip_iter); - for (auto& e : ret.init_predicates) { - e = likely(e); - } - } else { - CHECK_EQ(ret.main_nest.size(), stage->leaf_iter_vars.size() + 1); - ret.num_common_loop = stage->leaf_iter_vars.size(); - } - // copy elison here. 
- return ret; -} +size_t TensorComputeOpNode::num_schedulable_dims() const { + return schedulable_ndim; +} - Stmt TensorComputeOpNode::BuildProvide( const Stage& stage, const std::unordered_map<IterVar, Range>& dom_map, @@ -296,7 +166,7 @@ Stmt TensorComputeOpNode::BuildProvide( ir::ArgBinder binder(&vmap); size_t tloc = stage->leaf_iter_vars.size(); - ComputeLoopNest n = MakeLoopNest(this, stage, dom_map, debug_keep_trivial_loop); + ComputeLoopNest n = ComputeLoopNest::make(this, stage, dom_map, debug_keep_trivial_loop); if (this->reduce_axis.size() == 0) { std::vector<std::vector<Stmt> > nest( diff --git a/src/pass/combine_context_call.cc b/src/pass/combine_context_call.cc index d60256bcfcf0..d3cbb2842134 100644 --- a/src/pass/combine_context_call.cc +++ b/src/pass/combine_context_call.cc @@ -39,7 +39,7 @@ class ContextCallCombiner final : public IRMutator { } Var ctx_var(name, ctx.type()); ctx_map_[ctx] = ctx_var; - return ctx_var; + return std::move(ctx_var); } } else { return IRMutator::Mutate_(op, e); } diff --git a/src/pass/ir_util.h b/src/pass/ir_util.h index 3cef4486ee1b..6af8421398de 100644 --- a/src/pass/ir_util.h +++ b/src/pass/ir_util.h @@ -7,7 +7,7 @@ #define TVM_PASS_IR_UTIL_H_ #include -#include <tvm/ir_operator.h> +#include <tvm/expr_operator.h> #include #include diff --git a/src/pass/lower_intrin.cc b/src/pass/lower_intrin.cc index 1a9caf4b591e..82eabf09b9e3 100644 --- a/src/pass/lower_intrin.cc +++ b/src/pass/lower_intrin.cc @@ -50,7 +50,23 @@ class IntrinInjecter : public IRMutator { // on ARM. if (const Broadcast* bcast = e.as<Broadcast>()) { if (const Cast* cast = bcast->value.as<Cast>()) { - if (cast->type.bits() == cast->value.type().bits() * 2) { + auto should_swap = [&]() { + // Maintain behaviour (int8 -> int16, fp16 -> fp32). + if (cast->type.bits() == cast->value.type().bits() * 2) { + return true; + } + // Check that both operands are integer-like. + if (!cast->type.is_uint() && !cast->type.is_int()) { + return false; + } + if (!cast->value.type().is_uint() && !cast->value.type().is_int()) { + return false; + } + // If both are integer-like, swap if we have a widening cast. + return cast->type.bits() > cast->value.type().bits(); + }; + + if (should_swap()) { Expr new_bcast = Broadcast::make(cast->value, bcast->lanes); return Cast::make(bcast->type, new_bcast); }
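Editor's note (not part of the patch): the lower_intrin change above generalizes the `broadcast(cast(x)) -> cast(broadcast(x))` swap from exact 2x widening to any widening integer cast. The two orders are element-wise identical; replicating on the narrower type is simply cheaper. A scalar sketch of the equivalence:

```cpp
#include <array>
#include <cassert>
#include <cstdint>

int main() {
  int8_t x = -42;
  // Order 1: cast first, then replicate.
  std::array<int32_t, 4> a;
  a.fill(static_cast<int32_t>(x));
  // Order 2: replicate the narrow value, then cast each lane.
  std::array<int8_t, 4> narrow;
  narrow.fill(x);
  std::array<int32_t, 4> b;
  for (int i = 0; i < 4; ++i) b[i] = static_cast<int32_t>(narrow[i]);
  assert(a == b);  // widening casts commute with replication
  return 0;
}
```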
+ Stmt res = IRMutator::Mutate_(op, s); + op = res.as(); + if (is_zero(op->condition)) { + if (op->else_case.defined()) return op->else_case; + return Evaluate::make(0); + } + if (is_one(op->condition)) { + return op->then_case; + } + return res; + } + + Expr Mutate_(const NE* op, const Expr& e) final { + // eager check NE for device check + Expr res = IRMutator::Mutate_(op, e); + op = res.as(); + if (ir::Equal(op->a, op->b)) { + return make_const(op->type, false); + } + return res; + } + + Expr Mutate_(const Variable* op, const Expr& e) final { + if (op == var_) { + return make_const(op->type, device_type_); + } else { + return e; + } + } + public: + const Variable* var_{nullptr}; int device_type_; }; diff --git a/src/pass/storage_flatten.cc b/src/pass/storage_flatten.cc index 488d44544c31..12913dde95af 100644 --- a/src/pass/storage_flatten.cc +++ b/src/pass/storage_flatten.cc @@ -8,7 +8,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/pass/storage_rewrite.cc b/src/pass/storage_rewrite.cc index 9ba9dcde63c9..3f7fd9512eb2 100644 --- a/src/pass/storage_rewrite.cc +++ b/src/pass/storage_rewrite.cc @@ -936,10 +936,8 @@ class VectorAllocRewriter : public IRMutator { tvec[0].lanes() != op->type.lanes()) { int factor = tvec[0].lanes() / op->type.lanes(); Array extents = op->extents; - arith::ModularEntry me = EvalModular( - extents[extents.size() - 1], - std::unordered_map()); - if (me.base % factor == 0 && me.coeff % factor == 0) { + arith::ModularSet me = analyzer_.modular_set(extents[extents.size() - 1]); + if (me->base % factor == 0 && me->coeff % factor == 0) { extents.Set(extents.size() - 1, extents[extents.size() - 1] / make_const(extents[0].type(), factor)); return Allocate::make( @@ -959,6 +957,8 @@ class VectorAllocRewriter : public IRMutator { // Internal access map std::unordered_map > acc_map_; + // internal analyzer + arith::Analyzer analyzer_; }; diff --git a/src/relay/backend/interpreter.cc b/src/relay/backend/interpreter.cc index 4ef893f463e9..3128d2a71159 100644 --- a/src/relay/backend/interpreter.cc +++ b/src/relay/backend/interpreter.cc @@ -578,5 +578,10 @@ TVM_REGISTER_API("relay.backend.CreateInterpreter") .set_body([](TVMArgs args, TVMRetValue* ret) { *ret = CreateInterpreter(args[0], args[1], args[2]); }); + +TVM_REGISTER_NODE_TYPE(ClosureNode); +TVM_REGISTER_NODE_TYPE(TupleValueNode); +TVM_REGISTER_NODE_TYPE(TensorValueNode); + } // namespace relay } // namespace tvm diff --git a/src/relay/backend/param_dict.cc b/src/relay/backend/param_dict.cc new file mode 100644 index 000000000000..87d3dd373e83 --- /dev/null +++ b/src/relay/backend/param_dict.cc @@ -0,0 +1,87 @@ +/*! + * Copyright (c) 2019 by Contributors + * \file param_dict.cc + * \brief Implementation and registration of parameter dictionary + * serializing/deserializing functions. + */ +#include "param_dict.h" + +#include + +#include +#include + +namespace tvm { +namespace relay { + +using namespace runtime; + +TVM_REGISTER_GLOBAL("tvm.relay._save_param_dict") +.set_body([](TVMArgs args, TVMRetValue *rv) { + CHECK_EQ(args.size() % 2, 0u); + // `args` is in the form "key, value, key, value, ..." 
+ size_t num_params = args.size() / 2; + std::vector names; + names.reserve(num_params); + std::vector arrays; + arrays.reserve(num_params); + for (size_t i = 0; i < num_params * 2; i += 2) { + names.emplace_back(args[i].operator std::string()); + arrays.emplace_back(args[i + 1].operator DLTensor*()); + } + std::string bytes; + dmlc::MemoryStringStream strm(&bytes); + dmlc::Stream* fo = &strm; + uint64_t header = kTVMNDArrayListMagic, reserved = 0; + fo->Write(header); + fo->Write(reserved); + fo->Write(names); + { + uint64_t sz = static_cast(arrays.size()); + fo->Write(sz); + for (size_t i = 0; i < sz; ++i) { + tvm::runtime::SaveDLTensor(fo, arrays[i]); + } + } + TVMByteArray arr; + arr.data = bytes.c_str(); + arr.size = bytes.length(); + *rv = arr; + }); + +TVM_REGISTER_GLOBAL("tvm.relay._load_param_dict") +.set_body([](TVMArgs args, TVMRetValue *rv) { + std::string bytes = args[0]; + std::vector names; + dmlc::MemoryStringStream memstrm(&bytes); + dmlc::Stream* strm = &memstrm; + uint64_t header, reserved; + CHECK(strm->Read(&header)) + << "Invalid parameters file format"; + CHECK(header == kTVMNDArrayListMagic) + << "Invalid parameters file format"; + CHECK(strm->Read(&reserved)) + << "Invalid parameters file format"; + CHECK(strm->Read(&names)) + << "Invalid parameters file format"; + uint64_t sz; + strm->Read(&sz, sizeof(sz)); + size_t size = static_cast(sz); + CHECK(size == names.size()) + << "Invalid parameters file format"; + tvm::Array ret; + for (size_t i = 0; i < size; ++i) { + tvm::runtime::NDArray temp; + temp.Load(strm); + auto n = tvm::make_node(); + n->name = std::move(names[i]); + n->array = temp; + ret.push_back(NamedNDArray(n)); + } + *rv = ret; + }); + +TVM_REGISTER_NODE_TYPE(NamedNDArrayNode); + +} // namespace relay +} // namespace tvm diff --git a/src/relay/backend/param_dict.h b/src/relay/backend/param_dict.h new file mode 100644 index 000000000000..0c32d2bf4742 --- /dev/null +++ b/src/relay/backend/param_dict.h @@ -0,0 +1,43 @@ +/*! + * Copyright (c) 2019 by Contributors + * \file param_dict.h + * \brief Definitions for serializing and deserializing parameter dictionaries. + */ +#ifndef TVM_RELAY_BACKEND_PARAM_DICT_H_ +#define TVM_RELAY_BACKEND_PARAM_DICT_H_ + +#include +#include +#include +#include + +#include + +namespace tvm { +namespace relay { + +/*! \brief Magic number for NDArray list file */ +constexpr uint64_t kTVMNDArrayListMagic = 0xF7E58D4F05049CB7; + +/*! + * \brief Wrapper node for naming `NDArray`s. 
+ */ +struct NamedNDArrayNode : public ::tvm::Node { + std::string name; + tvm::runtime::NDArray array; + + void VisitAttrs(tvm::AttrVisitor* v) final { + v->Visit("name", &name); + v->Visit("array", &array); + } + + static constexpr const char* _type_key = "NamedNDArray"; + TVM_DECLARE_NODE_TYPE_INFO(NamedNDArrayNode, Node); +}; + +TVM_DEFINE_NODE_REF(NamedNDArray, NamedNDArrayNode); + +} // namespace relay +} // namespace tvm + +#endif // TVM_RELAY_BACKEND_PARAM_DICT_H_ diff --git a/src/relay/ir/alpha_equal.cc b/src/relay/ir/alpha_equal.cc index 96517f8dd445..aa9336b29153 100644 --- a/src/relay/ir/alpha_equal.cc +++ b/src/relay/ir/alpha_equal.cc @@ -217,20 +217,20 @@ class AlphaEqualHandler: return false; } - bool VisitType_(const GlobalTypeVarNode* op, const Type& t2) final { - return GetRef(op) == t2; + bool VisitType_(const GlobalTypeVarNode* lhs, const Type& other) final { + return GetRef(lhs) == other; } - bool VisitType_(const TypeCallNode* op, const Type& t2) final { - const TypeCallNode* pt = t2.as(); - if (pt == nullptr - || op->args.size() != pt->args.size() - || !TypeEqual(op->func, pt->func)) { + bool VisitType_(const TypeCallNode* lhs, const Type& other) final { + const TypeCallNode* rhs = other.as(); + if (rhs == nullptr + || lhs->args.size() != rhs->args.size() + || !TypeEqual(lhs->func, rhs->func)) { return false; } - for (size_t i = 0; i < op->args.size(); ++i) { - if (!TypeEqual(op->args[i], pt->args[i])) { + for (size_t i = 0; i < lhs->args.size(); ++i) { + if (!TypeEqual(lhs->args[i], rhs->args[i])) { return false; } } @@ -369,8 +369,8 @@ class AlphaEqualHandler: } } - bool VisitExpr_(const OpNode* op, const Expr& other) final { - return op == other.get(); + bool VisitExpr_(const OpNode* lhs, const Expr& other) final { + return lhs == other.get(); } bool VisitExpr_(const ConstantNode* lhs, const Expr& other) final { @@ -389,80 +389,80 @@ class AlphaEqualHandler: } } - bool VisitExpr_(const RefCreateNode* op, const Expr& e2) final { - if (const RefCreateNode* nr = e2.as()) { - return ExprEqual(op->value, nr->value); + bool VisitExpr_(const RefCreateNode* lhs, const Expr& other) final { + if (const RefCreateNode* rhs = other.as()) { + return ExprEqual(lhs->value, rhs->value); } else { return false; } } - bool VisitExpr_(const RefReadNode* op, const Expr& e2) final { - if (const RefReadNode* r = e2.as()) { - return ExprEqual(op->ref, r->ref); + bool VisitExpr_(const RefReadNode* lhs, const Expr& other) final { + if (const RefReadNode* rhs = other.as()) { + return ExprEqual(lhs->ref, rhs->ref); } else { return false; } } - bool VisitExpr_(const RefWriteNode* op, const Expr& e2) final { - if (const RefWriteNode* r = e2.as()) { - return ExprEqual(op->ref, r->ref) && ExprEqual(op->value, r->value); + bool VisitExpr_(const RefWriteNode* lhs, const Expr& other) final { + if (const RefWriteNode* rhs = other.as()) { + return ExprEqual(lhs->ref, rhs->ref) && ExprEqual(lhs->value, rhs->value); } else { return false; } } - bool VisitExpr_(const ConstructorNode* op, const Expr& e2) final { - return GetRef(op) == e2; + bool VisitExpr_(const ConstructorNode* lhs, const Expr& other) final { + return GetRef(lhs) == other; } - bool ClauseEqual(const Clause& l, const Clause& r) { - return PatternEqual(l->lhs, r->lhs) && ExprEqual(l->rhs, r->rhs); + bool ClauseEqual(const Clause& lhs, const Clause& rhs) { + return PatternEqual(lhs->lhs, rhs->lhs) && ExprEqual(lhs->rhs, rhs->rhs); } - bool PatternEqual(const Pattern& l, const Pattern& r) { - return VisitPattern(l, r); + bool 
PatternEqual(const Pattern& lhs, const Pattern& rhs) { + return VisitPattern(lhs, rhs); } - bool VisitPattern_(const PatternWildcardNode* op, const Pattern& r) final { - return r.as(); + bool VisitPattern_(const PatternWildcardNode* lhs, const Pattern& other) final { + return other.as(); } - bool VisitPattern_(const PatternVarNode* op, const Pattern& e2) final { - if (const auto* r = e2.as()) { - return MergeVarDecl(op->var, r->var); + bool VisitPattern_(const PatternVarNode* lhs, const Pattern& other) final { + if (const auto* rhs = other.as()) { + return MergeVarDecl(lhs->var, rhs->var); } return false; } - bool VisitPattern_(const PatternConstructorNode* op, const Pattern& e2) final { - const auto* r = e2.as(); - if (r == nullptr - || !ExprEqual(op->constructor, r->constructor) - || op->patterns.size() != r->patterns.size()) { + bool VisitPattern_(const PatternConstructorNode* lhs, const Pattern& other) final { + const auto* rhs = other.as(); + if (rhs == nullptr + || !ExprEqual(lhs->constructor, rhs->constructor) + || lhs->patterns.size() != rhs->patterns.size()) { return false; } - for (size_t i = 0; i < op->patterns.size(); i++) { - if (!PatternEqual(op->patterns[i], r->patterns[i])) { + for (size_t i = 0; i < lhs->patterns.size(); i++) { + if (!PatternEqual(lhs->patterns[i], rhs->patterns[i])) { return false; } } return true; } - bool VisitExpr_(const MatchNode* op, const Expr& e2) final { - const MatchNode* r = e2.as(); + bool VisitExpr_(const MatchNode* lhs, const Expr& other) final { + const MatchNode* rhs = other.as(); - if (r == nullptr - || !ExprEqual(op->data, r->data) - || op->clauses.size() != r->clauses.size()) { + if (rhs == nullptr + || !ExprEqual(lhs->data, rhs->data) + || lhs->clauses.size() != rhs->clauses.size()) { return false; } - for (size_t i = 0; i < op->clauses.size(); ++i) { - if (!ClauseEqual(op->clauses[i], r->clauses[i])) { + for (size_t i = 0; i < lhs->clauses.size(); ++i) { + if (!ClauseEqual(lhs->clauses[i], rhs->clauses[i])) { return false; } } diff --git a/src/relay/ir/expr_functor.cc b/src/relay/ir/expr_functor.cc index 6265873d8310..8d2163e0ecc8 100644 --- a/src/relay/ir/expr_functor.cc +++ b/src/relay/ir/expr_functor.cc @@ -364,7 +364,7 @@ class ExprBinder : public ExprMutator { if (it != args_map_.end()) { return (*it).second; } else { - return id; + return std::move(id); } } diff --git a/src/relay/ir/module.cc b/src/relay/ir/module.cc index da273265ae33..dc7b3074d2ef 100644 --- a/src/relay/ir/module.cc +++ b/src/relay/ir/module.cc @@ -83,6 +83,7 @@ void ModuleNode::Add(const GlobalVar& var, CHECK(AlphaEqual(type, old_type)) << "Module#update changes type, not possible in this mode."; } + var->checked_type_ = type; AddUnchecked(var, checked_func); } diff --git a/src/relay/ir/type_functor.cc b/src/relay/ir/type_functor.cc index b88d0ee0e3ab..a05da3a980f4 100644 --- a/src/relay/ir/type_functor.cc +++ b/src/relay/ir/type_functor.cc @@ -192,7 +192,7 @@ class TypeBinder : public TypeMutator { if (it != args_map_.end()) { return (*it).second; } else { - return id; + return std::move(id); } } diff --git a/src/relay/op/debug.cc b/src/relay/op/debug.cc index 4c9b0a5ca83e..4a5a7a86f1ea 100644 --- a/src/relay/op/debug.cc +++ b/src/relay/op/debug.cc @@ -4,13 +4,13 @@ * \brief Property def of nn operators. 
*/ +#include #include #include #include #include #include "./type_relations.h" #include "./op_common.h" -#include "./layout.h" namespace tvm { namespace relay { diff --git a/src/relay/op/image/resize.cc b/src/relay/op/image/resize.cc index e6efcb8ce459..d92e380fa9cc 100644 --- a/src/relay/op/image/resize.cc +++ b/src/relay/op/image/resize.cc @@ -3,11 +3,11 @@ * \file resize.cc * \brief Image operators */ +#include #include #include #include #include -#include "../layout.h" #include "../op_common.h" namespace tvm { @@ -28,17 +28,18 @@ bool ResizeRel(const Array& types, const ResizeAttrs* param = attrs.as(); CHECK(param != nullptr); const Layout in_layout(param->layout); - CHECK(in_layout.Convertible(kNCHW)) + auto layout_converter = BijectiveLayoutNode::make(in_layout, kNCHW); + CHECK(layout_converter.defined()) << "Resize only support input layouts that are convertible from NCHW." << " But got " << in_layout; - auto oshape = ConvertLayout(data->shape, in_layout, kNCHW); - oshape[2] = param->size[0]; - oshape[3] = param->size[1]; + auto oshape = layout_converter.ForwardShape(data->shape); + oshape.Set(2, param->size[0]); + oshape.Set(3, param->size[1]); // assign output type reporter->Assign(types[1], - TensorTypeNode::make(ConvertLayout(oshape, kNCHW, in_layout), + TensorTypeNode::make(layout_converter.BackwardShape(oshape), data->dtype)); return true; } diff --git a/src/relay/op/layout.cc b/src/relay/op/layout.cc deleted file mode 100644 index 98fea55aa4c1..000000000000 --- a/src/relay/op/layout.cc +++ /dev/null @@ -1,80 +0,0 @@ -/*! - * Copyright (c) 2018 by Contributors - * \file src/relay/op/layout.cc - * \brief Layout expression. - */ - -#include "layout.h" - -namespace tvm { -namespace relay { - -TVM_REGISTER_NODE_TYPE(LayoutNode); - -std::vector ConvertLayout( - std::vector src, - const Layout& src_layout, - const Layout& dst_layout) { - CHECK_EQ(src_layout.ndim(), src.size()); - if (src_layout == dst_layout) { - return src; - } else if (!src_layout.defined()) { - LOG(FATAL) << "cannot convert undefined layout to " << dst_layout; - } else if (!dst_layout.defined()) { - LOG(FATAL) << "cannot convert " << src_layout << " to undefined layout"; - } - - CHECK(src_layout.Convertible(dst_layout)) - << "cannot convert from " - << src_layout << " to " << dst_layout; - - std::vector dst(dst_layout.ndim()); - for (size_t i = 0; i < src_layout.ndim(); ++i) { - Layout::LayoutDim src_dim = src_layout[i]; - if (Layout::IsSuperdim(src_dim)) { - int dst_major_pos = dst_layout.Indexof(Layout::ToSuperdim(src_dim)); - int dst_minor_pos = dst_layout.Indexof(Layout::ToSubdim(src_dim)); - int src_minor_pos = src_layout.Indexof(Layout::ToSubdim(src_dim)); - int src_factor = src_layout.Subsizeof(src_dim); - int dst_factor = dst_layout.Subsizeof(src_dim); - IndexExpr src_dim_size = src[i]; - - if (src_minor_pos >= 0) { - CHECK(is_const_int(src[src_minor_pos], src_factor)) - << "src shape " << Array(src) - << " does not agree with layout " - << src_layout; - src_dim_size *= src_factor; - } - dst[dst_major_pos] = src_dim_size; - if (dst_minor_pos >= 0) { - CHECK_GT(dst_factor, 0); - if (const int64_t* const_src_dim_size = as_const_int(src_dim_size)) { - CHECK_LE(dst_factor, const_src_dim_size[0]) - << "Converting " << Array(src) - << " from " << src_layout - << " to " << dst_layout - << ": cannot split dimension size of " - << src_dim_size << " by " << dst_factor; - } - dst[dst_major_pos] /= dst_factor; - dst[dst_minor_pos] = dst_factor; - } - } - } - return dst; -} - -std::vector ConvertLayout( - const Array& 
src, - const Layout& src_layout, - const Layout& dst_layout) { - std::vector ret(src.size()); - for (size_t i = 0; i < src.size(); ++i) { - ret[i] = src[i]; - } - return ConvertLayout(ret, src_layout, dst_layout); -} - -} // namespace relay -} // namespace tvm diff --git a/src/relay/op/layout.h b/src/relay/op/layout.h deleted file mode 100644 index 09cf3a9cf780..000000000000 --- a/src/relay/op/layout.h +++ /dev/null @@ -1,432 +0,0 @@ -/*! - * Copyright (c) 2018 by Contributors - * \file relay/op/layout.h - * \brief Layout expression. - * - * This file is adapted from its nnvm counterpart and will keep involving - * to the new layout system - * - * The layout is composed of upper cases, lower cases and numbers, - * where upper case indicates a (super-)dimension and - * the corresponding lower case with factor size indicates the split (sub-)dimension. - * For example, NCHW16c can describe a 5-D tensor of - * [batch_size, channel, height, width, channel_block]. - * Here sub-dimension channel_block=16 is the split of super-dimension C (channel). - */ -#ifndef TVM_RELAY_OP_LAYOUT_H_ -#define TVM_RELAY_OP_LAYOUT_H_ - -#include -#include -#include - -#include -#include -#include -#include -#include - -namespace tvm { -namespace relay { - -class LayoutNode : public Node { - public: - std::string name; - Array superdim_pos; - Array subdim_pos; - Array subdim_size; - Array layout_simplified; - - void VisitAttrs(AttrVisitor* v) final { - v->Visit("name", &name); - v->Visit("superdim_pos", &superdim_pos); - v->Visit("subdim_pos", &subdim_pos); - v->Visit("subdim_size", &subdim_size); - v->Visit("layout_simplified", &layout_simplified); - } - - static constexpr const char* _type_key = "Layout"; - TVM_DECLARE_NODE_TYPE_INFO(LayoutNode, Node); -}; - -class Layout : public NodeRef { - public: - using LayoutDim = char; - static constexpr uint32_t kUniqueDim = 26; - - explicit Layout(NodePtr n) : NodeRef(n) {} - - /*! \brief default constructor */ - Layout() : Layout("__undef__") {} // NOLINT(*) - - /*! \brief construct from a string */ - Layout(const char* name) : Layout(std::string(name)) {} // NOLINT(*) - - /*! - * \brief construct from a string. - * \param layout input in layout convention: - * upper case indicates a dimension and - * the corresponding lower case with factor size - * indicates the split dimension. - * return undefined layout if "__undef__" is passed. 
- */ - Layout(const std::string& name) { // NOLINT(*) - node_ = make_node(); - - std::vector superdim_pos(kUniqueDim, -1); - std::vector subdim_pos(kUniqueDim, -1); - std::vector subdim_size(kUniqueDim, -1); - std::vector layout_simplified; - - if (name != "__undef__") { // parse layout string - int32_t factor = 0; - uint32_t curr = 0; - for (size_t i = 0; i < name.size(); ++i) { - const LayoutDim c = name.at(i); - if (IsSuperdim(c)) { - int pos = c - 'A'; - CHECK_EQ(factor, 0) << "Invalid layout " << name - << ": invalid factor size " << factor - << " before dimension " << c; - CHECK_EQ(superdim_pos[pos], -1) << "Invalid layout " << name - << ": duplicate dimension " << c; - superdim_pos[pos] = curr++; - layout_simplified.push_back(c); - } else if (IsSubdim(c)) { - int pos = c - 'a'; - CHECK_GT(factor, 0) << "Invalid layout " << name << ": invalid factor size " - << factor << " for dimension " << c; - CHECK_EQ(subdim_pos[pos], -1) << "Invalid layout " << name - << ": duplicate dimension " << c; - CHECK_EQ(subdim_size[pos], -1) << "Invalid layout " << name - << ": duplicate dimension " << c; - subdim_pos[pos] = curr++; - subdim_size[pos] = factor; - layout_simplified.push_back(c); - factor = 0; - } else if (c >= '0' && c <= '9') { - CHECK(factor >= 0) << "Invalid layout " << name << ": _ is adjacent to a number."; - factor = factor * 10 + c - '0'; - } else { - LOG(FATAL) << "Invalid layout " << name; - } - } - for (LayoutDim dim : layout_simplified) { - CHECK(IsSuperdim(dim) || superdim_pos[dim-'a'] >= 0) - << "Invalid layout " << name << ": missing axis " - << static_cast(dim - 'a' + 'A'); - } - } - - LayoutNode *node = operator->(); - node->name = name; - - for (uint32_t i = 0; i < kUniqueDim; ++i) { - node->superdim_pos.push_back(superdim_pos[i]); - node->subdim_pos.push_back(subdim_pos[i]); - node->subdim_size.push_back(subdim_size[i]); - } - for (LayoutDim dim : layout_simplified) { - node->layout_simplified.push_back(dim); - } - } - - /*! - * \brief access the internal node container - * \return the pointer to the internal node container - */ - const LayoutNode* operator->() const { - return static_cast(node_.get()); - } - - /*! - * \brief access the internal node container - * \return the pointer to the internal node container - */ - LayoutNode* operator->() { - return static_cast(node_.get()); - } - - /*! - * \brief Check whether a given dimension is a super-dimension. - * \param dim input dimension - * \return Whether a given dimension is a super-dimension. - */ - static bool IsSuperdim(LayoutDim dim) { - return dim >= 'A' && dim <= 'Z'; - } - - /*! - * \brief Check whether a given dimension is a sub-dimension. - * \param dim input dimension - * \return Whether a given dimension is a sub-dimension. - */ - static bool IsSubdim(LayoutDim dim) { - return dim >= 'a' && dim <= 'z'; - } - - /*! - * \brief Convert a given dimension to super-dimension. - * \param dim input dimension - * \return The converted description. - */ - static LayoutDim ToSuperdim(LayoutDim dim) { - if (IsSubdim(dim)) { - return dim - 'a' + 'A'; - } - return dim; - } - - /*! - * \brief Convert a given dimension to sub-dimension. - * \param dim input dimension - * \return The converted description. - */ - static LayoutDim ToSubdim(LayoutDim dim) { - if (IsSuperdim(dim)) { - return dim - 'A' + 'a'; - } - return dim; - } - - /*! - * \brief Return an undefined layout. - * \return a (global) undefined layout. - */ - static const Layout& Undef() { - static Layout undef; - return undef; - } - - /*! 
- * \brief Two layouts are convertible only if - * they have same set of super-dimensions. - * e.g., NCHW, NCHW16c, NHWC are convertible between each other, - * but NCHW, CHW, OIHW are not. - * \param dst the target layout - * \return Whether can be converted to dst layout. - */ - bool Convertible(const Layout &dst) const { - const LayoutNode *n = operator->(); - if (!this->defined() || !dst.defined()) return false; - for (size_t i = 0; i < kUniqueDim; ++i) { - if ((n->superdim_pos[i]->value >= 0 && dst->superdim_pos[i]->value < 0) || - (n->superdim_pos[i]->value < 0 && dst->superdim_pos[i]->value >= 0)) { - return false; - } - } - return true; - } - - /*! - * \brief Returns a sublayout which is the portion of the object - * that starts at dimension \p pos and spans \p len dimensions - * (or until the end of the layout, whichever comes first). - * \param pos The start position. - * \param len The length of the sub-layout. - * \return A newly constructed Layout object. - */ - Layout Sublayout(size_t pos, size_t len) const { - const Array& layout_simplified = operator->()->layout_simplified; - if (pos > ndim()) return Layout::Undef(); - if (pos + len > ndim()) len = ndim() - pos; - std::ostringstream new_layout; - for (size_t i = pos; i < pos + len; ++i) { - if (IsSubdim(layout_simplified[i]->value)) { - auto block_size = this->Subsizeof(layout_simplified[i]->value); - CHECK_GT(block_size, 0); - new_layout << block_size; - } - new_layout << static_cast(layout_simplified[i]->value); - } - return Layout(new_layout.str()); - } - - /*! \return A newly constructed reversed Layout object. */ - Layout Reverse() const { - const Array& layout_simplified = operator->()->layout_simplified; - if (!this->defined()) return Layout::Undef(); - std::ostringstream new_layout; - for (int64_t i = this->ndim() - 1; i >= 0; --i) { - if (IsSubdim(layout_simplified[i]->value)) { - auto block_size = this->Subsizeof(layout_simplified[i]->value); - CHECK_GT(block_size, 0); - new_layout << block_size; - } - new_layout << layout_simplified[i]->value; - } - return Layout(new_layout.str()); - } - - /*! - * \brief Split \p dim by \p size and put the sub-dimension to position \p target_pos. - * \param dim The source dimension to be split. It must be a super-dimension. - * \param target_pos The target position of the newly split sub-dimension. - * \param size size of the sub-dimension. - * \return A newly constructed Layout object. - */ - Layout Split(LayoutDim dim, size_t target_pos, uint32_t size) const { - const std::string &name = operator->()->name; - CHECK(target_pos <= this->ndim()) << "Invalid split position " - << target_pos << " for layout " << name; - CHECK(IsSuperdim(dim)) << "Cannot split a sub-dimension " << dim; - CHECK(this->Contains(dim)) << "Axis " << dim << " does not exist in " << name; - CHECK(!this->Contains(ToSubdim(dim))) << "Dimension " << dim - << " has already been split in " - << name; - CHECK(size > 0) << "Invalid split size " << size; - std::ostringstream new_layout; - for (size_t i = 0; i <= this->ndim(); ++i) { - if (i == target_pos) { - new_layout << size << Layout::ToSubdim(dim); - } - if (i == this->ndim()) break; - new_layout << this->at(i); - } - Layout x(new_layout.str()); - return x; - } - - - /*! \return number of dimensions */ - size_t ndim() const { - return operator->()->layout_simplified.size(); - } - - /*! 
\return number of super dimensions */ - size_t ndim_super() const { - size_t ct = 0; - for (auto x : operator->()->layout_simplified) { - if (IsSuperdim(x)) - ct++; - } - return ct; - } - - /*! - * \brief The description of the \p i-th dimension. - * If it is a sub-dimension, the size will be returned as well, - * e.g., 16c. Otherwise a single character is returned, e.g., C. - * \param i The position - * \return the description of the dimension. - */ - std::string at(size_t i) const { - const Array& layout_simplified = operator->()->layout_simplified; - CHECK_LT(i, this->ndim()) << "position " << i - << " exceeds ndim=" << this->ndim(); - std::ostringstream repr; - if (IsSubdim(layout_simplified[i]->value)) { - auto factor = Subsizeof(layout_simplified[i]->value); - CHECK_GT(factor, 0); - repr << factor; - } - repr << static_cast(layout_simplified[i]->value); - return repr.str(); - } - - /*! - * \brief return the index of the input dimension. - * If it is not found in the layout or the layout is undefined, - * return -1. - * \param dim the input dimension. - * \return the index or -1 if not found. - */ - int32_t Indexof(LayoutDim dim) const { - if (!this->defined()) return -1; - else if (IsSuperdim(dim)) return operator->()->superdim_pos[dim - 'A']->value; - else if (IsSubdim(dim)) return operator->()->subdim_pos[dim - 'a']->value; - return -1; - } - - /*! - * \param dim the input super-dimension or sub-dimension. - * \return the size of the sub-dimension of \p dim (if \p dim is a super-dimension), - * or the size of \p dim itself (if \p dim is a sub-dimension). - * Return -1 if \p dim is not in the layout or the layout is undefined. - */ - int64_t Subsizeof(LayoutDim dim) const { - CHECK(IsSuperdim(dim) || IsSubdim(dim)) << "Invalid dim " << dim; - if (!this->defined() || !this->Contains(ToSubdim(dim))) { - return -1; - } - int idx = ToSubdim(dim) - 'a'; - return operator->()->subdim_size[idx]->value; - } - - /*! - * \brief Whether the layout contains a dimension. - * \param dim dimension to be checked. - * \return Whether the layout contains the dimension. - */ - bool Contains(LayoutDim dim) const { - if (IsSuperdim(dim)) { - return operator->()->superdim_pos[dim-'A']->value >= 0; - } else if (IsSubdim(dim)) { - return operator->()->subdim_pos[dim-'a']->value >= 0; - } - return false; - } - - LayoutDim operator[](size_t i) const { - return operator->()->layout_simplified[i]; - } - - /*! \return whether the layout is defined */ - bool defined() const { - return operator->()->name != "__undef__"; - } - /*! \return the string description of the layout */ - const std::string& name() const { - return operator->()->name; - } - - /*! - * \brief Whether the two layouts are equal. - * \param rhs Another layout. - * \return whether the two layouts are equal. - */ - bool Equals(const Layout &rhs) const { - return operator->()->name == rhs->name; - } - - /*! - * \brief allow output string of layout to ostream - * \param os the output stream - * \param l the layout - * \return the ostream - */ - friend std::ostream& operator<<(std::ostream& os, const Layout& l) { - os << l.name(); - return os; - } - - using ContainerType = LayoutNode; -}; - -/*! - * \brief Convert shape in src_layout to shape in dst_layout - * \param src original shape - * \param src_layout layout of original shape - * \param dst_layout target layout - * \return shape in target layout - */ -std::vector ConvertLayout( - std::vector src, - const Layout& src_layout, - const Layout& dst_layout); - -/*! 
- * \brief Convert shape in src_layout to shape in dst_layout - * \param src original shape - * \param src_layout layout of original shape - * \param dst_layout target layout - * \return shape in target layout - */ -std::vector ConvertLayout( - const Array& src, - const Layout& src_layout, - const Layout& dst_layout); -} // namespace relay -} // namespace tvm - -#endif // TVM_RELAY_OP_LAYOUT_H_ diff --git a/src/relay/op/nn/convolution.cc b/src/relay/op/nn/convolution.cc index e05b24d967bc..963257a14961 100644 --- a/src/relay/op/nn/convolution.cc +++ b/src/relay/op/nn/convolution.cc @@ -3,12 +3,12 @@ * \file convolution.cc * \brief Convolution operators */ +#include #include #include #include #include "../../pass/alter_op_layout.h" -#include "../layout.h" namespace tvm { namespace relay { @@ -31,32 +31,36 @@ bool Conv2DRel(const Array& types, CHECK(param != nullptr); const Layout in_layout(param->data_layout); const Layout kernel_layout(param->kernel_layout); - CHECK(in_layout.Convertible(kNCHW)) + + const auto trans_in_layout = BijectiveLayoutNode::make(in_layout, kNCHW); + CHECK(trans_in_layout.defined()) << "Conv only support input layouts that are convertible from NCHW." << " But got " << in_layout; - CHECK(kernel_layout.Convertible(kOIHW)) + + const auto trans_kernel_layout = BijectiveLayoutNode::make(kernel_layout, kOIHW); + CHECK(trans_kernel_layout.defined()) << "Conv only support kernel layouts that are convertible from OIHW." << " But got "<< kernel_layout; Layout out_layout(param->out_layout == "" ? param->data_layout : param->out_layout); - CHECK(out_layout.Convertible(kNCHW)) + const auto trans_out_layout = BijectiveLayoutNode::make(out_layout, kNCHW); + CHECK(trans_out_layout.defined()) << "Conv only support output layouts that are convertible from NCHW." << " But got " << out_layout; - std::vector dshape_nchw = ConvertLayout( - data->shape, in_layout, kNCHW); + Array dshape_nchw = trans_in_layout.ForwardShape(data->shape); IndexExpr channels, dilated_ksize_y, dilated_ksize_x; // infer weight if the kernel_size and channels are defined if (param->kernel_size.defined() && param->channels.defined()) { CHECK_EQ(param->kernel_size.size(), 2); CHECK_EQ(param->dilation.size(), 2); - std::vector wshape( + Array wshape( {param->channels, dshape_nchw[1] / param->groups, param->kernel_size[0], param->kernel_size[1]}); - wshape = ConvertLayout(wshape, kOIHW, kernel_layout); + wshape = trans_kernel_layout.BackwardShape(wshape); channels = param->channels; dilated_ksize_y = 1 + (param->kernel_size[0] - 1) * param->dilation[0]; dilated_ksize_x = 1 + (param->kernel_size[1] - 1) * param->dilation[1]; @@ -65,7 +69,7 @@ bool Conv2DRel(const Array& types, } else { // use weight to infer the conv shape. 
if (weight == nullptr) return false; - auto wshape = ConvertLayout(weight->shape, kernel_layout, kOIHW); + auto wshape = trans_kernel_layout.ForwardShape(weight->shape); if (param->kernel_size.defined()) { CHECK_EQ(param->kernel_size.size(), 2); // check the size @@ -73,13 +77,13 @@ bool Conv2DRel(const Array& types, reporter->AssertEQ(param->kernel_size[1], wshape[3])) << "Conv2D: shape of weight is inconsistent with kernel_size, " << " kernel_size=" << param->kernel_size - << " wshape=" << Array(wshape); + << " wshape=" << wshape; } if (param->channels.defined()) { CHECK(reporter->AssertEQ(param->channels, wshape[0])) << "Conv2D: shape of weight is inconsistent with channels, " << " channels=" << param->channels - << " wshape=" << Array(wshape); + << " wshape=" << wshape; } CHECK(reporter->AssertEQ(dshape_nchw[1] / param->groups, wshape[1])); channels = wshape[0]; @@ -87,15 +91,15 @@ bool Conv2DRel(const Array& types, dilated_ksize_x = 1 + (wshape[3] - 1) * param->dilation[1]; } // dilation - std::vector oshape({dshape_nchw[0], channels, 0, 0}); + Array oshape({dshape_nchw[0], channels, 0, 0}); - oshape[2] = (dshape_nchw[2] + param->padding[0] * 2 - dilated_ksize_y) / param->strides[0] + 1; - oshape[3] = (dshape_nchw[3] + param->padding[1] * 2 - dilated_ksize_x) / param->strides[1] + 1; + oshape.Set(2, (dshape_nchw[2] + param->padding[0] * 2 - dilated_ksize_y) / param->strides[0] + 1); + oshape.Set(3, (dshape_nchw[3] + param->padding[1] * 2 - dilated_ksize_x) / param->strides[1] + 1); DataType out_dtype = param->out_dtype; if (out_dtype.bits() == 0) { out_dtype = data->dtype; } - oshape = ConvertLayout(oshape, kNCHW, out_layout); + oshape = trans_out_layout.BackwardShape(oshape); // assign output type reporter->Assign(types[2], TensorTypeNode::make(oshape, out_dtype)); return true; @@ -193,33 +197,38 @@ bool Conv2DTransposeRel(const Array& types, CHECK(param != nullptr); const Layout in_layout(param->data_layout); const Layout kernel_layout(param->kernel_layout); - CHECK(in_layout.Convertible(kNCHW)) + + const auto trans_in_layout = BijectiveLayoutNode::make(in_layout, kNCHW); + CHECK(trans_in_layout.defined()) << "Conv only support input layouts that are convertible from NCHW." << " But got " << in_layout; - CHECK(kernel_layout.Convertible(kOIHW)) + + const auto trans_kernel_layout = BijectiveLayoutNode::make(kernel_layout, kOIHW); + CHECK(trans_kernel_layout.defined()) << "Conv only support kernel layouts that are convertible from OIHW." << " But got "<< kernel_layout; Layout out_layout(param->out_layout == "" ? param->data_layout : param->out_layout); - CHECK(out_layout.Convertible(kNCHW)) + const auto trans_out_layout = BijectiveLayoutNode::make(out_layout, kNCHW); + CHECK(trans_out_layout.defined()) << "Conv only support output layouts that are convertible from NCHW." 
<< " But got " << out_layout; IndexExpr channels, dilated_ksize_y, dilated_ksize_x; - auto dshape_nchw = ConvertLayout(data->shape, in_layout, kNCHW); + auto dshape_nchw = trans_in_layout.ForwardShape(data->shape); // infer weight if the kernel_size and channels are defined if (param->kernel_size.defined() && param->channels.defined()) { CHECK_EQ(param->kernel_size.size(), 2); CHECK_EQ(param->dilation.size(), 2); - std::vector wshape({dshape_nchw[1], - param->channels / param->groups, - param->kernel_size[0], - param->kernel_size[1]}); + Array wshape({dshape_nchw[1], + param->channels / param->groups, + param->kernel_size[0], + param->kernel_size[1]}); - wshape = ConvertLayout(wshape, kOIHW, kernel_layout); + wshape = trans_kernel_layout.BackwardShape(wshape); dilated_ksize_y = 1 + (param->kernel_size[0] - 1) * param->dilation[0]; dilated_ksize_x = 1 + (param->kernel_size[1] - 1) * param->dilation[1]; channels = param->channels; @@ -229,7 +238,7 @@ bool Conv2DTransposeRel(const Array& types, } else { // use weight to infer the conv shape. if (weight == nullptr) return false; - auto wshape = ConvertLayout(weight->shape, kernel_layout, kOIHW); + auto wshape = trans_kernel_layout.ForwardShape(weight->shape); if (param->kernel_size.defined()) { CHECK_EQ(param->kernel_size.size(), 2); // check the size @@ -251,17 +260,17 @@ bool Conv2DTransposeRel(const Array& types, dilated_ksize_x = 1 + (wshape[3] - 1) * param->dilation[1]; } // dilation - std::vector oshape({dshape_nchw[0], channels, 0, 0}); - oshape[2] = (param->strides[0] * (dshape_nchw[2] - 1) + dilated_ksize_y - - 2 * param->padding[0] + param->output_padding[0]); - oshape[3] = (param->strides[1] * (dshape_nchw[3] - 1) + dilated_ksize_x - - 2 * param->padding[1] + param->output_padding[1]); + Array oshape({dshape_nchw[0], channels, 0, 0}); + oshape.Set(2, (param->strides[0] * (dshape_nchw[2] - 1) + dilated_ksize_y - + 2 * param->padding[0] + param->output_padding[0])); + oshape.Set(3, (param->strides[1] * (dshape_nchw[3] - 1) + dilated_ksize_x - + 2 * param->padding[1] + param->output_padding[1])); DataType out_dtype = param->out_dtype; if (out_dtype.bits() == 0) { out_dtype = data->dtype; } - oshape = ConvertLayout(oshape, kNCHW, out_layout); + oshape = trans_out_layout.BackwardShape(oshape); reporter->Assign(types[2], TensorTypeNode::make(oshape, out_dtype)); return true; } @@ -349,20 +358,24 @@ bool Conv2DWinogradRel(const Array& types, CHECK(param != nullptr); const Layout in_layout(param->data_layout); const Layout kernel_layout(param->kernel_layout); - CHECK(in_layout.Convertible(kNCHW)) + + const auto trans_in_layout = BijectiveLayoutNode::make(in_layout, kNCHW); + CHECK(trans_in_layout.defined()) << "Conv only support input layouts that are convertible from NCHW." << " But got " << in_layout; - CHECK(kernel_layout.Convertible(kOIHW)) + + const auto trans_kernel_layout = BijectiveLayoutNode::make(kernel_layout, kOIHW); + CHECK(trans_kernel_layout.defined()) << "Conv only support kernel layouts that are convertible from OIHW." << " But got "<< kernel_layout; Layout out_layout(param->out_layout == "" ? param->data_layout : param->out_layout); - CHECK(out_layout.Convertible(kNCHW)) + const auto trans_out_layout = BijectiveLayoutNode::make(out_layout, kNCHW); + CHECK(trans_out_layout.defined()) << "Conv only support output layouts that are convertible from NCHW." 
<< " But got " << out_layout; - std::vector<IndexExpr> dshape_nchw = ConvertLayout( - data->shape, in_layout, kNCHW); + Array<IndexExpr> dshape_nchw = trans_in_layout.ForwardShape(data->shape); IndexExpr channels, dilated_ksize_y, dilated_ksize_x; @@ -384,15 +397,15 @@ bool Conv2DWinogradRel(const Array<Type>& types, // can handle this correctly in alter_op_layout. // dilation - std::vector<IndexExpr> oshape({dshape_nchw[0], channels, 0, 0}); + Array<IndexExpr> oshape({dshape_nchw[0], channels, 0, 0}); - oshape[2] = (dshape_nchw[2] + param->padding[0] * 2 - dilated_ksize_y) / param->strides[0] + 1; - oshape[3] = (dshape_nchw[3] + param->padding[1] * 2 - dilated_ksize_x) / param->strides[1] + 1; + oshape.Set(2, (dshape_nchw[2] + param->padding[0] * 2 - dilated_ksize_y) / param->strides[0] + 1); + oshape.Set(3, (dshape_nchw[3] + param->padding[1] * 2 - dilated_ksize_x) / param->strides[1] + 1); DataType out_dtype = param->out_dtype; if (out_dtype.bits() == 0) { out_dtype = data->dtype; } - oshape = ConvertLayout(oshape, kNCHW, out_layout); + oshape = trans_out_layout.BackwardShape(oshape); // assign output type reporter->Assign(types[2], TensorTypeNode::make(oshape, out_dtype)); return true; diff --git a/src/relay/op/nn/nn.cc b/src/relay/op/nn/nn.cc index 7ed43d0df019..59f68d9d8880 100644 --- a/src/relay/op/nn/nn.cc +++ b/src/relay/op/nn/nn.cc @@ -4,6 +4,7 @@ * \brief Property def of nn operators. */ +#include <tvm/data_layout.h> #include #include #include @@ -14,7 +15,6 @@ #include "../type_relations.h" #include "../../pass/alter_op_layout.h" #include "../op_common.h" -#include "../layout.h" namespace tvm { namespace relay { @@ -654,5 +654,68 @@ axis to be the last item in the input shape. .set_support_level(1) .add_type_rel("BatchNorm", BatchNormRel); + +// relay.nn.batch_matmul +bool BatchMatmulRel(const Array<Type>& types, + int num_inputs, + const Attrs& attrs, + const TypeReporter& reporter) { + CHECK_EQ(types.size(), 3); + const auto* x = types[0].as<TensorTypeNode>(); + const auto* y = types[1].as<TensorTypeNode>(); + if (x == nullptr || y == nullptr) return false; + if (x->shape.size() != 3 || y->shape.size() != 3) return false; + CHECK(reporter->AssertEQ(x->shape[0], y->shape[0])) + << "BatchMatmul: batch dimension doesn't match, " + << " x shape=" << x->shape + << ", y shape=" << y->shape; + CHECK(reporter->AssertEQ(x->shape[2], y->shape[2])) + << "BatchMatmul: shapes of x and y are inconsistent, " + << " x shape=" << x->shape + << ", y shape=" << y->shape; + + Array<tvm::Expr> oshape = x->shape; + oshape.Set(2, y->shape[1]); + + // assign output type + reporter->Assign(types[2], TensorTypeNode::make(oshape, x->dtype)); + return true; +} + + +// Positional relay function to create batch_matmul operator used by frontend FFI. +Expr MakeBatchMatmul(Expr x, + Expr y) { + static const Op& op = Op::Get("nn.batch_matmul"); + return CallNode::make(op, {x, y}, Attrs(), {}); +} + + +TVM_REGISTER_API("relay.op.nn._make.batch_matmul") +.set_body([](const TVMArgs& args, TVMRetValue* rv) { + runtime::detail::unpack_call<Expr, 2>(MakeBatchMatmul, args, rv); + }); + + +RELAY_REGISTER_OP("nn.batch_matmul") +.describe(R"code(Computes matrix multiplication of `x` and `y` when `x` and `y` +are batches of matrices. + +.. math:: + + batch\_matmul(x, y)[i, :, :] = matmul(x[i, :, :], y[i, :, :]^T) + +- **x**: `(b, m, k)` +- **y**: `(b, n, k)` +- **out**: `(b, m, n)`.
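+ +- For example, `x` of shape `(8, 16, 32)` and `y` of shape `(8, 20, 32)` produce an output of shape `(8, 16, 20)`, since each `y[i, :, :]` is transposed before the multiplication.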
+ +)code" TVM_ADD_FILELINE) +.set_num_inputs(2) +.add_argument("x", "3D Tensor", "First input.") +.add_argument("y", "3D Tensor", "Second input.") +.set_support_level(10) +.add_type_rel("BatchMatmul", BatchMatmulRel); + + } // namespace relay } // namespace tvm diff --git a/src/relay/op/nn/pad.cc b/src/relay/op/nn/pad.cc index dc99f05f4d2d..5bab6399151a 100644 --- a/src/relay/op/nn/pad.cc +++ b/src/relay/op/nn/pad.cc @@ -3,12 +3,12 @@ * \file pad.cc * \brief Implementation of operator pad */ -#include +#include +#include #include #include #include #include -#include "../layout.h" #include "../op_common.h" namespace tvm { diff --git a/src/relay/op/nn/pooling.cc b/src/relay/op/nn/pooling.cc index 8fd33e1f3cdc..23704693732b 100644 --- a/src/relay/op/nn/pooling.cc +++ b/src/relay/op/nn/pooling.cc @@ -3,12 +3,12 @@ * \file pooling.cc * \brief Pooling operators */ +#include #include #include #include #include #include -#include "../layout.h" #include "../../pass/alter_op_layout.h" namespace tvm { @@ -32,14 +32,15 @@ Array > Pool2DInferCorrectLayout( Layout raw_layout(params->layout); Layout input = new_in_layouts[0]; - if (input.Indexof('W') == raw_layout.Indexof('W') && - input.Indexof('H') == raw_layout.Indexof('H') && - !input.Contains('w') && !input.Contains('h')) { + if (input.IndexOf(LayoutAxis::Get('W')) == raw_layout.IndexOf(LayoutAxis::Get('W')) && + input.IndexOf(LayoutAxis::Get('H')) == raw_layout.IndexOf(LayoutAxis::Get('H')) && + !input.Contains(LayoutAxis::Get('w')) && !input.Contains(LayoutAxis::Get('h'))) { params->layout = input.name(); // modify self to follow the input layout } } - return Array >{{params->layout}, {params->layout}}; + Layout inferred_layout(params->layout); + return Array >{{inferred_layout}, {inferred_layout}}; } template @@ -59,13 +60,13 @@ bool Pool2DRel(const Array& types, CHECK(param != nullptr); Layout layout(param->layout); - CHECK(layout.Contains('H') && layout.Contains('W') && - !layout.Contains('h') && !layout.Contains('w')) + CHECK(layout.Contains(LayoutAxis::Get('H')) && layout.Contains(LayoutAxis::Get('W')) && + !layout.Contains(LayoutAxis::Get('h')) && !layout.Contains(LayoutAxis::Get('w'))) << "Invalid layout " << layout << ". 
Pool2D layout must have H and W, which cannot be split"; - const auto hidx = layout.Indexof('H'); - const auto widx = layout.Indexof('W'); + const auto hidx = layout.IndexOf(LayoutAxis::Get('H')); + const auto widx = layout.IndexOf(LayoutAxis::Get('W')); IndexExpr pad_h, pad_w; if (param->padding.size() == 1) { @@ -125,6 +126,7 @@ Array Pool2DCompute(const Attrs& attrs, const Array& inputs, const Type& out_type, const Target& target) { + static const Layout kNCHW("NCHW"); const auto* param = attrs.as(); CHECK(param != nullptr); auto pool_size = param->pool_size; @@ -132,10 +134,13 @@ Array Pool2DCompute(const Attrs& attrs, auto padding = param->padding; auto ceil_mode = param->ceil_mode; Layout layout(param->layout); - CHECK(layout.Convertible(Layout("NCHW"))) + + CHECK(BijectiveLayoutNode::make(layout, kNCHW).defined()) << "max_pool2d currently only supports layouts that are convertible from NCHW"; - CHECK_EQ(layout.Indexof('h'), -1) << "max_pool2d does not support input split on height"; - CHECK_EQ(layout.Indexof('w'), -1) << "max_pool2d does not support input split on width"; + CHECK_EQ(layout.IndexOf(LayoutAxis::Get('h')), -1) + << "max_pool2d does not support input split on height"; + CHECK_EQ(layout.IndexOf(LayoutAxis::Get('w')), -1) + << "max_pool2d does not support input split on width"; CHECK(inputs[0].ndim() == 4U || inputs[0].ndim() == 5U) << "Pool2D only support 4-D input (e.g., NCHW)" @@ -271,13 +276,13 @@ bool GlobalPool2DRel(const Array& types, CHECK(param != nullptr); Layout layout(param->layout); - CHECK(layout.Contains('H') && layout.Contains('W') && - !layout.Contains('h') && !layout.Contains('w')) + CHECK(layout.Contains(LayoutAxis::Get('H')) && layout.Contains(LayoutAxis::Get('W')) && + !layout.Contains(LayoutAxis::Get('h')) && !layout.Contains(LayoutAxis::Get('w'))) << "Invalid layout " << layout << ". 
Pool2D layout must have H and W, which cannot be split"; - const auto hidx = layout.Indexof('H'); - const auto widx = layout.Indexof('W'); + const auto hidx = layout.IndexOf(LayoutAxis::Get('H')); + const auto widx = layout.IndexOf(LayoutAxis::Get('W')); Array oshape(dshape); oshape.Set(hidx, 1); oshape.Set(widx, 1); @@ -293,14 +298,15 @@ Array GlobalPool2DCompute(const Attrs& attrs, const Array& inputs, const Type& out_type, const Target& target) { + static const Layout kNCHW("NCHW"); const auto* param = attrs.as(); CHECK(param != nullptr); Layout layout(param->layout); - CHECK(layout.Convertible(Layout("NCHW"))) + CHECK(BijectiveLayoutNode::make(layout, kNCHW).defined()) << "global_avg_pool2d currently only supports layouts that are convertible from NCHW"; - CHECK_EQ(layout.Indexof('h'), -1) + CHECK_EQ(layout.IndexOf(LayoutAxis::Get('h')), -1) << "global_avg_pool2d does not support input split on height"; - CHECK_EQ(layout.Indexof('w'), -1) + CHECK_EQ(layout.IndexOf(LayoutAxis::Get('w')), -1) << "global_avg_pool2d does not support input split on width"; CHECK(inputs[0].ndim() == 4U || inputs[0].ndim() == 5U) diff --git a/src/relay/op/nn/upsampling.cc b/src/relay/op/nn/upsampling.cc index d386437ae15b..48a7a04ebb8a 100644 --- a/src/relay/op/nn/upsampling.cc +++ b/src/relay/op/nn/upsampling.cc @@ -3,6 +3,7 @@ * \file upsampling.cc * \brief upsampling operator */ +#include #include #include #include @@ -11,7 +12,6 @@ #include #include #include "../op_common.h" -#include "../layout.h" namespace tvm { namespace relay { @@ -31,18 +31,20 @@ bool UpSamplingRel(const Array& types, const UpSamplingAttrs* param = attrs.as(); CHECK(param != nullptr); const Layout in_layout(param->layout); - CHECK(in_layout.Convertible(kNCHW)) + + auto layout_converter = BijectiveLayoutNode::make(in_layout, kNCHW); + CHECK(layout_converter.defined()) << "UpSampling only support input layouts that are convertible from NCHW." 
<< " But got " << in_layout; - auto oshape = ConvertLayout(data->shape, in_layout, kNCHW); + auto oshape = layout_converter.ForwardShape(data->shape); - oshape[2] = oshape[2] * param->scale; - oshape[3] = oshape[3] * param->scale; + oshape.Set(2, oshape[2] * param->scale); + oshape.Set(3, oshape[3] * param->scale); // assign output type reporter->Assign(types[1], - TensorTypeNode::make(ConvertLayout(oshape, kNCHW, in_layout), + TensorTypeNode::make(layout_converter.BackwardShape(oshape), data->dtype)); return true; } diff --git a/src/relay/op/tensor/binary.cc b/src/relay/op/tensor/binary.cc index 97adbc7f4ca8..b8305c44c037 100644 --- a/src/relay/op/tensor/binary.cc +++ b/src/relay/op/tensor/binary.cc @@ -82,6 +82,18 @@ RELAY_REGISTER_BINARY_OP("mod") .set_attr("FTVMCompute", RELAY_BINARY_COMPUTE(topi::mod)); +RELAY_REGISTER_BINARY_OP("logical_and") +.describe("Elementwise logical AND with broadcasting") +.set_support_level(4) +.set_attr("FTVMCompute", RELAY_BINARY_COMPUTE(topi::logical_and)); + + +RELAY_REGISTER_BINARY_OP("logical_or") +.describe("Elementwise logical OR with broadcasting") +.set_support_level(4) +.set_attr("FTVMCompute", RELAY_BINARY_COMPUTE(topi::logical_or)); + + RELAY_REGISTER_CMP_OP("equal") .describe("Elementwise equal compare with broadcasting") .set_support_level(4) diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc index b38999d1b1b7..7aa98e3fd87a 100644 --- a/src/relay/op/tensor/transform.cc +++ b/src/relay/op/tensor/transform.cc @@ -5,8 +5,9 @@ */ #include #include -#include +#include #include +#include #include #include #include @@ -16,7 +17,6 @@ #include "../op_common.h" #include "../../../arithmetic/compute_expr.h" #include "../../pass/alter_op_layout.h" -#include "../layout.h" namespace tvm { namespace relay { @@ -206,6 +206,15 @@ bool ConcatenateRel(const Array& types, return true; } +Array ConcatenateCompute(const Attrs& attrs, + const Array& inputs, + const Type& out_type, + const Target& target) { + const ConcatenateAttrs *param = attrs.as(); + CHECK(param != nullptr); + return { topi::concatenate(inputs, param->axis) }; +} + Array> ConcatenateLayout( const Attrs& attrs, const Array& new_in_layouts, @@ -218,7 +227,7 @@ Array> ConcatenateLayout( Layout ret; if (new_in_layouts.defined()) { // this function is called after some operators are alternated. 
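 // `old_in_layouts` records the layouts seen before AlterOpLayout ran; the loop below keeps an altered input layout only if it still carries the same axis at position `axis`, and otherwise falls back to Layout::Undef().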
- Layout::LayoutDim concate_dim = old_in_layouts[0][axis]; + const auto& concate_dim = old_in_layouts[0][axis]; for (size_t i = 0; i < new_in_layouts.size(); ++i) { if (new_in_layouts[i].ndim() > axis && new_in_layouts[i][axis] == concate_dim) { @@ -234,7 +243,7 @@ Array<Array<Layout> > ConcatenateLayout( } } - if (ret.ndim() <= axis || Layout::IsSubdim(ret[axis])) { + if (ret.ndim() <= axis || !ret[axis].IsPrimal()) { return Array<Array<Layout> > {{Layout::Undef()}, {Layout::Undef()}}; } } @@ -268,7 +277,96 @@ RELAY_REGISTER_OP("concatenate") .add_argument("data", "Tensor", "The input list of tensors.") .set_support_level(1) .add_type_rel("Concatenate", ConcatenateRel) -.set_attr<FInferCorrectLayout>("FInferCorrectLayout", ConcatenateLayout); +.set_attr<FInferCorrectLayout>("FInferCorrectLayout", ConcatenateLayout) +.set_attr<FTVMCompute>("FTVMCompute", ConcatenateCompute) +.set_attr<TOpPattern>("TOpPattern", kInjective); + +TVM_REGISTER_NODE_TYPE(StackAttrs); + +bool StackRel(const Array<Type>& types, + int num_inputs, + const Attrs& attrs, + const TypeReporter& reporter) { + // types: [data, result] + CHECK_EQ(types.size(), 2); + const auto* tensor_tuple = types[0].as<TupleTypeNode>(); + if (tensor_tuple == nullptr) { + CHECK(types[0].as<IncompleteTypeNode>()) + << "stack: expect input type to be TupleType but get " + << types[0]; + return false; + } + const auto* param = attrs.as<StackAttrs>(); + const auto& first = Downcast<TensorType>(tensor_tuple->fields[0]); + // Sanity check: ndim and dtype. + const int ndim = static_cast<int>(first->shape.size()); + const DataType dtype = first->dtype; + for (const Type& ele : tensor_tuple->fields) { + const auto& e = Downcast<TensorType>(ele); + int e_ndim = static_cast<int>(e->shape.size()); + const DataType& e_dtype = e->dtype; + CHECK_EQ(e_ndim, ndim) << "relay.stack requires all tensors to have the same ndim"; + CHECK_EQ(e_dtype, dtype) << "relay.stack requires all tensors to have the same dtype"; + } + // Sanity check: axis + int axis = param->axis; + CHECK(-ndim <= axis && axis < ndim) + << "stack only accepts `axis` in [-ndim, ndim)" + << ", but got axis = " << axis + << ", and ndim = " << ndim; + axis = axis < 0 ? ndim + axis + 1 : axis; + // Calculate shape + std::vector<IndexExpr> oshape; + oshape.reserve(ndim + 1); + const int stack_dim = static_cast<int>(tensor_tuple->fields.size()); + for (int i = 0; i < axis; ++i) { + oshape.emplace_back(first->shape[i]); + } + oshape.emplace_back(stack_dim); + for (int i = axis; i < ndim; ++i) { + oshape.emplace_back(first->shape[i]); + } + reporter->Assign(types[1], TensorTypeNode::make(oshape, dtype)); + return true; +} + +Array<Tensor> StackCompute(const Attrs& attrs, + const Array<Tensor>& inputs, + const Type& out_type, + const Target& target) { + const StackAttrs *param = attrs.as<StackAttrs>(); + CHECK(param != nullptr); + return { topi::stack(inputs, param->axis) }; +} + +Expr MakeStack(Expr data, + int axis) { + auto attrs = make_node<StackAttrs>(); + attrs->axis = axis; + static const Op& op = Op::Get("stack"); + return CallNode::make(op, {data}, Attrs(attrs), {}); +} + +TVM_REGISTER_API("relay.op._make.stack") +.set_body([](const TVMArgs& args, TVMRetValue* rv) { + runtime::detail::unpack_call<Expr, 2>(MakeStack, args, rv); +}); + +RELAY_REGISTER_OP("stack") +.describe(R"code(Stack the input tensors along the given axis. + +- **data**: A list of tensors. + +- **axis**: The axis along which the tensors are stacked.
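+ +- For example, stacking three tensors of shape `(4, 5)` yields `(3, 4, 5)` with `axis=0`, or `(4, 3, 5)` with `axis=1`.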
+ +)code" TVM_ADD_FILELINE) +.set_attrs_type_key("relay.attrs.StackAttrs") +.set_num_inputs(1) +.add_argument("data", "Tensor", "The input list of tensors.") +.set_support_level(1) +.add_type_rel("Stack", StackRel) +.set_attr("FTVMCompute", StackCompute) +.set_attr("TOpPattern", kInjective); /* relay.transpose */ TVM_REGISTER_NODE_TYPE(TransposeAttrs); @@ -880,6 +978,232 @@ and type as the input array. .set_attr("FTVMCompute", FullLikeCompute) .set_attr("TOpPattern", kElemWise); +// arange operator +TVM_REGISTER_NODE_TYPE(ArangeAttrs); + +bool ArangeRel(const Array& types, + int num_inputs, + const Attrs& attrs, + const TypeReporter& reporter) { + CHECK_EQ(types.size(), 1); + const ArangeAttrs* param = attrs.as(); + IndexExpr num_elem = tvm::cast(tvm::Int(32), tvm::ceil( + tvm::cast(tvm::Float(32), param->stop - param->start) / param->step)); + if (const tvm::ir::IntImm* val = num_elem.as()) { + CHECK_GT(val->value, 0) + << "Invalid arange attributes (start, stop, step): " << param->start + << ", " << param->stop << ", " << param->step; + } + reporter->Assign(types[0], TensorTypeNode::make({num_elem}, param->dtype)); + return true; +} + +Array ArangeCompute(const Attrs& attrs, + const Array& inputs, + const Type& out_type, + const Target& target) { + const ArangeAttrs* param = attrs.as(); + return { topi::arange(param->start, param->stop, param->step, param->dtype) }; +} + +Expr MakeArange(tvm::Expr start, + tvm::Expr stop, + tvm::Expr step, + DataType dtype) { + auto attrs = make_node(); + attrs->start = std::move(start); + attrs->stop = std::move(stop); + attrs->step = std::move(step); + attrs->dtype = std::move(dtype); + static const Op& op = Op::Get("arange"); + return CallNode::make(op, {}, Attrs(attrs), {}); +} + +TVM_REGISTER_API("relay.op._make.arange") +.set_body([](const TVMArgs& args, TVMRetValue* rv) { + runtime::detail::unpack_call(MakeArange, args, rv); +}); + +RELAY_REGISTER_OP("arange") +.describe(R"code(Returns evenly spaced values within a given interval. + +)code" TVM_ADD_FILELINE) +.set_attrs_type_key("relay.attrs.ArangeAttrs") +.set_num_inputs(0) +.set_support_level(3) +.add_type_rel("Arange", ArangeRel) +.set_attr("FTVMCompute", ArangeCompute) +.set_attr("TOpPattern", kInjective); + +// repeat operator +TVM_REGISTER_NODE_TYPE(RepeatAttrs); + +bool RepeatRel(const Array& types, + int num_inputs, + const Attrs& attrs, + const TypeReporter& reporter) { + // `types` contains: [data, result] + CHECK_EQ(types.size(), 2); + const auto* data = types[0].as(); + if (data == nullptr) { + CHECK(types[0].as()) + << "repeat: expect input type to be TensorType but get " + << types[0]; + return false; + } + const auto* param = attrs.as(); + const int ndim = static_cast(data->shape.size()); + const int repeats = param->repeats; + const int axis = param->axis; + CHECK(repeats >= 1) + << "repeat only accepts `repeats >= 1`" + << ", but got repeats = " << repeats; + CHECK(-ndim - 1 <= axis && axis <= ndim) + << "repeat only accepts `axis` in [-data.ndim - 1, data.ndim]" + << ", but got axis = " << axis + << ", and data.ndim = " << ndim; + const int pivot = axis < 0 ? 
ndim + axis : axis; + std::vector oshape; + oshape.reserve(ndim + repeats); + for (int i = 0; i < pivot; ++i) { + oshape.emplace_back(data->shape[i]); + } + oshape.emplace_back(data->shape[pivot] * repeats); + for (int i = pivot + 1; i < ndim; ++i) { + oshape.emplace_back(data->shape[i]); + } + reporter->Assign(types[1], TensorTypeNode::make(oshape, data->dtype)); + return true; +} + +Array RepeatCompute(const Attrs& attrs, + const Array& inputs, + const Type& out_type, + const Target& target) { + const RepeatAttrs *param = attrs.as(); + CHECK(param != nullptr); + return { topi::repeat(inputs[0], param->repeats, param->axis) }; +} + +Expr MakeRepeat(Expr data, + int repeats, + int axis) { + auto attrs = make_node(); + attrs->repeats = repeats; + attrs->axis = axis; + static const Op& op = Op::Get("repeat"); + return CallNode::make(op, {data}, Attrs(attrs), {}); +} + +TVM_REGISTER_API("relay.op._make.repeat") +.set_body([](const TVMArgs& args, TVMRetValue* rv) { + runtime::detail::unpack_call(MakeRepeat, args, rv); +}); + +RELAY_REGISTER_OP("repeat") +.describe(R"code(Repeat elements of an array `repeats` times along axis `axis` + +- **data**: The input data to the operator. + +)code" TVM_ADD_FILELINE) +.set_num_inputs(1) +.set_attrs_type_key("relay.attrs.Repeat") +.add_argument("data", "Tensor", "The input tensor.") +.set_support_level(1) +.add_type_rel("Repeat", RepeatRel) +.set_attr("FTVMCompute", RepeatCompute) +.set_attr("TOpPattern", kBroadcast); + +// tile operator +TVM_REGISTER_NODE_TYPE(TileAttrs); + +bool TileRel(const Array& types, + int num_inputs, + const Attrs& attrs, + const TypeReporter& reporter) { + // `types` contains: [data, result] + CHECK_EQ(types.size(), 2); + const auto* data = types[0].as(); + if (data == nullptr) { + CHECK(types[0].as()) + << "tile: expect input type to be TensorType but get " + << types[0]; + return false; + } + const auto* param = attrs.as(); + const size_t ndim = data->shape.size(); + const Array& reps = param->reps; + // check dimension match + CHECK(!reps.defined()) + << "repetition array is not defined. data.ndim = " << ndim; + const size_t rndim = reps.size(); + size_t tndim = (ndim > rndim) ? 
+// tile operator
+TVM_REGISTER_NODE_TYPE(TileAttrs);
+
+bool TileRel(const Array<Type>& types,
+             int num_inputs,
+             const Attrs& attrs,
+             const TypeReporter& reporter) {
+  // `types` contains: [data, result]
+  CHECK_EQ(types.size(), 2);
+  const auto* data = types[0].as<TensorTypeNode>();
+  if (data == nullptr) {
+    CHECK(types[0].as<IncompleteTypeNode>())
+        << "tile: expect input type to be TensorType but get "
+        << types[0];
+    return false;
+  }
+  const auto* param = attrs.as<TileAttrs>();
+  const size_t ndim = data->shape.size();
+  const Array<Integer>& reps = param->reps;
+  // check dimension match
+  CHECK(reps.defined())
+      << "repetition array is not defined. data.ndim = " << ndim;
+  const size_t rndim = reps.size();
+  size_t tndim = (ndim > rndim) ? ndim : rndim;
+  // re-construct data shape or reps shape
+  std::vector<IndexExpr> data_shape;
+  std::vector<IndexExpr> reps_shape;
+  data_shape.reserve(tndim);
+  reps_shape.reserve(tndim);
+  if (ndim == rndim) {
+    for (size_t i = 0; i < tndim; ++i) {
+      data_shape.emplace_back(data->shape[i]);
+      reps_shape.emplace_back(reps[i]);
+    }
+  } else if (ndim > rndim) {
+    for (size_t i = 0; i < ndim; ++i)
+      data_shape.emplace_back(data->shape[i]);
+    for (size_t i = 0; i < (ndim - rndim); ++i)
+      reps_shape.emplace_back(1);
+    for (size_t i = 0; i < rndim; ++i)
+      reps_shape.emplace_back(reps[i]);
+  } else {
+    for (size_t i = 0; i < rndim; ++i)
+      reps_shape.emplace_back(reps[i]);
+    for (size_t i = 0; i < (rndim - ndim); ++i)
+      data_shape.emplace_back(1);
+    for (size_t i = 0; i < ndim; ++i)
+      data_shape.emplace_back(data->shape[i]);
+  }
+  std::vector<IndexExpr> oshape;
+  oshape.reserve(tndim);
+  for (size_t i = 0; i < tndim; ++i) {
+    oshape.emplace_back(data_shape[i] * reps_shape[i]);
+  }
+  reporter->Assign(types[1], TensorTypeNode::make(oshape, data->dtype));
+  return true;
+}
+
+Array<Tensor> TileCompute(const Attrs& attrs,
+                          const Array<Tensor>& inputs,
+                          const Type& out_type,
+                          const Target& target) {
+  const TileAttrs* param = attrs.as<TileAttrs>();
+  CHECK(param != nullptr);
+  return { topi::tile(inputs[0], param->reps) };
+}
+
+Expr MakeTile(Expr data,
+              Array<Integer> reps) {
+  auto attrs = make_node<TileAttrs>();
+  attrs->reps = reps;
+  static const Op& op = Op::Get("tile");
+  return CallNode::make(op, {data}, Attrs(attrs), {});
+}
+
+TVM_REGISTER_API("relay.op._make.tile")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+    runtime::detail::unpack_call<Expr, 2>(MakeTile, args, rv);
+});
+
+RELAY_REGISTER_OP("tile")
+.describe(R"code(Repeat the whole array multiple times.
+
+- **data**: The input data to the operator.
+
+)code" TVM_ADD_FILELINE)
+.set_num_inputs(1)
+.set_attrs_type_key("relay.attrs.TileAttrs")
+.add_argument("data", "Tensor", "The input tensor.")
+.set_support_level(1)
+.add_type_rel("Tile", TileRel)
+.set_attr<FTVMCompute>("FTVMCompute", TileCompute)
+.set_attr<TOpPattern>("TOpPattern", kBroadcast);
+
 // where operator
 bool WhereRel(const Array<Type>& types,
               int num_inputs,
@@ -1192,6 +1516,16 @@ RELAY_REGISTER_OP("broadcast_to_like")
 .set_attr<TOpPattern>("TOpPattern", kBroadcast);

+// Adapter function to make int array.
+Array<Integer> GetIntArray(Array<IndexExpr> arr) {
+  for (size_t i = 0; i < arr.size(); ++i) {
+    CHECK(!arr[i].defined() || arr[i].as<IntImm>())
+        << "Expect an int array";
+  }
+  return Array<Integer>(arr.node_);
+}
+
+
 // strided_slice
 TVM_REGISTER_NODE_TYPE(StridedSliceAttrs);
 bool StridedSliceRel(const Array<Type>& types,
@@ -1546,15 +1880,6 @@ Expr MakeSliceLike(Expr data,
   return CallNode::make(op, {data, shape_like}, Attrs(attrs), {});
 }

-// Adapter function to make int array.
-Array GetIntArray(Array arr) { - for (size_t i = 0; i < arr.size(); ++i) { - CHECK(!arr[i].defined() || arr[i].as()) - << "Expect an int array"; - } - return Array(arr.node_); -} - Array SliceLikeCompute(const Attrs& attrs, const Array& inputs, const Type& out_type, @@ -1625,46 +1950,10 @@ Array LayoutTransformCompute(const Attrs& attrs, const Array& inputs, const Type& out_type, const Target& target) { - const LayoutTransformAttrs *param = attrs.as(); + const auto* param = attrs.as(); CHECK(param != nullptr); - - Layout src_layout(param->src_layout); - Layout dst_layout(param->dst_layout); - - if (src_layout.Equals(dst_layout)) { - return Array{ inputs[0] }; - } - - CHECK(src_layout.defined() && dst_layout.defined()) - << "cannot convert from/to undefined layout"; - CHECK(src_layout.Convertible(dst_layout)) - << "cannot convert from " << param->src_layout << " to " << param->dst_layout; - - const auto& out_shape = ConvertLayout(inputs[0]->shape, src_layout, dst_layout); - return Array { - topi::layout_transform(inputs[0], out_shape, [&](const Array& dst_indices) { - std::vector dst_to_src_indices; - for (size_t i = 0; i < src_layout.ndim(); ++i) { - Layout::LayoutDim src_axis = src_layout[i]; - int dst_major_pos = dst_layout.Indexof(Layout::ToSuperdim(src_axis)); - int dst_minor_pos = dst_layout.Indexof(Layout::ToSubdim(src_axis)); - int32_t src_factor = static_cast(src_layout.Subsizeof(src_axis)); - int32_t dst_factor = static_cast(dst_layout.Subsizeof(src_axis)); - - tvm::Expr src_index(dst_indices[dst_major_pos]); - if (dst_minor_pos >= 0) { - CHECK_GT(dst_factor, 0); - src_index = src_index * dst_factor + dst_indices[dst_minor_pos]; - } - if (Layout::IsSuperdim(src_axis) && src_factor > 0) { - src_index = src_index / src_factor; - } else if (Layout::IsSubdim(src_axis) && src_factor > 0) { - src_index = src_index % src_factor; - } - dst_to_src_indices.push_back(src_index); - } - return Array(dst_to_src_indices); - }) + return Array{ + topi::layout_transform(inputs[0], param->src_layout, param->dst_layout) }; } @@ -1681,10 +1970,12 @@ bool LayoutTransformRel(const Array& types, CHECK(src_layout.defined() && dst_layout.defined()) << "cannot convert from/to undefined layout"; - CHECK(src_layout.Convertible(dst_layout)) + + auto layout_converter = BijectiveLayoutNode::make(src_layout, dst_layout); + CHECK(layout_converter.defined()) << "cannot convert from " << params->src_layout << " to " << params->dst_layout; - const auto& out_shape = ConvertLayout(data->shape, src_layout, dst_layout); + const auto& out_shape = layout_converter.ForwardShape(data->shape); reporter->Assign(types[1], TensorTypeNode::make(out_shape, data->dtype)); return true; } diff --git a/src/relay/op/tensor/unary.cc b/src/relay/op/tensor/unary.cc index 06720d67713c..cfcc130564c0 100644 --- a/src/relay/op/tensor/unary.cc +++ b/src/relay/op/tensor/unary.cc @@ -178,5 +178,16 @@ RELAY_REGISTER_UNARY_OP("negative") .set_support_level(3) .set_attr("FTVMCompute", RELAY_UNARY_COMPUTE(topi::negative)); + +RELAY_REGISTER_UNARY_OP("logical_not") +.describe(R"code(Returns the logical inverse of input array, computed element-wise. + +.. 
math:: + ~(x) + +)code" TVM_ADD_FILELINE) +.set_support_level(4) +.set_attr("FTVMCompute", RELAY_UNARY_COMPUTE(topi::logical_not)); + } // namespace relay } // namespace tvm diff --git a/src/relay/op/type_relations.cc b/src/relay/op/type_relations.cc index 9152f0677616..0ae7ab2f9e33 100644 --- a/src/relay/op/type_relations.cc +++ b/src/relay/op/type_relations.cc @@ -90,8 +90,8 @@ bool BroadcastRel(const Array& types, const Attrs& attrs, const TypeReporter& reporter) { CHECK_EQ(types.size(), 3); - RELAY_LOG(INFO) << "In1: " << types[0] << "In2: " << types[1] - << "Out: " << types[2] << std::endl; + RELAY_LOG(INFO) << "In1:" << types[0] << ",In2:" << types[1] + << ",Out:" << types[2] << std::endl; if (auto t0 = ToTensorType(types[0])) { if (auto t1 = ToTensorType(types[1])) { CHECK_EQ(t0->dtype, t1->dtype); @@ -108,8 +108,8 @@ bool BroadcastCompRel(const Array& types, const Attrs& attrs, const TypeReporter& reporter) { CHECK_EQ(types.size(), 3); - RELAY_LOG(INFO) << "In1: " << types[0] << "In2: " << types[1] - << "Out: " << types[2] << std::endl; + RELAY_LOG(INFO) << "In1:" << types[0] << ",In2:" << types[1] + << ",Out:" << types[2] << std::endl; if (auto t0 = ToTensorType(types[0])) { if (auto t1 = ToTensorType(types[1])) { CHECK_EQ(t0->dtype, t1->dtype); diff --git a/src/relay/op/vision/multibox_op.cc b/src/relay/op/vision/multibox_op.cc index 55db8862e849..04f105c44744 100644 --- a/src/relay/op/vision/multibox_op.cc +++ b/src/relay/op/vision/multibox_op.cc @@ -70,8 +70,10 @@ RELAY_REGISTER_OP("vision.multibox_prior") TVM_REGISTER_NODE_TYPE(MultiBoxTransformLocAttrs); -bool MultiBoxTransformLocRel(const Array& types, int num_inputs, - const Attrs& attrs, const TypeReporter& reporter) { +bool MultiBoxTransformLocRel(const Array& types, + int num_inputs, + const Attrs& attrs, + const TypeReporter& reporter) { CHECK_EQ(types.size(), 4); const auto* cls_prob = types[0].as(); diff --git a/src/relay/op/vision/nms.cc b/src/relay/op/vision/nms.cc index 3e3f73bc6cb4..6a94da032196 100644 --- a/src/relay/op/vision/nms.cc +++ b/src/relay/op/vision/nms.cc @@ -9,7 +9,54 @@ namespace tvm { namespace relay { -TVM_REGISTER_NODE_TYPE(NMSAttrs); +TVM_REGISTER_NODE_TYPE(GetValidCountsAttrs); + +bool GetValidCountRel(const Array& types, + int num_inputs, + const Attrs& attrs, + const TypeReporter& reporter) { + CHECK_EQ(types.size(), 2); + const auto* data = types[0].as(); + const auto& dshape = data->shape; + CHECK_EQ(dshape.size(), 3) << "Input data should be 3-D."; + + std::vector oshape({data->shape[0]}); + std::vector fields; + fields.push_back(TensorTypeNode::make(oshape, Int(32))); + fields.push_back(TensorTypeNode::make(data->shape, data->dtype)); + + // assign output type + reporter->Assign(types[1], TupleTypeNode::make(Array(fields))); + return true; +} + +Expr MakeGetValidCounts(Expr data, + double score_threshold) { + auto attrs = make_node(); + attrs->score_threshold = score_threshold; + static const Op& op = Op::Get("vision.get_valid_counts"); + return CallNode::make(op, {data}, Attrs(attrs), {}); +} + + +TVM_REGISTER_API("relay.op.vision._make.get_valid_counts") +.set_body([](const TVMArgs& args, TVMRetValue* rv) { + runtime::detail::unpack_call(MakeGetValidCounts, args, rv); +}); + + +RELAY_REGISTER_OP("vision.get_valid_counts") +.describe(R"doc(Get valid count of bounding boxes given +a score threshold. Also moves valid boxes to the top of +input data. 
+)doc" TVM_ADD_FILELINE) +.set_num_inputs(1) +.add_argument("data", "Tensor", "Input data.") +.set_support_level(5) +.add_type_rel("GetValidCount", GetValidCountRel); + + +TVM_REGISTER_NODE_TYPE(NonMaximumSuppressionAttrs); bool NMSRel(const Array& types, int num_inputs, @@ -18,39 +65,56 @@ bool NMSRel(const Array& types, CHECK_EQ(types.size(), 3); const auto* data = types[0].as(); const auto* valid_count = types[1].as(); + const NonMaximumSuppressionAttrs* param = + attrs.as(); const auto& dshape = data->shape; const auto& vshape = valid_count->shape; CHECK_EQ(dshape.size(), 3) << "Input data should be 3-D."; CHECK_EQ(vshape.size(), 1) << "Input valid count should be 1-D."; // assign output type - reporter->Assign(types[2], TensorTypeNode::make(dshape, data->dtype)); + if (param->return_indices) { + std::vector oshape({dshape[0], dshape[1]}); + reporter->Assign(types[2], TensorTypeNode::make(oshape, Int(32))); + } else { + reporter->Assign(types[2], TensorTypeNode::make(dshape, data->dtype)); + } return true; } Expr MakeNMS(Expr data, Expr valid_count, - double overlap_threshold, + int max_output_size, + double iou_threshold, bool force_suppress, - int topk) { - auto attrs = make_node(); - attrs->overlap_threshold = overlap_threshold; + int top_k, + int id_index, + bool return_indices, + bool invalid_to_bottom) { + auto attrs = make_node(); + attrs->max_output_size = max_output_size; + attrs->iou_threshold = iou_threshold; attrs->force_suppress = force_suppress; - attrs->topk = topk; - static const Op& op = Op::Get("vision.nms"); + attrs->top_k = top_k; + attrs->id_index = id_index; + attrs->return_indices = return_indices; + attrs->invalid_to_bottom = invalid_to_bottom; + static const Op& op = Op::Get("vision.non_max_suppression"); return CallNode::make(op, {data, valid_count}, Attrs(attrs), {}); } -TVM_REGISTER_API("relay.op.vision._make.nms") +TVM_REGISTER_API("relay.op.vision._make.non_max_suppression") .set_body([](const TVMArgs& args, TVMRetValue* rv) { - runtime::detail::unpack_call(MakeNMS, args, rv); + runtime::detail::unpack_call(MakeNMS, args, rv); }); -RELAY_REGISTER_OP("vision.nms") -.describe(R"doc("Non-maximum suppression." +RELAY_REGISTER_OP("vision.non_max_suppression") +.describe(R"doc(Non-maximum suppression. The input boxes should +be in the format of [class_id, score, left, top, right, bottom]. +Set id_index to be -1 to ignore class_id axis. 
)doc" TVM_ADD_FILELINE) .set_num_inputs(2) .add_argument("data", "Tensor", "Input data.") diff --git a/src/relay/op/vision/rcnn_op.cc b/src/relay/op/vision/rcnn_op.cc index e46eaf2207fb..6dbc76599708 100644 --- a/src/relay/op/vision/rcnn_op.cc +++ b/src/relay/op/vision/rcnn_op.cc @@ -63,5 +63,72 @@ RELAY_REGISTER_OP("vision.roi_align") .set_support_level(5) .add_type_rel("ROIAlign", ROIAlignRel); +TVM_REGISTER_NODE_TYPE(ProposalAttrs); + +bool ProposalRel(const Array& types, int num_inputs, const Attrs& attrs, + const TypeReporter& reporter) { + auto proposal_attrs = attrs.as(); + CHECK_EQ(types.size(), 4); + const auto* cls_prob = types[0].as(); + const auto* bbox_pred = types[1].as(); + const auto* im_info = types[2].as(); + + if (!cls_prob || !bbox_pred || !im_info) { + return false; + } + + CHECK_EQ(cls_prob->shape.size(), 4U) + << "The dimension of class probability should be 4, but received " << cls_prob->shape.size(); + CHECK_EQ(bbox_pred->shape.size(), 4U) + << "The dimension of box prediction should be 4, but received " << bbox_pred->shape.size(); + CHECK_EQ(im_info->shape.size(), 2U) + << "The dimension of image info should be 2, but received " << im_info->shape.size(); + CHECK(reporter->AssertEQ(im_info->shape[1], 3)); + + auto batch = cls_prob->shape[0]; + + std::vector oshape( + {batch * proposal_attrs->rpn_post_nms_top_n, 5}); + reporter->Assign(types[3], TensorTypeNode::make(oshape, cls_prob->dtype)); + return true; +} + +Expr MakeProposal(Expr cls_prob, Expr bbox_pred, Expr im_info, Array scales, + Array ratios, int feature_stride, double threshold, + int rpn_pre_nms_top_n, int rpn_post_nms_top_n, int rpn_min_size, + bool iou_loss) { + auto attrs = make_node(); + attrs->scales = scales; + attrs->ratios = ratios; + attrs->feature_stride = feature_stride; + attrs->threshold = threshold; + attrs->rpn_pre_nms_top_n = rpn_pre_nms_top_n; + attrs->rpn_post_nms_top_n = rpn_post_nms_top_n; + attrs->rpn_min_size = rpn_min_size; + attrs->iou_loss = iou_loss; + static const Op& op = Op::Get("vision.proposal"); + return CallNode::make(op, {cls_prob, bbox_pred, im_info}, Attrs(attrs), {}); +} + +TVM_REGISTER_API("relay.op.vision._make.proposal") +.set_body([](const TVMArgs& args, TVMRetValue* rv) { + runtime::detail::unpack_call(MakeProposal, args, rv); + }); + +RELAY_REGISTER_OP("vision.proposal") + .describe(R"code(Generate region proposals via RPN. + + - **cls_prob**: 4-D with shape [batch, 2 * num_anchors, height, width]. + - **bbox_pred**: 4-D with shape [batch, 4 * num_anchors, height, width]. + - **im_info**: 2-D with shape [batch, 3]. + - **out**: 2-D with shape [batch * rpn_post_nms_top_n, 5]. + )code" TVM_ADD_FILELINE) +.set_num_inputs(3) +.add_argument("cls_prob", "Tensor", "Score of how likely proposal is object") +.add_argument("bbox_pred", "Tensor", "BBox predicted deltas from anchors for proposals") +.add_argument("im_info", "Tensor", "Image size and scale") +.set_support_level(5) +.add_type_rel("Proposal", ProposalRel); + } // namespace relay } // namespace tvm diff --git a/src/relay/op/vision/yolo.cc b/src/relay/op/vision/yolo.cc new file mode 100644 index 000000000000..b826d4c6e8e2 --- /dev/null +++ b/src/relay/op/vision/yolo.cc @@ -0,0 +1,78 @@ +/*! + * Copyright (c) 2018 by Contributors + * \file yolo.cc + * \brief Yolo related operators + */ +#include +#include +#include +#include +#include "../op_common.h" +#include "../type_relations.h" + +namespace tvm { +namespace relay { + +TVM_REGISTER_NODE_TYPE(YoloReorgAttrs); + +/*! 
+* \brief YoloReorgRel Output type and shape relation evaluation function.
+* \param types The types of input and output.
+* \param num_inputs Number of input types in the args.
+* \param attrs The additional attributes of the operator.
+* \param reporter The reporter to report solution to.
+* \return False if this relation cannot be resolved, true otherwise.
+*/
+bool YoloReorgRel(const Array<Type>& types,
+                  int num_inputs,
+                  const Attrs& attrs,
+                  const TypeReporter& reporter) {
+  CHECK_EQ(types.size(), 2);
+  const auto* data = types[0].as<TensorTypeNode>();
+  if (data == nullptr) return false;
+
+  const YoloReorgAttrs* param = attrs.as<YoloReorgAttrs>();
+  CHECK(param != nullptr);
+
+  CHECK(data->shape.size() == 4) << "Yolo reorg supports only 4-D input.";
+  std::vector<IndexExpr>&& oshape = AsVector(data->shape);
+  oshape[1] = oshape[1] * param->stride * param->stride;
+  oshape[2] = oshape[2] / param->stride;
+  oshape[3] = oshape[3] / param->stride;
+  reporter->Assign(types[1], TensorTypeNode::make(oshape, data->dtype));
+  return true;
+}
+
+Expr MakeYoloReorg(Expr data,
+                   Integer stride) {
+  auto attrs = make_node<YoloReorgAttrs>();
+  attrs->stride = stride;
+  static const Op& op = Op::Get("vision.yolo_reorg");
+  return CallNode::make(op, {data}, Attrs(attrs), {});
+}
+
+
+TVM_REGISTER_API("relay.op.vision._make.yolo_reorg")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+    runtime::detail::unpack_call<Expr, 2>(MakeYoloReorg, args, rv);
+});
+
+
+RELAY_REGISTER_OP("vision.yolo_reorg")
+.describe(R"doc(Yolo reorg operation. This layer reorganizes the output;
+its function is mostly a shape transform.)doc" TVM_ADD_FILELINE)
+.add_argument("data", "Tensor", "The input tensor.")
+.set_num_inputs(1)
+.set_support_level(5)
+.set_attrs_type_key("relay.attrs.YoloReorgAttrs")
+.add_type_rel("YoloReorg", YoloReorgRel)
+.set_attr<FTVMCompute>("FTVMCompute", [](const Attrs& attrs,
+                                         const Array<Tensor>& inputs,
+                                         const Type& out_type,
+                                         const Target& target) {
+  const auto* params = attrs.as<YoloReorgAttrs>();
+  CHECK(params != nullptr);
+  return Array<Tensor>{ topi::vision::reorg(inputs[0], params->stride) };
+});
+
+} // namespace relay
+} // namespace tvm
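A concrete reading of YoloReorgRel: a 4-D input (n, c, h, w) with stride s gets the inferred type (n, c * s * s, h / s, w / s). A minimal sketch, assuming a relay.vision.yolo_reorg Python wrapper over the relay.op.vision._make.yolo_reorg registration (the wrapper name is an assumption):

    from tvm import relay

    x = relay.var("x", shape=(1, 64, 26, 26), dtype="float32")
    y = relay.vision.yolo_reorg(x, stride=2)
    # (1, 64, 26, 26) -> (1, 64 * 4, 13, 13)
    print(relay.ir_pass.infer_type(y).checked_type)  # expect Tensor[(1, 256, 13, 13), float32]
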
" << dst_layout; static auto &transform_op = Op::Get("layout_transform"); @@ -34,7 +34,7 @@ Expr TransformLayout(Expr raw, Layout src_layout, Layout dst_layout) { attrs->src_layout = src_layout.name(); attrs->dst_layout = dst_layout.name(); Call transform = CallNode::make(transform_op, {raw}, Attrs{attrs}); - return transform; + return std::move(transform); } // Memorize layout transform so we can reuse internal transformed nodes diff --git a/src/relay/pass/alter_op_layout.h b/src/relay/pass/alter_op_layout.h index fcb7b379a0ec..93d9ee52f687 100644 --- a/src/relay/pass/alter_op_layout.h +++ b/src/relay/pass/alter_op_layout.h @@ -9,10 +9,9 @@ #ifndef TVM_RELAY_PASS_ALTER_OP_LAYOUT_H_ #define TVM_RELAY_PASS_ALTER_OP_LAYOUT_H_ +#include #include -#include "../op/layout.h" - namespace tvm { namespace relay { @@ -78,9 +77,9 @@ inline Array > BinaryBroadcastLayout(const Attrs& attrs, if (old_in_shapes[defined_idx].size() >= old_in_shapes[undef_idx].size()) { layouts.Set(undef_idx, - layouts[defined_idx].Sublayout( - old_in_shapes[defined_idx].size() - old_in_shapes[undef_idx].size(), - old_in_shapes[undef_idx].size())); + layouts[defined_idx].SubLayout( + old_in_shapes[defined_idx].size() - old_in_shapes[undef_idx].size(), + old_in_shapes[undef_idx].size())); return Array > {layouts, {layouts[defined_idx]}}; } else { // only know the tensor with smaller dimensions, @@ -90,21 +89,22 @@ inline Array > BinaryBroadcastLayout(const Attrs& attrs, } } else { // try to broadcast the tensors to the larger dimension - int large_idx = layouts[0].ndim_super() >= layouts[1].ndim_super() ? 0 : 1; + int large_idx = layouts[0].ndim_primal() >= layouts[1].ndim_primal() ? 0 : 1; int small_idx = 1 - large_idx; Layout ret = layouts[large_idx]; // extract common part size_t i = layouts[large_idx].ndim(); for (; i != 0; --i) { - auto dim = layouts[large_idx][i-1]; - if (!layouts[small_idx].Contains(Layout::ToSuperdim(dim))) { + const auto& axis = layouts[large_idx][i-1]; + if (!layouts[small_idx].Contains(axis.ToPrimal())) { break; } } - Layout common_part = layouts[large_idx].Sublayout(i, layouts[large_idx].ndim() - i); - if (!layouts[small_idx].Convertible(common_part)) { // fail + Layout common_part = layouts[large_idx].SubLayout(i, layouts[large_idx].ndim() - i); + if (!BijectiveLayoutNode::make(layouts[small_idx], common_part).defined()) { + // not convertible return Array > {{Layout::Undef()}, {Layout::Undef()}}; } diff --git a/src/relay/pass/combine_parallel_conv2d.cc b/src/relay/pass/combine_parallel_conv2d.cc index cd2d29e80048..44b239919ce2 100644 --- a/src/relay/pass/combine_parallel_conv2d.cc +++ b/src/relay/pass/combine_parallel_conv2d.cc @@ -91,8 +91,10 @@ class BranchGroupFinder : private ExprVisitor { CHECK(attrs_b); const auto* tweight_a = a->args[1]->type_as(); const auto* tweight_b = b->args[1]->type_as(); - const auto shape_a = ConvertLayout(tweight_a->shape, attrs_a->kernel_layout, kOIHW); - const auto shape_b = ConvertLayout(tweight_b->shape, attrs_b->kernel_layout, kOIHW); + const auto shape_a = BijectiveLayoutNode::make( + Layout(attrs_a->kernel_layout), kOIHW).ForwardShape(tweight_a->shape); + const auto shape_b = BijectiveLayoutNode::make( + Layout(attrs_b->kernel_layout), kOIHW).ForwardShape(tweight_b->shape); return eq(attrs_a->strides, attrs_b->strides) && eq(attrs_a->padding, attrs_b->padding) && eq(attrs_a->dilation, attrs_b->dilation) && eq(attrs_a->groups, attrs_b->groups) && diff --git a/src/relay/pass/eliminate_common_subexpr.cc b/src/relay/pass/eliminate_common_subexpr.cc new file 
mode 100644 index 000000000000..10e6f920f245 --- /dev/null +++ b/src/relay/pass/eliminate_common_subexpr.cc @@ -0,0 +1,72 @@ +/*! + * Copyright (c) 2019 by Contributors + * + * \file eliminate_common_subexpr.cc + * \brief Combine common subexpressions. + * + * This is an optimization pass that eliminates common subexpressions. During the pass, it tries + * to replace an expression with a previously appeared expression with the same input and + * attributes. The fskip callback argument allows us to skip specific expressions. + */ +#include +#include +#include +#include "./pattern_util.h" + +namespace tvm { +namespace relay { + +class CommonSubexprEliminator : public ExprMutator { + public: + explicit CommonSubexprEliminator(runtime::TypedPackedFunc fskip): fskip_(fskip) {} + + Expr VisitExpr_(const CallNode* call) final { + static auto op_stateful = Op::GetAttr("TOpIsStateful"); + Expr new_expr = ExprMutator::VisitExpr_(call); + const CallNode* new_call = new_expr.as(); + CHECK(new_call); + const OpNode* op = new_call->op.as(); + AttrsEqual attrs_equal; + + if (new_call->args.size() == 0 || op == nullptr || op_stateful.get(GetRef(op), false)) { + return new_expr; + } + if (fskip_ != nullptr && fskip_(new_expr)) { + return new_expr; + } + + auto it = expr_map_.find(new_call->op); + if (it != expr_map_.end()) { + for (const CallNode* candidate : it->second) { + bool is_equivalent = true; + if (!attrs_equal(new_call->attrs, candidate->attrs)) { + continue; + } + for (size_t i = 0; i < new_call->args.size(); i++) { + if (!new_call->args[i].same_as(candidate->args[i]) && + !IsEqualScalar(new_call->args[i], candidate->args[i])) { + is_equivalent = false; + break; + } + } + if (!is_equivalent) continue; + return GetRef(candidate); + } + } + expr_map_[new_call->op].push_back(new_call); + return new_expr; + } + + std::unordered_map, NodeHash, NodeEqual> expr_map_; + runtime::TypedPackedFunc fskip_; +}; + +Expr EliminateCommonSubexpr(const Expr& expr, PackedFunc callback) { + return CommonSubexprEliminator(callback)(expr); +} + +TVM_REGISTER_API("relay._ir_pass.eliminate_common_subexpr") +.set_body_typed(EliminateCommonSubexpr); + +} // namespace relay +} // namespace tvm diff --git a/src/relay/pass/fold_scale_axis.cc b/src/relay/pass/fold_scale_axis.cc index 270965886ab9..044cc4e5d9c9 100644 --- a/src/relay/pass/fold_scale_axis.cc +++ b/src/relay/pass/fold_scale_axis.cc @@ -6,12 +6,12 @@ * \brief Fold axis scaling into weights of * conv/dense operators. */ +#include #include #include #include #include "pattern_util.h" #include "pass_util.h" -#include "../op/layout.h" namespace tvm { @@ -435,8 +435,8 @@ Array Conv2DForwardPrep(const Call& call, const Message& out_message) { CHECK(param != nullptr); Layout data_layout(param->data_layout); Layout kernel_layout(param->kernel_layout); - int c_big_axis = data_layout.Indexof('C'); - int c_small_axis = data_layout.Indexof('c'); + int c_big_axis = data_layout.IndexOf(LayoutAxis::Get('C')); + int c_small_axis = data_layout.IndexOf(LayoutAxis::Get('c')); CHECK_GE(c_big_axis, 0); Message none = NullValue(); @@ -449,7 +449,7 @@ Array Conv2DForwardPrep(const Call& call, const Message& out_message) { // only handle depthwise or full conv2d. 
// TODO(tvm-team) handle grouped conv by reshape + bcast bool is_depthwise_conv2d = IsDepthwiseConv2D(call, param, kernel_layout); - if (kernel_layout.Indexof('i') < 0 && + if (kernel_layout.IndexOf(LayoutAxis::Get('i')) < 0 && c_small_axis < 0 && (param->groups == 1 || is_depthwise_conv2d)) { data_axes = {c_big_axis}; @@ -473,15 +473,15 @@ Expr Conv2DForwardRewrite(const Call& ref_call, CHECK(param != nullptr); Layout data_layout(param->data_layout); Layout kernel_layout(param->kernel_layout); - int c_big_axis = data_layout.Indexof('C'); + int c_big_axis = data_layout.IndexOf(LayoutAxis::Get('C')); CHECK_GE(c_big_axis, 0); // For now, we only support simple pattern (no folded weight/data) // TODO(tvm-team) support general data layout - CHECK_EQ(kernel_layout.Indexof('i'), -1); + CHECK_EQ(kernel_layout.IndexOf(LayoutAxis::Get('i')), -1); CHECK(sdata->axes.size() == 1 && c_big_axis == sdata->axes[0]->value); - int big_oc_axis = kernel_layout.Indexof('O'); - int big_ic_axis = kernel_layout.Indexof('I'); + int big_oc_axis = kernel_layout.IndexOf(LayoutAxis::Get('O')); + int big_ic_axis = kernel_layout.IndexOf(LayoutAxis::Get('I')); // Check it must be depthwise or full conv2d. bool is_depthwise_conv2d = IsDepthwiseConv2D(ref_call, param, kernel_layout); @@ -857,8 +857,8 @@ Message Conv2DBackwardPrep(const Call& call, const Array& in_messages) CHECK(param != nullptr); Layout kernel_layout(param->kernel_layout); Layout out_layout(param->out_layout == "" ? param->data_layout : param->out_layout); - int c_big_axis = out_layout.Indexof('C'); - int c_small_axis = out_layout.Indexof('c'); + int c_big_axis = out_layout.IndexOf(LayoutAxis::Get('C')); + int c_small_axis = out_layout.IndexOf(LayoutAxis::Get('c')); CHECK_GE(c_big_axis, 0); // For now, we only support simple pattern (no folded weight/data) @@ -869,8 +869,8 @@ Message Conv2DBackwardPrep(const Call& call, const Array& in_messages) // only handle depthwise or full conv2d. // TODO(tvm-team) handle grouped conv by reshape + bcast bool is_depthwise_conv2d = IsDepthwiseConv2D(call, param, kernel_layout); - if (kernel_layout.Indexof('o') < 0 && - kernel_layout.Indexof('i') < 0 && + if (kernel_layout.IndexOf(LayoutAxis::Get('o')) < 0 && + kernel_layout.IndexOf(LayoutAxis::Get('i')) < 0 && c_small_axis < 0 && (param->groups == 1 || is_depthwise_conv2d)) { return MessageNode::make({c_big_axis}, false); @@ -891,16 +891,16 @@ Expr Conv2DBackwardTransform(const Call& call, CHECK(param != nullptr); Layout kernel_layout(param->kernel_layout); Layout out_layout(param->out_layout == "" ? param->data_layout : param->out_layout); - int c_big_axis = out_layout.Indexof('C'); + int c_big_axis = out_layout.IndexOf(LayoutAxis::Get('C')); CHECK_GE(c_big_axis, 0); // For now, we only support simple pattern (no folded weight/data) // TODO(tvm-team) support general data layout - CHECK_EQ(kernel_layout.Indexof('o'), -1); - CHECK_EQ(kernel_layout.Indexof('i'), -1); + CHECK_EQ(kernel_layout.IndexOf(LayoutAxis::Get('o')), -1); + CHECK_EQ(kernel_layout.IndexOf(LayoutAxis::Get('i')), -1); CHECK(message->axes.size() == 1 && c_big_axis == message->axes[0]->value); - int big_oc_axis = kernel_layout.Indexof('O'); + int big_oc_axis = kernel_layout.IndexOf(LayoutAxis::Get('O')); // Check it must be depthwise or full conv2d. 
bool is_depthwise_conv2d = IsDepthwiseConv2D(call, param, kernel_layout); CHECK(param->groups == 1 || is_depthwise_conv2d); diff --git a/src/relay/pass/fuse_ops.cc b/src/relay/pass/fuse_ops.cc index 3227a70f3e7c..66ff9caf4ae4 100644 --- a/src/relay/pass/fuse_ops.cc +++ b/src/relay/pass/fuse_ops.cc @@ -6,7 +6,7 @@ * \brief This is a backend-aware optimization pass. * Fuse necessary ops into a single one. */ -#include +#include #include #include #include @@ -208,12 +208,24 @@ class IndexedForwardGraph::Creator : private ExprVisitor { Node* node = graph_.node_map.at(call); static auto fpattern = Op::GetAttr("TOpPattern"); - // setup pattern. + // Now we set the pattern of this call. + // + // If we see a call mentioning an operator we should mark it with its + // annotated pattern. + // + // If the pattern is not annotated we will default to opaque. + // + // Finally if the operator position is not a call node we will + // need to call Update, as it may be an arbitrary expression. OpPatternKind op_pattern = kOpaque; if (const OpNode* opnode = call->op.as()) { op_pattern = static_cast(fpattern[GetRef(opnode)]); + } else { + this->Update(call->op, node, kOpaque); } + node->pattern = op_pattern; + this->Update(call->op, nullptr, kOpaque); const auto* rtype = call->checked_type().as(); // pass the message back to all the children it references. for (size_t i = 0; i < call->args.size(); ++i) { @@ -762,7 +774,7 @@ class FuseMutator : private ExprMutator { } else { // This is an intermediate node of a fused function // simply return the new call. - return new_call; + return std::move(new_call); } } else { return ExprMutator::VisitExpr_(call); diff --git a/src/relay/pass/gradient.cc b/src/relay/pass/gradient.cc index 780490a45b0a..d564e02b5596 100644 --- a/src/relay/pass/gradient.cc +++ b/src/relay/pass/gradient.cc @@ -85,10 +85,10 @@ using ADValue = std::shared_ptr; /*! \brief AD over a program which generates a tensor output. */ struct ADTensor : ADValueNode { - Expr foward; + Expr forward; mutable Expr reverse; // must be a variable to avoid duplication - ADTensor(LetList* ll, const Expr& foward) : - foward(ll->Push(foward)), reverse(ll->Push(ZeroLike(this->foward))) { } + ADTensor(LetList* ll, const Expr& forward) : + forward(ll->Push(forward)), reverse(ll->Push(ZerosLike(this->forward))) { } }; /*! 
\brief A staged representation of the program, we reflect @@ -105,14 +105,14 @@ struct ADFunction : ADValueNode { func(func) { } }; -struct ReverseAD : ExprFunctor { +struct FirstOrderReverseAD : ExprFunctor { const OpMap rev_map = Op::GetAttr("FPrimalGradient"); std::vector> backprop_actions; // we assume no closure so no need for lexical scoping std::unordered_map env; LetList* ll; - ReverseAD(LetList* ll) : ll(ll) { } + FirstOrderReverseAD(LetList* ll) : ll(ll) { } ADValue VisitExpr_(const OpNode* op) final { Op op_ref = GetRef(op); @@ -121,21 +121,22 @@ struct ReverseAD : ExprFunctor { return std::make_shared([this, op_ref](const std::vector& args, const Attrs& attrs, const tvm::Array& type_args) { - std::vector call_args; - for (const ADValue& adval : args) { - call_args.push_back(adval->get().foward); + std::vector call_args; + for (const ADValue& adval : args) { + call_args.push_back(adval->get().forward); + } + auto orig = CallNode::make(op_ref, call_args, attrs, type_args); + auto ret = std::make_shared(ll, orig); + backprop_actions.push_back([this, args, orig, ret, op_ref](LetList* ll) { + tvm::Array rev = rev_map[op_ref](orig, ret->reverse); + CHECK(args.size() == rev.size()); + for (size_t i = 0; i < args.size(); ++i) { + args[i]->get().reverse = + ll->Push(Add(args[i]->get().reverse, rev[i])); } - auto orig = CallNode::make(op_ref, call_args, attrs, type_args); - auto ret = std::make_shared(ll, orig); - backprop_actions.push_back([this, args, orig, ret, op_ref](LetList* ll) { - tvm::Array rev = rev_map[op_ref](orig, ret->reverse); - for (size_t i = 0; i < args.size(); ++i) { - args[i]->get().reverse = - ll->Push(Add(args[i]->get().reverse, rev[i])); - } - }); - return ret; }); + return ret; + }); } ADValue VisitExpr_(const ConstantNode* op) final { @@ -172,6 +173,23 @@ struct ReverseAD : ExprFunctor { } }; +Type GradRetType(const Function& f) { + // if type annotations are provided, we will construct a ret type; + // otherwise, leave it to be inferred + if (!f->ret_type.defined()) { + return Type(); + } + std::vector vt; + for (const auto& p : f->params) { + if (!p->type_annotation.defined()) { + return Type(); + } + vt.push_back(p->type_annotation); + } + + return TupleTypeNode::make({f->ret_type, TupleTypeNode::make(vt)}); +} + Expr FirstOrderGradient(const Expr& re, const Module& mod) { // Currently we first remove any global functions for the first // order case. @@ -182,7 +200,7 @@ Expr FirstOrderGradient(const Expr& re, const Module& mod) { // We will then build a sequence of lets which implement reverse mode. 
Expr body = LetList::With([&](LetList* ll) { - ReverseAD reverse_ad(ll); + FirstOrderReverseAD reverse_ad(ll); ADValue rev = reverse_ad(e); std::vector args; for (const auto& p : f->params) { @@ -191,46 +209,131 @@ Expr FirstOrderGradient(const Expr& re, const Module& mod) { auto c = rev->get().func(args, Attrs(), {}); const auto& res = c->get(); Expr grad = LetList::With([&](LetList* ll) { - res.reverse = OneLike(res.foward); - for (auto it = reverse_ad.backprop_actions.rbegin(); - it != reverse_ad.backprop_actions.rend(); - ++it) { - (*it)(ll); + res.reverse = OnesLike(res.forward); + for (auto it = reverse_ad.backprop_actions.rbegin(); + it != reverse_ad.backprop_actions.rend(); + ++it) { + (*it)(ll); + } + std::vector grad_res; + for (const auto& a : args) { + grad_res.push_back(a->get().reverse); + } + return TupleNode::make(grad_res); + }); + return Pair(res.forward, grad); + }); + + return FunctionNode::make(f->params, body, GradRetType(GetRef(f)), {}); +} + +TVM_REGISTER_API("relay._ir_pass.first_order_gradient") +.set_body([](TVMArgs args, TVMRetValue* ret) { + CHECK_EQ(args.size(), 2); + *ret = FirstOrderGradient(args[0], args[1]); +}); + +struct ReverseADType : TypeMutator { + Type VisitType_(const TensorTypeNode* ttn) final { + Type t = GetRef(ttn); + return TupleTypeNode::make({t, RefTypeNode::make(t)}); + } +}; + +struct ReverseAD : ExprMutator { + Var bp; + const OpMap rev_map = Op::GetAttr("FPrimalGradient"); + + ReverseAD(const Var& bp) : bp(bp) { } + + Expr VisitExpr_(const OpNode* op) final { + LOG(FATAL) << "op should only be inside call"; + throw; + } + + Expr VisitExpr_(const CallNode* op) final { + if (const OpNode* op_node = op->op.as()) { + Op op_ref = GetRef(op_node); + CHECK(rev_map.count(op_ref)) + << op_node->name << " does not have reverse mode defined"; + return LetList::With([&](LetList* ll) { + std::vector args; + for (const auto& arg : op->args) { + args.push_back(ll->Push(VisitExpr(arg))); } - std::vector grad_res; - for (const auto& a : args) { - grad_res.push_back(a->get().reverse); + std::vector orig_args; + for (const auto& arg : args) { + orig_args.push_back(GetField(VisitExpr(arg), 0)); } - return TupleNode::make(grad_res); + Expr orig = CallNode::make(op->op, orig_args, op->attrs, op->type_args); + Var orig_var = ll->Push(orig); + auto ref = ll->Push(RefCreateNode::make(ZerosLike(orig_var))); + auto bpv = ll->Push(RefReadNode::make(bp)); + Expr nbp = FunctionNode::make( + {}, + LetList::With([&](LetList* ll) { + tvm::Array rev = rev_map[op_ref](orig, ll->Push(RefReadNode::make(ref))); + CHECK(args.size() == rev.size()); + for (size_t i = 0; i < args.size(); ++i) { + ll->Push(RefWriteNode::make(GetField(args[i], 1), + Add(ll->Push(RefReadNode::make(GetField(args[i], 1))), + rev[i]))); + } + return CallNode::make(bpv, {}); + }), + TupleTypeNode::make({}), + {}); + ll->Push(RefWriteNode::make(bp, nbp)); + return Pair(orig_var, ref); }); - return Pair(res.foward, grad); - }); - - // if type annotations are provided, we will construct a ret type; - // otherwise, leave it to be inferred - Type ret_type = Type(); - std::vector vt; - bool missing = !f->ret_type.defined(); - for (const auto& p : f->params) { - if (missing || !p->type_annotation.defined()) { - missing = true; - break; } - vt.push_back(p->type_annotation); + return ExprMutator::VisitExpr_(op); + } + + Expr VisitExpr_(const ConstantNode* op) final { + Expr e = GetRef(op); + return Pair(e, RefCreateNode::make(ZerosLike(e))); } - if (!missing) { - ret_type = TupleTypeNode::make({f->ret_type, 
TupleTypeNode::make(vt)}); + Type VisitType(const Type& t) final { + return t.defined() ? ReverseADType()(t) : t; } +}; - return FunctionNode::make(f->params, body, ret_type, {}); +Expr BPEmpty() { + Expr unitF = FunctionNode::make({}, TupleNode::make({}), TupleTypeNode::make({}), {}); + return RefCreateNode::make(unitF); } -TVM_REGISTER_API("relay._ir_pass.first_order_gradient") - .set_body([](TVMArgs args, TVMRetValue* ret) { - CHECK_EQ(args.size(), 2); - *ret = FirstOrderGradient(args[0], args[1]); - }); +Expr Gradient(const Expr& re, const Module& mod) { + auto e = DeGlobal(mod, re); + auto f = e.as(); + CHECK(f) << "input need to be a function"; + CHECK(f->type_params.size() == 0) << "no polymorphism supported for now"; + Expr body = LetList::With([&](LetList* ll) { + Var bp = ll->Push(BPEmpty()); + Expr rev = ReverseAD(bp)(e); + std::vector args; + for (const auto& p : f->params) { + args.push_back(ll->Push(Pair(p, RefCreateNode::make(ZerosLike(p))))); + } + auto c = ll->Push(CallNode::make(rev, args)); + ll->Push(RefWriteNode::make(GetField(c, 1), OnesLike(GetField(c, 0)))); + ll->Push(CallNode::make(RefReadNode::make(bp), {})); + std::vector ret; + for (const auto& a : args) { + ret.push_back(RefReadNode::make(GetField(a, 1))); + } + return Pair(GetField(c, 0), TupleNode::make(ret)); + }); + return FunctionNode::make(f->params, body, GradRetType(GetRef(f)), {}); +} + +TVM_REGISTER_API("relay._ir_pass.gradient") +.set_body([](TVMArgs args, TVMRetValue* ret) { + CHECK_EQ(args.size(), 2); + *ret = Gradient(args[0], args[1]); +}); } // namespace relay } // namespace tvm diff --git a/src/relay/pass/mac_count.cc b/src/relay/pass/mac_count.cc index 5709d7d0ea31..e801cdc37d12 100644 --- a/src/relay/pass/mac_count.cc +++ b/src/relay/pass/mac_count.cc @@ -11,24 +11,93 @@ #include #include #include -#include "../op/layout.h" +#include namespace tvm { namespace relay { -namespace { +namespace mac_count { -bool IsConv2DNode(const ExprNode* node) { - const auto* call_node = dynamic_cast(node); - return call_node != nullptr && call_node->attrs.as(); +inline int64_t GetCartesianProd(Array arr) { + int64_t ret = 1; + for (size_t i = 0; i < arr.size(); i++) { + const auto* intImm = arr[i].as(); + ret *= static_cast(intImm->value); + } + return ret; +} + +/* + * \brief Preparation function for MAC count. + * \param call_node The call node. + * \return The number of MACs. 
+ */ +using FMacCount = runtime::TypedPackedFunc< + int64_t(const Call& call_node)>; + +//---------------------------------------------- +// Per operator defs for MAC count +//---------------------------------------------- + +int64_t ConvMacCount(const Call& call_node) { + if (!call_node->checked_type_.defined()) { + LOG(WARNING) << "The infer type pass should be called before the mac count pass"; + return 0; + } + Array args = call_node->args; + CHECK(args.size() == 2) + << "The number of input arguments of a CONV 2D node should be 2."; + const auto* conv_2d_attr = call_node->attrs.as(); + const auto* data_type = args[0]->checked_type().as(); + Array data_shape = data_type->shape; + std::string data_layout = conv_2d_attr->data_layout; + int32_t C_ind = Layout(data_layout).IndexOf(LayoutAxis::Get('C')); + int32_t c_ind = Layout(data_layout).IndexOf(LayoutAxis::Get('c')); + CHECK(C_ind != -1) + << "There is no input channel dimension."; + int64_t input_channel = static_cast(data_shape[C_ind].as()->value); + if (c_ind != -1) + input_channel *= static_cast(data_shape[c_ind].as()->value); + Array kernel_size = conv_2d_attr->kernel_size; + CHECK(kernel_size.size() == 2) + << "The dimension of the kernel size in Conv 2D should be 2."; + const auto* expr = call_node->checked_type().as(); + Array output_tensor = expr->shape; + CHECK(output_tensor.size() == 4 || output_tensor.size() == 5) + << "The dimension of the output tensor in Conv 2D should be 4 or 5."; + int64_t count = input_channel * GetCartesianProd(output_tensor) * GetCartesianProd(kernel_size); + return count; } -bool IsDenseNode(const ExprNode* node) { - const auto* call_node = dynamic_cast(node); - return call_node != nullptr && call_node->attrs.as(); +int64_t DenseMacCount(const Call& call_node) { + if (!call_node->checked_type_.defined()) { + LOG(WARNING) << "The infer type pass should be called before the mac count pass"; + return 0; + } + Array args = call_node->args; + CHECK(args.size() == 2) + << "The number of input arguments of a Dense node should be 2."; + const auto* data_type = args[0]->checked_type().as(); + const auto* weight_type = args[1]->checked_type().as(); + Array data_shape = data_type->shape; + Array weight_shape = weight_type->shape; + CHECK(data_shape.size() == 2 && weight_shape.size() == 2) + << "The dimension of an input tensor to Dense node should be 2."; + int64_t d1 = static_cast(data_shape[0].as()->value); + int64_t d2 = static_cast(data_shape[1].as()->value); + int64_t d3 = static_cast(weight_shape[0].as()->value); + int64_t d4 = static_cast(weight_shape[1].as()->value); + CHECK(d2 == d4) + << "The dimensions of input arguments do not match."; + int64_t count = d1 * d2 * d3; + return count; } -} // namespace +RELAY_REGISTER_OP("nn.conv2d") +.set_attr("FMacCount", ConvMacCount); + +RELAY_REGISTER_OP("nn.dense") +.set_attr("FMacCount", DenseMacCount); class MacCounter : private ExprVisitor { public: @@ -44,91 +113,13 @@ class MacCounter : private ExprVisitor { private: void VisitExpr_(const CallNode* call_node) final { - if (IsConv2DNode(call_node)) { - count_ += ComputeConv2DMacs(call_node); - } else if (IsDenseNode(call_node)) { - count_ += ComputeDenseMacs(call_node); - } + static const auto& fprep = + Op::GetAttr("FMacCount"); + auto f = fprep.get(call_node->op, nullptr); + if (f != nullptr) count_ += f(GetRef(call_node)); ExprVisitor::VisitExpr_(call_node); } - /* - * \brief Get the number of MACs of a CONV 2D node. - * \param call_node The CONV 2D call node. - * \return The number of MACs. 
- */ - int64_t ComputeConv2DMacs(const CallNode* call_node) { - CHECK(IsConv2DNode(call_node)) - << "The input call node must be a CONV 2D node."; - if (!call_node->checked_type_.defined()) { - LOG(WARNING) << "The infer type pass should be called before the mac count pass"; - return 0; - } - Array args = call_node->args; - CHECK(args.size() == 2) - << "The number of input arguments of a CONV 2D node should be 2."; - const auto* conv_2d_attr = call_node->attrs.as(); - const auto* data_type = args[0]->checked_type().as(); - Array data_shape = data_type->shape; - std::string data_layout = conv_2d_attr->data_layout; - int32_t C_ind = Layout(data_layout).Indexof('C'); - int32_t c_ind = Layout(data_layout).Indexof('c'); - CHECK(C_ind != -1) - << "There is no input channel dimension."; - int64_t input_channel = static_cast(data_shape[C_ind].as()->value); - if (c_ind != -1) - input_channel *= static_cast(data_shape[c_ind].as()->value); - Array kernel_size = conv_2d_attr->kernel_size; - CHECK(kernel_size.size() == 2) - << "The dimension of the kernel size in Conv 2D should be 2."; - const auto* expr = call_node->checked_type().as(); - Array output_tensor = expr->shape; - CHECK(output_tensor.size() == 4 || output_tensor.size() == 5) - << "The dimension of the output tensor in Conv 2D should be 4 or 5."; - int64_t count = input_channel * GetCartesianProd(output_tensor) * GetCartesianProd(kernel_size); - return count; - } - - /* - * \brief Get the number of MACs of a Dense node. - * \param call_node The Dense call node. - * \return The number of MACs. - */ - int64_t ComputeDenseMacs(const CallNode* call_node) { - CHECK(IsDenseNode(call_node)) - << "The input call node must be a Dense node."; - if (!call_node->checked_type_.defined()) { - LOG(WARNING) << "The infer type pass should be called before the mac count pass"; - return 0; - } - Array args = call_node->args; - CHECK(args.size() == 2) - << "The number of input arguments of a Dense node should be 2."; - const auto* data_type = args[0]->checked_type().as(); - const auto* weight_type = args[1]->checked_type().as(); - Array data_shape = data_type->shape; - Array weight_shape = weight_type->shape; - CHECK(data_shape.size() == 2 && weight_shape.size() == 2) - << "The dimension of an input tensor to Dense node should be 2."; - int64_t d1 = static_cast(data_shape[0].as()->value); - int64_t d2 = static_cast(data_shape[1].as()->value); - int64_t d3 = static_cast(weight_shape[0].as()->value); - int64_t d4 = static_cast(weight_shape[1].as()->value); - CHECK(d2 == d4) - << "The dimensions of input arguments do not match."; - int64_t count = d1 * d2 * d3; - return count; - } - - int64_t GetCartesianProd(Array arr) { - int64_t ret = 1; - for (size_t i = 0; i < arr.size(); i++) { - const auto* intImm = arr[i].as(); - ret *= static_cast(intImm->value); - } - return ret; - } - int64_t count_; }; @@ -141,5 +132,6 @@ TVM_REGISTER_API("relay._ir_pass.GetTotalMacNumber") *ret = GetTotalMacNumber(args[0]); }); +} // namespace mac_count } // namespace relay } // namespace tvm diff --git a/src/relay/pass/pattern_util.h b/src/relay/pass/pattern_util.h index 08fc017f41eb..96038745474e 100644 --- a/src/relay/pass/pattern_util.h +++ b/src/relay/pass/pattern_util.h @@ -8,13 +8,13 @@ #ifndef TVM_RELAY_PASS_PATTERN_UTIL_H_ #define TVM_RELAY_PASS_PATTERN_UTIL_H_ +#include #include #include #include #include #include #include -#include "../op/layout.h" namespace tvm { @@ -155,9 +155,8 @@ inline bool IsDepthwiseConv2D(const Call& call, const Conv2DAttrs* param, const Layout& 
kernel_layout) { static const Layout kOIHW("OIHW"); - auto wshape = ConvertLayout( - call->args[1]->type_as()->shape, - kernel_layout, kOIHW); + const auto bilayout = BijectiveLayoutNode::make(kernel_layout, kOIHW); + auto wshape = bilayout.ForwardShape(call->args[1]->type_as()->shape); return is_const_int(wshape[0], param->groups) && is_const_int(wshape[1], 1); } @@ -192,6 +191,21 @@ inline Constant MakeConstantScalar(DataType dtype, T value) { return ConstantNode::make(arr); } +/*! + * \brief Check if two expressions are equal scalars. + * \param a The expression to be checked. + * \param b The expression to be checked + * \return Whether two expressions are equal scalars. + */ +inline bool IsEqualScalar(const Expr& a, const Expr& b) { + const auto* constant_a = a.as(); + const auto* constant_b = b.as(); + if (!constant_a || !constant_b || !constant_a->is_scalar() || !constant_b->is_scalar()) { + return false; + } + return AlphaEqual(a, b); +} + inline Expr GetField(Expr t, size_t i) { return TupleGetItemNode::make(t, i); } @@ -285,12 +299,12 @@ inline Expr Divide(Expr lhs, Expr rhs) { return CallNode::make(op, {lhs, rhs}, Attrs(), {}); } -inline Expr ZeroLike(Expr e) { +inline Expr ZerosLike(Expr e) { static const Op& op = Op::Get("zeros_like"); return CallNode::make(op, {e}); } -inline Expr OneLike(Expr e) { +inline Expr OnesLike(Expr e) { static const Op& op = Op::Get("ones_like"); return CallNode::make(op, {e}); } diff --git a/src/relay/pass/quantize.cc b/src/relay/pass/quantize.cc index ff6c8ea5c187..a1b93546b84f 100644 --- a/src/relay/pass/quantize.cc +++ b/src/relay/pass/quantize.cc @@ -124,7 +124,7 @@ TVM_REGISTER_API("relay._quantize.annotate") } return e; }; - return ForwardRewrite(expr, "FQAnnotateRewrite", nullptr, nullptr); + return ForwardRewrite(expr, "FQAnnotateRewrite", nullptr, fmulti_ref); }); @@ -329,9 +329,11 @@ float ChooseDomScale(const std::vector& nptrs) { /* \brief Unify the dom scale of arguments */ -Array UnifyDTypeScale(const Array& args, +Array UnifyDTypeScale(const Array& ref_args, + const Array& args, DataType* dtype_ptr, Expr* scale_ptr) { + static const Op& simulated_quantize = Op::Get("relay.op.annotation.simulated_quantize"); const QConfig& cfg = QConfig::Current(); std::vector nptrs; @@ -344,10 +346,19 @@ Array UnifyDTypeScale(const Array& args, } // unify the data type + CHECK_EQ(ref_args.size(), args.size()); DataType dtype = cfg->dtype_activation; for (size_t i = 0; i < ret.size(); ++i) { + auto ref_arg = ref_args[i].as(); if (nptrs[i]->dtype != dtype) { ret.Set(i, Cast(ret[i], dtype)); + } else if (ref_arg && ref_arg->op.same_as(simulated_quantize) && + ref_arg->attrs.as()->kind == kQInput) { + auto new_arg = Cast(ret[i], cfg->dtype_input); + if (cfg->use_stop_fusion) { + new_arg = StopFusion(new_arg); + } + ret.Set(i, Cast(new_arg, dtype)); } } @@ -371,7 +382,7 @@ Expr AddRealize(const Call& ref_call, if (new_args[0].as() && new_args[1].as()) { DataType dtype; Expr dom_scale; - Array ret_args = UnifyDTypeScale(new_args, &dtype, &dom_scale); + Array ret_args = UnifyDTypeScale(ref_call->args, new_args, &dtype, &dom_scale); Expr ret = ForwardOp(ref_call, ret_args); return QRealizeIntExprNode::make(ret, dom_scale, dtype); } @@ -387,15 +398,19 @@ Expr ConcatenateRealize(const Call& ref_call, const Array& new_args, const NodeRef& ctx) { CHECK_EQ(new_args.size(), 1); + CHECK_EQ(ref_call->args.size(), 1); const auto* tuple = new_args[0].as(); + const auto* ref_tuple = ref_call->args[0].as(); CHECK(tuple); + CHECK(ref_tuple); const Array& arr = 
tuple->fields; + const Array& ref_arr = ref_tuple->fields; if (arr[0].as()) { DataType dtype; Expr dom_scale; - Array ret_args = UnifyDTypeScale(arr, &dtype, &dom_scale); + Array ret_args = UnifyDTypeScale(ref_arr, arr, &dtype, &dom_scale); Expr ret = ForwardOp(ref_call, {TupleNode::make(ret_args)}); return QRealizeIntExprNode::make(ret, dom_scale, dtype); } else { @@ -530,7 +545,8 @@ TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable) p->stream << "skip_k_conv==" << op->skip_k_conv << ", "; p->stream << "round_for_shift==" << op->round_for_shift << ", "; p->stream << "store_lowbit_output==" << op->store_lowbit_output << ", "; - p->stream << "debug_enabled_ops==" << op->debug_enabled_ops; + p->stream << "debug_enabled_ops==" << op->debug_enabled_ops << ", "; + p->stream << "use_stop_fusion==" << op->use_stop_fusion; p->stream << ")"; }); diff --git a/src/relay/pass/quantize.h b/src/relay/pass/quantize.h index b1a15308d914..ed0a8b10a574 100644 --- a/src/relay/pass/quantize.h +++ b/src/relay/pass/quantize.h @@ -110,6 +110,7 @@ class QConfigNode : public Node { bool round_for_shift = true; bool store_lowbit_output = true; Array debug_enabled_ops = Array(NodePtr(nullptr)); + bool use_stop_fusion = true; void VisitAttrs(AttrVisitor* v) final { v->Visit("nbit_input", &nbit_input); @@ -123,6 +124,7 @@ class QConfigNode : public Node { v->Visit("round_for_shift", &round_for_shift); v->Visit("store_lowbit_output", &store_lowbit_output); v->Visit("debug_enabled_ops", &debug_enabled_ops); + v->Visit("use_stop_fusion", &use_stop_fusion); } static constexpr const char* _type_key = "relay.quantize.QConfig"; diff --git a/src/relay/pass/to_anf.cc b/src/relay/pass/to_a_normal_form.cc similarity index 74% rename from src/relay/pass/to_anf.cc rename to src/relay/pass/to_a_normal_form.cc index 6d65fe449fb0..46a4b92ac9b9 100644 --- a/src/relay/pass/to_anf.cc +++ b/src/relay/pass/to_a_normal_form.cc @@ -120,6 +120,22 @@ class DependencyGraph::Creator : private ExprFunctor { Depend(n, t->tuple); } + void VisitExpr_(const RefCreateNode* r) final { + DependencyGraph::Node* n = graph_.expr_node[GetRef(r)]; + Depend(n, r->value); + } + + void VisitExpr_(const RefReadNode* r) final { + DependencyGraph::Node* n = graph_.expr_node[GetRef(r)]; + Depend(n, r->ref); + } + + void VisitExpr_(const RefWriteNode* r) final { + DependencyGraph::Node* n = graph_.expr_node[GetRef(r)]; + Depend(n, r->ref); + Depend(n, r->value); + } + void VisitExpr_(const IfNode* i) final { DependencyGraph::Node* n = graph_.expr_node[GetRef(i)]; DependencyGraph::Node* t = NewNode(true); @@ -150,6 +166,21 @@ class DependencyGraph::Creator : private ExprFunctor { graph_.post_dfs_order.push_back(b); } + void VisitExpr_(const MatchNode* m) final { + DependencyGraph::Node* n = graph_.expr_node[GetRef(m)]; + Depend(n, m->data); + std::vector v; + for (const Clause& c : m->clauses) { + DependencyGraph::Node* b = NewNode(true); + Depend(n, b); + Depend(b, c->rhs); + v.push_back(b); + } + for (auto it = v.rbegin(); it != v.rend(); ++it) { + graph_.post_dfs_order.push_back(*it); + } + } + void VisitExpr_(const VarNode* v) final { } void VisitExpr_(const GlobalVarNode* v) final { } @@ -157,13 +188,15 @@ class DependencyGraph::Creator : private ExprFunctor { void VisitExpr_(const ConstantNode* c) final { } void VisitExpr_(const OpNode* o) final { } + + void VisitExpr_(const ConstructorNode* c) final { } }; DependencyGraph DependencyGraph::Create(common::Arena* arena, const Expr& body) { return Creator(arena).Create(body); } -Expr ToANF(const Expr& e, const Module& m, 
std::set* gv); +Expr ToANormalForm(const Expr& e, const Module& m, std::set* gv); struct ScopeNode; using Scope = std::shared_ptr; @@ -223,13 +256,17 @@ bool IsPrimitiveFunction(const Expr& e) { return e.as() && Downcast(e)->IsPrimitive(); } +/* Special care is needed to handle local recursion. + * Fill additionally take a (possibly null) Var argument, + * If it is not null, Fill is required to bind the transformed result to that var. + */ class Fill : ExprFunctor { public: - static Expr ToANF(const Expr& e, - const Module& m, - const DependencyGraph& dg, - std::unordered_map* node_scope, - std::set* gv) { + static Expr ToANormalForm(const Expr& e, + const Module& m, + const DependencyGraph& dg, + std::unordered_map* node_scope, + std::set* gv) { Fill fi(m, dg, node_scope, gv); return fi.GetScope(e)->ll->Get(fi.VisitExpr(e)); } @@ -274,12 +311,18 @@ class Fill : ExprFunctor { } Expr VisitExpr(const Expr& e) { - Var v = VarNode::make(std::string("x"), IncompleteTypeNode::make(Kind::kType)); - return this->VisitExpr(e, v); + return this->VisitExpr(e, Var()); + } + + Expr Atomic(const Expr& orig, const Expr& now, const Var& v) { + return v.defined() ? GetScope(orig)->ll->Push(v, now) : now; } Expr Compound(const Expr& orig, const Expr& now, const Var& v) { - return GetScope(orig)->ll->Push(v, now); + Var var = v.defined() ? + v : + VarNode::make(std::string("x"), IncompleteTypeNode::make(Kind::kType)); + return GetScope(orig)->ll->Push(var, now); } Expr VisitExpr_(const CallNode* c, const Var& v) final { @@ -305,6 +348,21 @@ class Fill : ExprFunctor { return Compound(e, TupleGetItemNode::make(VisitExpr(t->tuple), t->index), v); } + Expr VisitExpr_(const RefCreateNode* r, const Var& v) final { + Expr e = GetRef(r); + return Compound(e, RefCreateNode::make(VisitExpr(r->value)), v); + } + + Expr VisitExpr_(const RefReadNode* r, const Var& v) final { + Expr e = GetRef(r); + return Compound(e, RefReadNode::make(VisitExpr(r->ref)), v); + } + + Expr VisitExpr_(const RefWriteNode* r, const Var& v) final { + Expr e = GetRef(r); + return Compound(e, RefWriteNode::make(VisitExpr(r->ref), VisitExpr(r->value)), v); + } + Expr VisitExpr_(const IfNode* i, const Var& v) final { Expr e = GetRef(i); Expr ret = IfNode::make(VisitExpr(i->cond), @@ -341,24 +399,43 @@ class Fill : ExprFunctor { } Expr VisitExpr_(const VarNode* vn, const Var& v) final { - return GetRef(vn); + Expr e = GetRef(vn); + return Atomic(e, e, v); } Expr VisitExpr_(const GlobalVarNode* gvn, const Var& v) final { GlobalVar gv = GetRef(gvn); if (visited_->count(gv) == 0) { visited_->insert(gv); - mod_->Update(gv, Downcast(relay::ToANF(mod_->Lookup(gv), mod_, visited_))); + mod_->Update(gv, Downcast(relay::ToANormalForm(mod_->Lookup(gv), mod_, visited_))); } - return gv; + return Atomic(gv, gv, v); } Expr VisitExpr_(const OpNode* op, const Var& v) final { - return GetRef(op); + Expr e = GetRef(op); + return Atomic(e, e, v); + } + + Expr VisitExpr_(const ConstructorNode* c, const Var& v) final { + Expr e = GetRef(c); + return Atomic(e, e, v); + } + + Expr VisitExpr_(const MatchNode* m, const Var& v) final { + Expr e = GetRef(m); + Expr data = VisitExpr(m->data); + std::vector clauses; + for (const Clause& c : m->clauses) { + clauses.push_back(ClauseNode::make( + c->lhs, + GetSubScope(e, 1 + clauses.size())->ll->Get(VisitExpr(c->rhs)))); + } + return Compound(e, MatchNode::make(data, clauses), v); } }; -Expr ToANFAux(const Expr& e, const Module& m, std::set* gv) { +Expr ToANormalFormAux(const Expr& e, const Module& m, std::set* gv) { /* When you 
lift a lambda, what is inside is also being lifted.
  *
  * So we must determine the scope of the lambda before determining the scope of its body.
  *
@@ -381,29 +458,29 @@ Expr ToANFAux(const Expr& e, const Module& m, std::set<GlobalVar>* gv) {
  * We do an additional pass to fill all the LetList and we are done.
  */
  std::unordered_map<DependencyGraph::Node*, Scope> node_scope = CalcScope(dg);
-  return Fill::ToANF(e, m, dg, &node_scope, gv);
+  return Fill::ToANormalForm(e, m, dg, &node_scope, gv);
 }

-Expr ToANF(const Expr& e, const Module& m, std::set<GlobalVar>* gv) {
+Expr ToANormalForm(const Expr& e, const Module& m, std::set<GlobalVar>* gv) {
   if (const auto* f = e.as<FunctionNode>()) {
     return FunctionNode::make(f->params,
-                              ToANFAux(f->body, m, gv),
+                              ToANormalFormAux(f->body, m, gv),
                               f->ret_type,
                               f->type_params,
                               f->attrs);
   } else {
-    return ToANFAux(e, m, gv);
+    return ToANormalFormAux(e, m, gv);
   }
 }

-Expr ToANF(const Expr& e, const Module& m) {
+Expr ToANormalForm(const Expr& e, const Module& m) {
   std::set<GlobalVar> gv;
-  return ToANF(e, m, &gv);
+  return ToANormalForm(e, m, &gv);
 }

-TVM_REGISTER_API("relay._ir_pass.to_anf")
+TVM_REGISTER_API("relay._ir_pass.to_a_normal_form")
 .set_body([](TVMArgs args, TVMRetValue* ret) {
-    *ret = ToANF(args[0], args[1]);
+    *ret = ToANormalForm(args[0], args[1]);
 });

 } // namespace relay
diff --git a/src/relay/pass/to_graph_normal_form.cc b/src/relay/pass/to_graph_normal_form.cc
new file mode 100644
index 000000000000..bc1630263e3f
--- /dev/null
+++ b/src/relay/pass/to_graph_normal_form.cc
@@ -0,0 +1,66 @@
+/*!
+ * Copyright (c) 2018 by Contributors
+ *
+ * \file to_graph_normal_form.cc
+ *
+ * \brief Turn a program in A-normal form into graph normal form.
+ */
+#include <tvm/relay/pass.h>
+#include <tvm/relay/expr_functor.h>
+#include "let_list.h"
+
+namespace tvm {
+namespace relay {
+
+class UseVarVisitor : public ExprVisitor {
+ public:
+  explicit UseVarVisitor(const Var& v) : v(v) { }
+
+  static bool UseVar(const Var& v, const Expr& e) {
+    UseVarVisitor uv(v);
+    uv(e);
+    return uv.use_var;
+  }
+
+ private:
+  bool use_var = false;
+  Var v;
+
+  void VisitExpr_(const VarNode* vn) override {
+    use_var = use_var || (v == GetRef<Var>(vn));
+  }
+};
+
+class GNF : public ExprMutator {
+ private:
+  std::unordered_map<Var, Expr, NodeHash, NodeEqual> var_map_;
+  Expr VisitExpr_(const VarNode* vn) override {
+    Var v = GetRef<Var>(vn);
+    return var_map_.count(v) == 0 ? v : var_map_.at(v);
+  }
+
+  static bool UseVar(const Var& v, const Expr& e) {
+    return UseVarVisitor::UseVar(v, e);
+  }
+
+  static Expr WrapRec(const Var& var, const Expr& val) {
+    return UseVar(var, val) ? LetNode::make(var, val, var) : val;
+  }
+
+  Expr VisitExpr_(const LetNode* ln) override {
+    var_map_.insert(std::pair<Var, Expr>(ln->var, VisitExpr(WrapRec(ln->var, ln->value))));
+    return VisitExpr(ln->body);
+  }
+};
+
+Expr ToGraphNormalForm(const Expr& e) {
+  return GNF()(e);
+}
+
+TVM_REGISTER_API("relay._ir_pass.to_graph_normal_form")
+.set_body([](TVMArgs args, TVMRetValue* ret) {
+  *ret = ToGraphNormalForm(args[0]);
+});
+
+} // namespace relay
+} // namespace tvm
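The two passes are inverses in spirit: to_a_normal_form names every intermediate value with a let, while to_graph_normal_form inlines non-recursive lets back into a DAG-shaped expression. A round-trip sketch, assuming the registrations above are surfaced as relay.ir_pass.to_a_normal_form and relay.ir_pass.to_graph_normal_form, mirroring the TVM_REGISTER_API names (the exact Python signatures are an assumption):

    from tvm import relay

    x = relay.var("x", shape=(2, 2))
    y = relay.add(x, x)                            # shared subexpression
    f = relay.Function([x], relay.multiply(y, y))
    anf = relay.ir_pass.to_a_normal_form(f)        # every value let-bound; `y` named once
    gnf = relay.ir_pass.to_graph_normal_form(anf)  # lets inlined back into a DAG
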
+class GNF : public ExprMutator { + private: + std::unordered_map var_map_; + Expr VisitExpr_(const VarNode* vn) override { + Var v = GetRef(vn); + return var_map_.count(v) == 0 ? v : var_map_.at(v); + } + + static bool UseVar(const Var& v, const Expr& e) { + return UseVarVisitor::UseVar(v, e); + } + + static Expr WrapRec(const Var& var, const Expr& val) { + return UseVar(var, val) ? LetNode::make(var, val, var) : val; + } + + Expr VisitExpr_(const LetNode* ln) override { + var_map_.insert(std::pair(ln->var, VisitExpr(WrapRec(ln->var, ln->value)))); + return VisitExpr(ln->body); + } +}; + +Expr ToGraphNormalForm(const Expr& e) { + return GNF()(e); +} + +TVM_REGISTER_API("relay._ir_pass.to_graph_normal_form") +.set_body([](TVMArgs args, TVMRetValue* ret) { + *ret = ToGraphNormalForm(args[0]); +}); + +} // namespace relay +} // namespace tvm diff --git a/src/relay/pass/type_infer.cc b/src/relay/pass/type_infer.cc index fa3cea610c68..ea6b9a95da50 100644 --- a/src/relay/pass/type_infer.cc +++ b/src/relay/pass/type_infer.cc @@ -53,7 +53,7 @@ bool TupleGetItemRel(const Array& types, const auto* param = attrs.as(); CHECK(param != nullptr); CHECK_GE(param->index, 0); - CHECK_LT(param->index, data->fields.size()); + CHECK_LT(param->index, data->fields.size()); reporter->Assign(types[1], data->fields[param->index]); return true; } @@ -121,7 +121,17 @@ class TypeInferencer : private ExprFunctor, Type Unify(const Type& t1, const Type& t2, const NodeRef& expr) { // TODO(tqchen, jroesch): propagate span to solver try { - return solver_.Unify(t1, t2, expr); + // instantiate higher-order func types when unifying because + // we only allow polymorphism at the top level + Type first = t1; + Type second = t2; + if (auto* ft1 = t1.as()) { + first = InstantiateFuncType(ft1); + } + if (auto* ft2 = t2.as()) { + second = InstantiateFuncType(ft2); + } + return solver_.Unify(first, second, expr); } catch (const dmlc::Error &e) { this->ReportFatalError( expr, @@ -351,6 +361,20 @@ class TypeInferencer : private ExprFunctor, return Downcast(inst_ty); } + // instantiate a polymorphic function type by substituting a fresh + // IncompleteType for each of its type parameters + FuncType InstantiateFuncType(const FuncTypeNode* fn_ty) { + if (fn_ty->type_params.size() == 0) { + return GetRef(fn_ty); + } + + Array type_args; + for (size_t i = 0; i < fn_ty->type_params.size(); i++) { + type_args.push_back(IncompleteTypeNode::make(Kind::kType)); + } + return InstantiateFuncType(fn_ty, type_args); + } + + void AddTypeArgs(const Expr& expr, Array type_args) { auto type_info = type_map_.find(expr); if (type_info == type_map_.end()) { @@ -464,6 +488,9 @@ class TypeInferencer : private ExprFunctor, arg_types.push_back(GetType(param)); } Type rtype = GetType(f->body); + if (auto* ft = rtype.as()) { + rtype = InstantiateFuncType(ft); + } if (f->ret_type.defined()) { rtype = this->Unify(f->ret_type, rtype, GetRef(f)); } @@ -724,7 +751,7 @@ Expr InferType(const Expr& expr, const Module& mod_ref) { // FromExpr wraps a naked expression as a function, we will unbox // it here. if (expr.as()) { - return func; + return std::move(func); } else { return func->body; } diff --git a/src/relay/pass/type_solver.cc b/src/relay/pass/type_solver.cc index fd15c91e79f7..abbd82977499 100644 --- a/src/relay/pass/type_solver.cc +++ b/src/relay/pass/type_solver.cc @@ -400,11 +400,8 @@ Type TypeSolver::Unify(const Type& dst, const Type& src, const NodeRef&) { } void TypeSolver::ReportError(const Error& err, const NodeRef& location) { - this->err_reporter_->ReportAt( - this->current_func, - location, - err); - } + err_reporter_->ReportAt(current_func, location, err); +}
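The `InstantiateFuncType` machinery added to `type_infer.cc` above is what lets a polymorphic function be passed where a monomorphic function type is expected. A sketch adapted from the `test_higher_order_argument` test added later in this diff:

```python
from tvm import relay

# A polymorphic identity function: fn<a>(x: a) -> a
a = relay.TypeVar('a')
x = relay.Var('x', a)
id_func = relay.Function([x], x, a, [a])

# A higher-order function expecting a monomorphic fn(b) -> b argument.
b = relay.TypeVar('b')
f = relay.Var('f', relay.FuncType([b], b))
y = relay.Var('y', b)
ho_func = relay.Function([f, y], f(y), b, [b])

# Unification instantiates id_func's type parameter with a fresh
# IncompleteType, so the call typechecks to int32.
hc = relay.ir_pass.infer_type(ho_func(id_func, relay.const(0, 'int32')))
assert hc.checked_type == relay.scalar_type('int32')
```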
// Add type constraint to the solver. void TypeSolver::AddConstraint(const TypeConstraint& constraint, const NodeRef& loc) { @@ -479,8 +476,8 @@ bool TypeSolver::Solve() { rnode->resolved = false; this->ReportError( RELAY_ERROR( - "an internal invariant was violdated while" \ - "typechecking your program" << + "an internal invariant was violated while " \ + "typechecking your program " << err.what()), rnode->location); } diff --git a/src/relay/pass/well_formed.cc b/src/relay/pass/well_formed.cc index d9c6b617ca5f..159e073673da 100644 --- a/src/relay/pass/well_formed.cc +++ b/src/relay/pass/well_formed.cc @@ -5,6 +5,7 @@ */ #include #include +#include #include namespace tvm { @@ -12,7 +13,7 @@ namespace relay { //! brief make sure each Var is bound at most once. -class WellFormedChecker : private ExprVisitor { +class WellFormedChecker : private ExprVisitor, PatternVisitor { bool well_formed = true; std::unordered_set s; @@ -39,6 +40,14 @@ class WellFormedChecker : private ExprVisitor { CheckWellFormed(f->body); } + void VisitPattern(const Pattern& p) final { + PatternVisitor::VisitPattern(p); + } + + void VisitVar(const Var& v) final { + Check(v); + } + public: bool CheckWellFormed(const Expr& e) { this->VisitExpr(e); diff --git a/src/runtime/graph/debug/graph_runtime_debug.cc b/src/runtime/graph/debug/graph_runtime_debug.cc index 7140a647070f..71a869e13ae6 100644 --- a/src/runtime/graph/debug/graph_runtime_debug.cc +++ b/src/runtime/graph/debug/graph_runtime_debug.cc @@ -38,6 +38,65 @@ class GraphRuntimeDebug : public GraphRuntime { return time; }
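The `RunIndividual` profiler defined next is exposed through the `run_individual` packed function registered further down. A hedged sketch of driving it from Python; the `debug_runtime` creation path is an assumption based on TVM's debug graph runtime and is not part of this diff:

```python
import numpy as np
import tvm
from tvm import relay
from tvm.contrib.debugger import debug_runtime  # assumed import path

x = relay.var("x", shape=(1, 10))
func = relay.Function([x], relay.add(x, x))
graph, lib, params = relay.build(func, target="llvm")

m = debug_runtime.create(graph, lib, tvm.cpu(0))
m.set_input("x", np.ones((1, 10), dtype="float32"))
# number=10 runs per measurement, repeat=3 measurements,
# min_repeat_ms=50 so each measurement lasts at least 50 ms.
m.module["run_individual"](10, 3, 50)
```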
+ /*! + * \brief Run each operation in the graph and print out the runtime per op. + * \param number The number of times to run this function for taking average. + * \param repeat The number of times to repeat the measurement. + In total, the function will be invoked (1 + number x repeat) times, + where the first invocation is a warm-up run that will be discarded in case + there is lazy initialization. + * \param min_repeat_ms The minimum duration of one `repeat` in milliseconds. + By default, one `repeat` contains `number` runs. If this parameter is set, + the parameter `number` will be dynamically adjusted to meet the + minimum duration requirement of one `repeat`. + */ + void RunIndividual(int number, int repeat, int min_repeat_ms) { + // warmup run + GraphRuntime::Run(); + + std::vector time_per_op(op_execs_.size(), 0); + for (int i = 0; i < repeat; ++i) { + std::chrono::time_point< + std::chrono::high_resolution_clock, std::chrono::nanoseconds> tbegin, tend; + double duration_ms = 0.0; + do { + std::fill(time_per_op.begin(), time_per_op.end(), 0); + if (duration_ms > 0.0) { + number = static_cast( + std::max((min_repeat_ms / (duration_ms / number) + 1), + number * 1.618)); // 1.618 (the golden ratio) is an arbitrary growth factor + } + tbegin = std::chrono::high_resolution_clock::now(); + for (int k = 0; k < number; k++) { + for (size_t index = 0; index < op_execs_.size(); ++index) { + if (op_execs_[index]) { + const TVMContext& ctx = data_entry_[entry_id(index, 0)]->ctx; + auto op_tbegin = std::chrono::high_resolution_clock::now(); + op_execs_[index](); + TVMSynchronize(ctx.device_type, ctx.device_id, nullptr); + auto op_tend = std::chrono::high_resolution_clock::now(); + double op_duration = std::chrono::duration_cast< + std::chrono::duration >(op_tend - op_tbegin).count(); + time_per_op[index] += op_duration * 1000; // ms + } + } + } + tend = std::chrono::high_resolution_clock::now(); + duration_ms = std::chrono::duration_cast > + (tend - tbegin).count() * 1000; + } while (duration_ms < min_repeat_ms); + + LOG(INFO) << "Repeat: " << i; + int op = 0; + for (size_t index = 0; index < time_per_op.size(); index++) { + if (op_execs_[index]) { + time_per_op[index] /= number; + LOG(INFO) << "Op #" << op++ << ": " << time_per_op[index] << " ms/iter"; + } + } + } + } + /*! + * \brief Run each operation and get the output. + * \param index The index of the op whose output needs to be returned. @@ -119,6 +178,16 @@ PackedFunc GraphRuntimeDebug::GetFunction( this->DebugGetNodeOutput(args[0], args[1]); } }); + } else if (name == "run_individual") { + return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { + int number = args[0]; + int repeat = args[1]; + int min_repeat_ms = args[2]; + CHECK_GT(number, 0); + CHECK_GT(repeat, 0); + CHECK_GE(min_repeat_ms, 0); + this->RunIndividual(number, repeat, min_repeat_ms); + }); } else { return GraphRuntime::GetFunction(name, sptr_to_self); } diff --git a/src/runtime/opencl/opencl_device_api.cc b/src/runtime/opencl/opencl_device_api.cc index 6bb0948bca91..a5ad66b2def4 100644 --- a/src/runtime/opencl/opencl_device_api.cc +++ b/src/runtime/opencl/opencl_device_api.cc @@ -110,6 +110,10 @@ void* OpenCLWorkspace::AllocDataSpace( } void OpenCLWorkspace::FreeDataSpace(TVMContext ctx, void* ptr) { + // On some OpenCL platforms we have to make sure that the memory object + // is no longer pending in the command queue before releasing it.
+ OPENCL_CALL(clFinish(this->GetQueue(ctx))); + cl_mem mptr = static_cast(ptr); OPENCL_CALL(clReleaseMemObject(mptr)); } @@ -233,6 +237,7 @@ void OpenCLWorkspace::Init(const std::string& type_key, const std::string& devic std::lock_guard lock(this->mu); if (initialized_) return; if (context != nullptr) return; + this->type_key = type_key; // matched platforms std::vector platform_ids = cl::GetPlatformIDs(); if (platform_ids.size() == 0) { @@ -250,7 +255,6 @@ void OpenCLWorkspace::Init(const std::string& type_key, const std::string& devic devices_matched = cl::GetDeviceIDs(platform_id, "cpu"); } if (devices_matched.size() > 0) { - this->type_key = type_key; this->platform_id = platform_id; this->platform_name = cl::GetPlatformInfo(platform_id, CL_PLATFORM_NAME); this->device_type = device_type; diff --git a/tests/cpp/ir_mutator_test.cc b/tests/cpp/ir_mutator_test.cc index 0802d405bbe4..eecced8d90ab 100644 --- a/tests/cpp/ir_mutator_test.cc +++ b/tests/cpp/ir_mutator_test.cc @@ -1,7 +1,7 @@ #include #include #include -#include +#include namespace { using namespace tvm::ir; diff --git a/tests/cpp/packed_func_test.cc b/tests/cpp/packed_func_test.cc index abe26fabe9ea..83c0ba602927 100644 --- a/tests/cpp/packed_func_test.cc +++ b/tests/cpp/packed_func_test.cc @@ -168,7 +168,7 @@ namespace tvm { namespace runtime { template<> -struct extension_class_info { +struct extension_type_info { static const int code = kExtBegin + 1; }; } // runtime diff --git a/tests/cpp/pattern_match_test.cc b/tests/cpp/pattern_match_test.cc index cb746e65660b..ea1a8427e61a 100644 --- a/tests/cpp/pattern_match_test.cc +++ b/tests/cpp/pattern_match_test.cc @@ -107,6 +107,24 @@ TEST(Pattern, Basic) { } } +TEST(Pattern, Integer) { + using namespace tvm; + tvm::Var tx, ty; + arith::PVar c; + arith::PVar v; + { + // We can match integer and Var, both of which are + // special case container of Expr + CHECK((v * c).Match(tx * 3)); + CHECK_EQ(c.Eval()->value, 3); + CHECK((v * 3).Match(tx * 3)); + } + // cannot match c to ty + CHECK(!(v * c).Match(tx * ty)); + // cannot match tx + 1 to v + CHECK(!(v * c).Match((tx + 1) * 3)); +} + int main(int argc, char ** argv) { testing::InitGoogleTest(&argc, argv); testing::FLAGS_gtest_death_test_style = "threadsafe"; diff --git a/tests/cpp/unittest.mk b/tests/cpp/unittest.mk deleted file mode 100644 index b810d63ee4b1..000000000000 --- a/tests/cpp/unittest.mk +++ /dev/null @@ -1,12 +0,0 @@ -GTEST_LIB=$(GTEST_PATH)/lib/ -GTEST_INC=$(GTEST_PATH)/include/ - -TEST_SRC = $(wildcard tests/cpp/*_test.cc) -TEST = $(patsubst tests/cpp/%_test.cc, tests/cpp/%_test, $(TEST_SRC)) - -tests/cpp/%_test: tests/cpp/%_test.cc lib/libtvm.so - $(CXX) -std=c++11 $(CFLAGS) -MM -MT tests/cpp/$* $< >tests/cpp/$*.d - $(CXX) -std=c++11 $(CFLAGS) -I$(GTEST_INC) -o $@ $(filter %.cc %.a, $^) \ - -L$(GTEST_LIB) $(LDFLAGS) -lgtest -Llib -ltvm - --include tests/cpp/*.d diff --git a/tests/lint/pylintrc b/tests/lint/pylintrc index 18f526702ad8..355e2ad5acd1 100644 --- a/tests/lint/pylintrc +++ b/tests/lint/pylintrc @@ -65,7 +65,7 @@ enable=indexing-exception,old-raise-syntax # --enable=similarities". 
If you want to run only the classes checker, but have # no Warning level messages displayed, use"--disable=all --enable=classes # --disable=W" -disable=design,similarities,no-self-use,attribute-defined-outside-init,locally-disabled,star-args,pointless-except,bad-option-value,global-statement,fixme,suppressed-message,useless-suppression,locally-enabled,no-member,no-name-in-module,import-error,unsubscriptable-object,unbalanced-tuple-unpacking,undefined-variable,protected-access,useless-object-inheritance +disable=design,similarities,no-self-use,attribute-defined-outside-init,locally-disabled,star-args,pointless-except,bad-option-value,global-statement,fixme,suppressed-message,useless-suppression,locally-enabled,no-member,no-name-in-module,import-error,unsubscriptable-object,unbalanced-tuple-unpacking,undefined-variable,protected-access,useless-object-inheritance,consider-using-get [REPORTS] diff --git a/tests/python/frontend/mxnet/test_forward.py b/tests/python/frontend/mxnet/test_forward.py index 81a12b041ed7..4679876c181b 100644 --- a/tests/python/frontend/mxnet/test_forward.py +++ b/tests/python/frontend/mxnet/test_forward.py @@ -1,4 +1,5 @@ import numpy as np +import operator import tvm from tvm.contrib import graph_runtime @@ -190,6 +191,194 @@ def test_forward_argmin(): mx_sym = mx.sym.argmin(data, axis=0) verify_mxnet_frontend_impl(mx_sym, (5, 4), (4,)) +def test_forward_slice(): + data = mx.sym.var('data') + mx_sym = mx.sym.slice(data, begin=(0, 1), end=(2, 4)) + verify_mxnet_frontend_impl(mx_sym, (3, 4), (2, 3)) + mx_sym = mx.sym.slice(data, begin=(-1, 1), end=(-3, 4), step=(-1, 2)) + verify_mxnet_frontend_impl(mx_sym, (3, 4), (2, 2)) + +def test_forward_where(): + cond = mx.sym.var('cond') + x = mx.sym.var('x') + y = mx.sym.var('y') + dshape = (2, 2) + dtype = 'float32' + mx_sym = mx.sym.where(cond, x, y) + np_cond = np.array([[0, 1], [-1, 0]]).astype(dtype) + np_x = np.random.uniform(size=dshape).astype(dtype) + np_y = np.random.uniform(size=dshape).astype(dtype) + mx_cond = mx.nd.array(np_cond) + mx_x = mx.nd.array(np_x) + mx_y = mx.nd.array(np_y) + shapes = {'cond': dshape, 'x': dshape, 'y': dshape} + mod = mx.mod.Module(mx_sym, label_names=None, data_names=['cond', 'x', 'y']) + mod.bind(data_shapes=shapes.items(), for_training=False) + mod.init_params() + args, auxs = mod.get_params() + mx_out = mx.nd.where(mx_cond, mx_x, mx_y).asnumpy() + + new_sym, _ = relay.frontend.from_mxnet(mx_sym, shapes, args, auxs) + for target, ctx in ctx_list(): + for kind in ["graph", "debug"]: + intrp = relay.create_executor(kind, ctx=ctx, target=target) + op_res = intrp.evaluate(new_sym)(np_cond, np_x, np_y) + tvm.testing.assert_allclose(op_res.asnumpy(), mx_out) + + +def test_forward_arange(): + def _mx_symbol(F, start, stop, step): + if start is None and step is None: + sym = F.arange(stop) + elif start is None: + sym = F.arange(stop, step=step) + elif step is None: + sym = F.arange(start, stop) + else: + sym = F.arange(start, stop, step) + return sym + + def verify(start, stop, step): + ref_res = _mx_symbol(mx.nd, start, stop, step).asnumpy() + mx_sym = _mx_symbol(mx.sym, start, stop, step) + new_sym, _ = relay.frontend.from_mxnet(mx_sym, {}) + for target, ctx in ctx_list(): + for kind in ["graph", "debug"]: + intrp = relay.create_executor(kind, ctx=ctx, target=target) + op_res = intrp.evaluate(new_sym)() + tvm.testing.assert_allclose(op_res.asnumpy(), ref_res) + verify(0, 20, None) + verify(0, 20, 2) + verify(1, 20, None) + verify(1, 20, 2) + verify(1, 20, 1.5) + verify(1, 20.5, None) + 
verify(1, 20, 3) + verify(20, 1, -1) + verify(20, 1, -1.5) + +def _mx_symbol(F, op_name, inputs): + op = getattr(F, op_name) + return op(*inputs) + +def test_forward_broadcast_ops(): + for op in ["broadcast_add", "broadcast_sub", "broadcast_mul", + "broadcast_div", "broadcast_mod", "broadcast_maximum", + "broadcast_minimum", "broadcast_equal", "broadcast_not_equal", + "broadcast_greater", "broadcast_greater_equal", + "broadcast_lesser", "broadcast_lesser_equal"]: + a_shape = (3, 4, 5) + b_shape = (4, 5) + if op == "broadcast_mod": + dtype = 'int32' + a_np = np.random.randint(1, 100, size=a_shape).astype(dtype) + b_np = np.random.randint(1, 100, size=b_shape).astype(dtype) + else: + dtype = 'float32' + a_np = np.random.uniform(size=a_shape).astype(dtype) + b_np = np.random.uniform(size=b_shape).astype(dtype) + mx_sym = _mx_symbol(mx.sym, op, [mx.sym.var('a'), mx.sym.var('b')]) + ref_res = _mx_symbol(mx.nd, op, [mx.nd.array(a_np), mx.nd.array(b_np)]) + shapes = {'a': a_shape, 'b': b_shape} + new_sym, _ = relay.frontend.from_mxnet(mx_sym, shapes, dtype) + for target, ctx in ctx_list(): + for kind in ["graph", "debug"]: + intrp = relay.create_executor(kind, ctx=ctx, target=target) + op_res = intrp.evaluate(new_sym)(a_np, b_np) + tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy()) + +def test_forward_elemwise_ops(): + for op in ["elemwise_add", "elemwise_sub", "elemwise_mul", + "elemwise_div", "maximum", "minimum"]: + shape = (3, 4, 5) + dtype = 'float32' + a_np = np.random.uniform(size=shape).astype(dtype) + b_np = np.random.uniform(size=shape).astype(dtype) + mx_sym = _mx_symbol(mx.sym, op, [mx.sym.var('a'), mx.sym.var('b')]) + ref_res = _mx_symbol(mx.nd, op, [mx.nd.array(a_np), mx.nd.array(b_np)]) + shapes = {'a': shape, 'b': shape} + new_sym, _ = relay.frontend.from_mxnet(mx_sym, shapes, dtype) + for target, ctx in ctx_list(): + for kind in ["graph", "debug"]: + intrp = relay.create_executor(kind, ctx=ctx, target=target) + op_res = intrp.evaluate(new_sym)(a_np, b_np) + tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy()) + +def test_forward_scalar_ops(): + for op in [operator.add, operator.sub, operator.mul, operator.truediv, + operator.pow, operator.lt, operator.le, operator.eq, + operator.ne, operator.gt, operator.ge]: + dtype='float32' + a_shape = (3, 4, 5) + a_np = np.random.uniform(size=a_shape).astype(dtype) + b_scalar = 2.3 + mx_sym = op(mx.sym.var('a'), b_scalar) + ref_res = op(mx.nd.array(a_np), b_scalar) + shapes = {'a': a_shape} + new_sym, _ = relay.frontend.from_mxnet(mx_sym, shapes, dtype) + for target, ctx in ctx_list(): + for kind in ["graph", "debug"]: + intrp = relay.create_executor(kind, ctx=ctx, target=target) + op_res = intrp.evaluate(new_sym)(a_np) + tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy()) + for op in ["maximum", "minimum"]: + dtype='float32' + a_shape = (3, 4, 5) + a_np = np.random.uniform(size=a_shape).astype(dtype) + b_scalar = 2.3 + mx_sym = _mx_symbol(mx.sym, op, [mx.sym.var('a'), b_scalar]) + ref_res = _mx_symbol(mx.nd, op, [mx.nd.array(a_np), b_scalar]) + shapes = {'a': a_shape} + new_sym, _ = relay.frontend.from_mxnet(mx_sym, shapes, dtype) + for target, ctx in ctx_list(): + for kind in ["graph", "debug"]: + intrp = relay.create_executor(kind, ctx=ctx, target=target) + op_res = intrp.evaluate(new_sym)(a_np) + tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy()) + +def test_forward_slice_axis(): + def verify(shape, axis, begin, end): + data_np = np.random.uniform(size=shape).astype("float32") + ref_res 
= mx.nd.slice_axis(mx.nd.array(data_np), axis, begin, end) + mx_sym = mx.sym.slice_axis(mx.sym.var("data"), axis, begin, end) + new_sym, _ = relay.frontend.from_mxnet(mx_sym, {"data": shape}) + for target, ctx in ctx_list(): + for kind in ["graph", "debug"]: + intrp = relay.create_executor(kind, ctx=ctx, target=target) + op_res = intrp.evaluate(new_sym)(data_np) + tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy()) + verify((3, 4), 0, 1, 2) + verify((3, 4), 0, 1, None) + verify((3, 4), 1, 0, 2) + verify((3, 4), 1, -3, -1) + verify((3, 4), -1, -3, -1) + +def test_forward_slice_like(): + def verify(x_shape, y_shape, axes): + x_np = np.random.uniform(size=x_shape).astype("float32") + y_np = np.random.uniform(size=y_shape).astype("float32") + if axes is None: + ref_res = mx.nd.slice_like(mx.nd.array(x_np), mx.nd.array(y_np)) + mx_sym = mx.sym.slice_like(mx.sym.var("x"), mx.sym.var("y")) + else: + ref_res = mx.nd.slice_like(mx.nd.array(x_np), mx.nd.array(y_np), axes=axes) + mx_sym = mx.sym.slice_like(mx.sym.var("x"), mx.sym.var("y"), axes=axes) + new_sym, _ = relay.frontend.from_mxnet(mx_sym, {"x": x_shape, "y": y_shape}) + for target, ctx in ctx_list(): + for kind in ["graph", "debug"]: + intrp = relay.create_executor(kind, ctx=ctx, target=target) + op_res = intrp.evaluate(new_sym)(x_np, y_np) + tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy()) + verify((3, 4), (2, 3), None) + verify((3, 4), (2, 3), (0, 1)) + verify((3, 4), (2, 3), (0)) + verify((3, 4), (2, 3), (-1)) + +def test_forward_l2_normalize(): + data = mx.sym.var('data') + mx_sym = mx.sym.L2Normalization(data, mode="channel") + verify_mxnet_frontend_impl(mx_sym, (2, 3, 4, 5), (2, 3, 4, 5)) + if __name__ == '__main__': test_forward_mlp() @@ -212,3 +401,11 @@ def test_forward_argmin(): test_forward_zeros_like() test_forward_argmax() test_forward_argmin() + test_forward_where() + test_forward_arange() + test_forward_broadcast_ops() + test_forward_elemwise_ops() + test_forward_scalar_ops() + test_forward_slice_like() + test_forward_slice_axis() + test_forward_l2_normalize() diff --git a/tests/python/frontend/tensorflow/test_forward.py b/tests/python/frontend/tensorflow/test_forward.py index 0db6952d837d..84c431aaf342 100644 --- a/tests/python/frontend/tensorflow/test_forward.py +++ b/tests/python/frontend/tensorflow/test_forward.py @@ -682,6 +682,49 @@ def test_forward_pad(): _test_pad((2, 3), [[1,1], [2,2]], mode="CONSTANT") _test_pad((2, 3), [[1,1], [2,2]], mode="CONSTANT", constant_values=1.0) +####################################################################### +# Logical operators +# -------------------- +def test_logical_and(): + with tf.Graph().as_default(): + in1 = tf.placeholder(tf.bool, shape=[1, 4, 4, 3], name='in1') + in2 = tf.placeholder(tf.bool, shape=[1, 4, 4, 3], name='in2') + out = tf.logical_and(in1, in2, name='out') + in_data1 = np.random.choice(a=[False, True],size=(1, 4, 4, 3)).astype('bool') + in_data2 = np.random.choice(a=[False, True],size=(1, 4, 4, 3)).astype('bool') + compare_tf_with_tvm([in_data1, in_data2], ['in1:0', 'in2:0'], 'out:0') + +def test_logical_or(): + with tf.Graph().as_default(): + in1 = tf.placeholder(tf.bool, shape=[1, 4, 4, 3], name='in1') + in2 = tf.placeholder(tf.bool, shape=[1, 4, 4, 3], name='in2') + out = tf.logical_or(in1, in2, name='out') + in_data1 = np.random.choice(a=[False, True],size=(1, 4, 4, 3)).astype('bool') + in_data2 = np.random.choice(a=[False, True],size=(1, 4, 4, 3)).astype('bool') + compare_tf_with_tvm([in_data1, in_data2], ['in1:0', 'in2:0'], 
'out:0') + +def test_logical_xor(): + with tf.Graph().as_default(): + in1 = tf.placeholder(tf.bool, shape=[1, 4, 4, 3], name='in1') + in2 = tf.placeholder(tf.bool, shape=[1, 4, 4, 3], name='in2') + out = tf.logical_xor(in1, in2, name='out') + in_data1 = np.random.choice(a=[False, True],size=(1, 4, 4, 3)).astype('bool') + in_data2 = np.random.choice(a=[False, True],size=(1, 4, 4, 3)).astype('bool') + compare_tf_with_tvm([in_data1, in_data2], ['in1:0', 'in2:0'], 'out:0') + +def test_logical_not(): + with tf.Graph().as_default(): + in1 = tf.placeholder(tf.bool, shape=[1, 4, 4, 3], name='in1') + out = tf.logical_not(in1, name='out') + in_data1 = np.random.choice(a=[False, True],size=(1, 4, 4, 3)).astype('bool') + compare_tf_with_tvm(in_data1, 'in1:0', 'out:0') + +def test_forward_logical(): + test_logical_and() + test_logical_or() + test_logical_xor() + test_logical_not() + ####################################################################### # Inception V3 @@ -1109,5 +1152,4 @@ def test_forward_rel_ops(): # Relational ops test_forward_rel_ops() - - + test_forward_logical() diff --git a/tests/python/frontend/tflite/test_forward.py b/tests/python/frontend/tflite/test_forward.py index 3c048435fba8..0b314cced520 100644 --- a/tests/python/frontend/tflite/test_forward.py +++ b/tests/python/frontend/tflite/test_forward.py @@ -283,6 +283,53 @@ def test_forward_reshape(): _test_reshape(np.arange(6), [-1]) +####################################################################### +# Concatenation +# ------------- + +def _test_concatenation(data, axis): + """ One iteration of concatenation """ + + assert len(data) >= 1 + need_transpose = False + if len(data[0].shape) == 1 or len(data[0].shape) == 2: + tvm_data = data + elif len(data[0].shape) == 3: + #need_transpose = True + tvm_data = [np.transpose(d, axes=(0, 2, 1)) for d in data] + elif len(data[0].shape) == 4: + need_transpose = True + tvm_data = [np.transpose(d, axes=(0, 3, 1, 2)) for d in data] + else: + raise NotImplementedError("Unsupported input shape {} for concatenation: ".
+ format(str(len(data)))) + + with tf.Graph().as_default(): + in_data = [ + array_ops.placeholder(shape=tensor.shape, dtype=tensor.dtype, name="in_{}".format(idx)) + for idx, tensor in enumerate(data)] + out = array_ops.concat(in_data, axis=axis) + name = ["in_{}:0".format(idx) for idx in range(len(data))] + + compare_tflite_with_tvm(data, tvm_data, name, in_data, [out], need_transpose) + + +def test_forward_concatenation(): + + _test_concatenation( + [np.arange(6).reshape((1, 2, 1, 3)), + np.arange(6).reshape((1, 2, 1, 3))], 1) + + _test_concatenation( + [np.arange(6).reshape((3, 2)), + np.arange(6).reshape((3, 2))], 1) + + _test_concatenation( + [np.arange(6).reshape((2, 1, 1, 3)), + np.arange(6).reshape((2, 1, 1, 3)), + np.arange(6).reshape((2, 1, 1, 3))], 1) + + ####################################################################### # Squeeze # ------- @@ -340,6 +387,7 @@ def test_forward_softmax(): ####################################################################### # Mobilenet # --------- + def test_forward_mobilenet(): '''test mobilenet v1 tflite model''' # MobilenetV1 @@ -347,19 +395,43 @@ def test_forward_mobilenet(): tflite_model_file = tf_testing.get_workload_official( "http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224.tgz", "mobilenet_v1_1.0_224.tflite", temp) - tflite_model_buf = open(tflite_model_file, "rb").read() + with open(tflite_model_file, "rb") as f: + tflite_model_buf = f.read() data = np.random.uniform(size=(1, 224, 224, 3)).astype('float32') tvm_data = np.transpose(data, axes=(0, 3, 1, 2)) tflite_output = run_tflite_graph(tflite_model_buf, data) tvm_output = run_tvm_graph(tflite_model_buf, tvm_data, 'input') tvm.testing.assert_allclose(np.squeeze(tvm_output[0]), np.squeeze(tflite_output[0]), rtol=1e-5, atol=1e-5) + temp.remove() + +####################################################################### +# Inception V3 +# ------------ + +def test_forward_inception_v3_net(): + '''test inception v3 tflite model''' + # InceptionV3 + temp = util.tempdir() + tflite_model_file = tf_testing.get_workload_official( + "https://storage.googleapis.com/download.tensorflow.org/models/tflite/model_zoo/upload_20180427/inception_v3_2018_04_27.tgz", + "inception_v3.tflite", temp) + with open(tflite_model_file, "rb") as f: + tflite_model_buf = f.read() + data = np.random.uniform(size=(1, 299, 299, 3)).astype('float32') + tvm_data = np.transpose(data, axes=(0, 3, 1, 2)) + tflite_output = run_tflite_graph(tflite_model_buf, data) + tvm_output = run_tvm_graph(tflite_model_buf, tvm_data, 'input') + tvm.testing.assert_allclose(np.squeeze(tvm_output[0]), np.squeeze(tflite_output[0]), + rtol=1e-5, atol=1e-5) + temp.remove() ####################################################################### # Main # ---- if __name__ == '__main__': # Transforms + test_forward_concatenation() test_forward_reshape() test_forward_squeeze() @@ -370,3 +442,4 @@ def test_forward_mobilenet(): # End to End test_forward_mobilenet() + test_forward_inception_v3_net() diff --git a/tests/python/relay/test_adt.py b/tests/python/relay/test_adt.py index 5acae6c70295..a67ee7542e8a 100644 --- a/tests/python/relay/test_adt.py +++ b/tests/python/relay/test_adt.py @@ -43,6 +43,9 @@ tmap = p.tmap size = p.size +compose = p.compose +iterate = p.iterate + # this is an example of using the adt value in python side def count(n): assert isinstance(n, ConstructorValue) @@ -93,6 +96,7 @@ def tree_to_dict(t): def test_nat_value(): assert count(make_nat(10)) == 10 + assert 
count(intrp.evaluate(s(s(z())))) == 2 def test_nat_constructor(): @@ -577,6 +581,17 @@ def test_nested_pattern_match(): assert count(res) == 2 +def test_compose(): + n = relay.Var('n') + inc = relay.Function([n], s(n)) + x = relay.Var('x') + res = intrp.evaluate(relay.Call(compose(inc, double), [s(s(z()))])) + assert count(res) == 5 + +def test_iterate(): + expr = relay.Call(iterate(double, build_nat(2)), [build_nat(3)]) + res = intrp.evaluate(relay.Function([], expr)()) + assert count(res) == 12 if __name__ == "__main__": test_nat_constructor() @@ -598,3 +613,5 @@ def test_nested_pattern_match(): test_sum() test_tmap() test_size() + test_compose() + test_iterate() diff --git a/tests/python/relay/test_backend_interpreter.py b/tests/python/relay/test_backend_interpreter.py index 801b3068eff0..773af1f9fe0e 100644 --- a/tests/python/relay/test_backend_interpreter.py +++ b/tests/python/relay/test_backend_interpreter.py @@ -2,7 +2,7 @@ import tvm import tvm.testing from tvm import relay -from tvm.relay.backend.interpreter import Value, TupleValue +from tvm.relay.backend.interpreter import Value, TupleValue, TensorValue from tvm.relay.scope_builder import ScopeBuilder from tvm.relay import testing, create_executor @@ -135,6 +135,11 @@ def test_binds(): tvm.testing.assert_allclose(xx + xx, res) +def test_tensor_value(): + x = relay.var("x", shape=(1, 10)) + xx = np.ones((1, 10)).astype("float32") + check_eval(relay.Function([x], x), [TensorValue(xx)], xx) + def test_kwargs_params(): x = relay.var("x", shape=(1, 10)) y = relay.var("y", shape=(1, 10)) @@ -159,3 +164,4 @@ def test_kwargs_params(): test_binds() test_kwargs_params() test_ref() + test_tensor_value() diff --git a/tests/python/relay/test_ir_well_formed.py b/tests/python/relay/test_ir_well_formed.py index 725b2fbd3c3d..b9e907144785 100644 --- a/tests/python/relay/test_ir_well_formed.py +++ b/tests/python/relay/test_ir_well_formed.py @@ -1,9 +1,10 @@ import tvm from tvm import relay from tvm.relay.ir_pass import well_formed +from tvm.relay.prelude import Prelude -def test_well_formed(): - x = relay.Var('x') +def test_let(): + x = relay.Var("x") assert well_formed(x) v = relay.Constant(tvm.nd.array(10)) ty = None @@ -18,7 +19,7 @@ def test_well_formed(): def test_tuple(): - x = relay.Var('x') + x = relay.Var("x") assert well_formed(x) v = relay.Constant(tvm.nd.array(10)) let = relay.Let(x, v, x) @@ -28,5 +29,23 @@ def test_tuple(): def test_tuple_get_item(): - t = relay.Var('t') + t = relay.Var("t") assert well_formed(relay.TupleGetItem(t, 2)) + + +def test_adt(): + mod = relay.Module() + p = Prelude(mod) + x = relay.Var("x") + s_case = relay.Clause(relay.PatternConstructor(p.s, [relay.PatternVar(x)]), x) + default_case = relay.Clause(relay.PatternVar(x), x) + m0 = relay.Match(p.z(), [default_case]) + m1 = relay.Match(p.z(), [s_case, default_case]) + assert well_formed(m0) + assert not well_formed(m1) + +if __name__ == "__main__": + test_let() + test_tuple() + test_tuple_get_item() + test_adt() diff --git a/tests/python/relay/test_op_grad_level1.py b/tests/python/relay/test_op_grad_level1.py new file mode 100644 index 000000000000..a9d91f757407 --- /dev/null +++ b/tests/python/relay/test_op_grad_level1.py @@ -0,0 +1,76 @@ +import tvm +import numpy as np +from tvm import relay +from tvm.relay.ir_pass import gradient, infer_type +from tvm.relay.testing import ctx_list + +def sigmoid(x): + one = np.ones_like(x) + return one / (one + np.exp(-x)) + +def relu(x): + x_copy = np.copy(x) + np.maximum(x_copy, 0, x_copy) + return x_copy + +def 
test_unary_op(): + def check_single_op(opfunc, ref): + shape = (10, 4) + dtype = 'float32' + tp = relay.TensorType(shape, dtype) + x = relay.var("x", tp) + y = opfunc(x) + + if ref is not None: + data = np.random.rand(*shape).astype(dtype) + ref_grad = ref(data) + fwd_func = relay.Function([x], y) + bwd_func = infer_type(gradient(fwd_func)) + + for target, ctx in ctx_list(): + intrp = relay.create_executor(ctx=ctx, target=target) + op_res, (op_grad, ) = intrp.evaluate(bwd_func)(data) + np.testing.assert_allclose(op_grad.asnumpy(), ref_grad, rtol=0.01) + + for opfunc, ref in [(tvm.relay.log, lambda x: 1 / x), + (tvm.relay.exp, np.exp), + (tvm.relay.sigmoid, lambda x: sigmoid(x) * (1 - sigmoid(x))), + (tvm.relay.tanh, lambda x: 1 - np.tanh(x) * np.tanh(x)), + (tvm.relay.sqrt, lambda x: 0.5 * np.power(x, -0.5)), + (relay.nn.relu, lambda x: np.where(x < 0, np.zeros_like(x), np.ones_like(x)))]: + check_single_op(opfunc, ref) + + +def test_binary_op(): + def inst(vars, sh): + return [vars.get(s, s) for s in sh] + + def check_binary_op(opfunc, ref): + s = (5, 10, 5) + t = relay.TensorType((5, 10, 5)) + x = relay.var("x", t) + y = relay.var("y", t) + z = opfunc(x, y) + + x_data = np.random.rand(*s).astype(t.dtype) + y_data = np.random.rand(*s).astype(t.dtype) + ref_grad0, ref_grad1 = ref(x_data, y_data) + fwd_func = relay.Function([x, y], z) + bwd_func = infer_type(gradient(fwd_func)) + + for target, ctx in ctx_list(): + intrp = relay.create_executor(ctx=ctx, target=target) + op_res, (op_grad0, op_grad1) = intrp.evaluate(bwd_func)(x_data, y_data) + np.testing.assert_allclose(op_grad0.asnumpy(), ref_grad0, rtol=0.01) + np.testing.assert_allclose(op_grad1.asnumpy(), ref_grad1, rtol=0.01) + + for opfunc, ref in [(relay.add, lambda x, y: [np.ones_like(x), np.ones_like(y)]), + (relay.subtract, lambda x, y: [np.ones_like(x), -np.ones_like(y)]), + (relay.multiply, lambda x, y: [y, x]), + (relay.divide, lambda x, y: [1 / y, - x / (y**2)])]: + check_binary_op(opfunc, ref) + + +if __name__ == "__main__": + test_unary_op() + test_binary_op() diff --git a/tests/python/relay/test_op_level1.py b/tests/python/relay/test_op_level1.py index 6a1662b65170..b954e42bf1ab 100644 --- a/tests/python/relay/test_op_level1.py +++ b/tests/python/relay/test_op_level1.py @@ -39,11 +39,11 @@ def check_single_op(opfunc, ref): for opfunc, ref in [(tvm.relay.log, np.log), - (tvm.relay.exp, np.exp), - (tvm.relay.sqrt, np.sqrt), - (tvm.relay.sigmoid, sigmoid), - (tvm.relay.tanh, np.tanh), - (relay.nn.relu, relu)]: + (tvm.relay.exp, np.exp), + (tvm.relay.sqrt, np.sqrt), + (tvm.relay.sigmoid, sigmoid), + (tvm.relay.tanh, np.tanh), + (relay.nn.relu, relu)]: check_single_op(opfunc, ref) @@ -84,9 +84,9 @@ def check_binary_op(opfunc, ref): np.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=0.01) for opfunc, ref in [(relay.add, np.add), - (relay.subtract, np.subtract), - (relay.multiply, np.multiply), - (relay.divide, np.divide)]: + (relay.subtract, np.subtract), + (relay.multiply, np.multiply), + (relay.divide, np.divide)]: check_binary_op(opfunc, ref) @@ -306,7 +306,6 @@ def test_dense(): tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-5) - if __name__ == "__main__": test_concatenate() test_bias_add() diff --git a/tests/python/relay/test_op_level10.py b/tests/python/relay/test_op_level10.py index a6e169e23a6c..7237cfbc3b87 100644 --- a/tests/python/relay/test_op_level10.py +++ b/tests/python/relay/test_op_level10.py @@ -2,8 +2,11 @@ """ import numpy as np import tvm +import topi.testing from tvm import relay from 
tvm.relay.testing import ctx_list +import topi +import topi.testing def test_collapse_sum_like(): shape = (3, 4, 5, 6) @@ -126,7 +129,6 @@ def verify_reverse_reshape(shape, newshape, oshape): x = relay.var("x", relay.TensorType(shape, "float32")) z = relay.reverse_reshape(x, newshape=newshape) zz = relay.ir_pass.infer_type(z) - print(zz.checked_type) assert "newshape=" in z.astext() assert zz.checked_type == relay.ty.TensorType(oshape, "float32") @@ -144,8 +146,41 @@ def verify_reverse_reshape(shape, newshape, oshape): verify_reverse_reshape((2, 3, 4), (-1, 0), (6, 4)) verify_reverse_reshape((2, 3, 4), (0, -3), (2, 12)) +def verify_batch_matmul(x_shape, y_shape, out_shape, dtype="float32"): + x = relay.var("x", relay.TensorType(x_shape, dtype)) + y = relay.var("y", relay.TensorType(y_shape, dtype)) + z = relay.nn.batch_matmul(x, y) + zz = relay.ir_pass.infer_type(z) + assert zz.checked_type == relay.ty.TensorType(out_shape, dtype) + + func = relay.Function([x, y], z) + x_np = np.random.uniform(size=x_shape).astype(dtype) + y_np = np.random.uniform(size=y_shape).astype(dtype) + z_np = topi.testing.batch_matmul(x_np, y_np) + + for target, ctx in ctx_list(): + for kind in ["graph", "debug"]: + intrp = relay.create_executor(kind, ctx=ctx, target=target) + z = intrp.evaluate(func)(x_np, y_np) + tvm.testing.assert_allclose(z.asnumpy(), z_np, rtol=1e-5) + +def test_batch_matmul(): + b, m, n, k = tvm.var("b"), tvm.var("m"), tvm.var("n"), tvm.var("k") + x = relay.var("x", relay.TensorType((b, m, k), "float32")) + y = relay.var("y", relay.TensorType((b, n, k), "float32")) + z = relay.nn.batch_matmul(x, y) + zz = relay.ir_pass.infer_type(z) + assert zz.checked_type == relay.TensorType((b, m, n), "float32") + + verify_batch_matmul((1, 16, 32), (1, 16, 32), (1, 16, 16)) + verify_batch_matmul((5, 16, 32), (5, 16, 32), (5, 16, 16)) + verify_batch_matmul((5, 16, 32), (5, 20, 32), (5, 16, 20)) + verify_batch_matmul((30, 16, 32), (30, 20, 32), (30, 16, 20)) + + if __name__ == "__main__": test_collapse_sum_like() test_broadcast_to_like() test_slice_like() test_reverse_reshape() + test_batch_matmul() diff --git a/tests/python/relay/test_op_level3.py b/tests/python/relay/test_op_level3.py index 550637023d43..e762c7d3a1a0 100644 --- a/tests/python/relay/test_op_level3.py +++ b/tests/python/relay/test_op_level3.py @@ -457,6 +457,40 @@ def test_infer_type_prelu(): verify_infer_type_prelu((1, 3, 2, 2), None, 1, (1, 3, 2, 2)) verify_infer_type_prelu((1, 2, 2, 3), None, 3, (1, 2, 2, 3)) + +def test_arange(): + def verify_arange(start, stop, step): + dtype = "float32" + if start is None and step is None: + x = relay.arange(stop) + ref_res = np.arange(stop) + elif start is None: + x = relay.arange(stop, step=step) + ref_res = np.arange(stop, step=step) + elif step is None: + x = relay.arange(start, stop) + ref_res = np.arange(start, stop) + else: + x = relay.arange(start, stop, step) + ref_res = np.arange(start, stop, step) + + func = relay.Function([], x) + for target, ctx in ctx_list(): + for kind in ["graph", "debug"]: + intrp = relay.create_executor(kind, ctx=ctx, target=target) + op_res = intrp.evaluate(func)() + tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5) + verify_arange(None, 20, None) + verify_arange(None, 20, 2) + verify_arange(1, 20, None) + verify_arange(1, 20, 2) + verify_arange(1, 20, 1.5) + verify_arange(1, 20.5, None) + verify_arange(1, 20, 3) + verify_arange(20, 1, -1) + verify_arange(20, 1, -1.5) + + if __name__ == "__main__": test_cast() test_zeros_ones() @@ -480,3 +514,4 @@ def 
test_infer_type_prelu(): test_squeeze_infer_type() test_squeeze_bad_axes_infer_type() test_split_infer_type() + test_arange() diff --git a/tests/python/relay/test_op_level5.py b/tests/python/relay/test_op_level5.py index 1d91d92a6abc..eceedc760d4b 100644 --- a/tests/python/relay/test_op_level5.py +++ b/tests/python/relay/test_op_level5.py @@ -7,7 +7,6 @@ from tvm.relay.testing import ctx_list import topi.testing - def test_resize_infer_type(): n, c, h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), tvm.var("w") x = relay.var("x", relay.TensorType((n, c, h, w), "int8")) @@ -136,56 +135,107 @@ def verify_multibox_prior(x, dshape, ref_res, sizes=(1.0,), verify_multibox_prior(x, dshape, ref_res, clip=False, check_type_only=True) -def test_nms(): - def verify_nms(x0_data, x1_data, dshape, ref_res, valid_count, - overlap_threshold=0.5, force_suppress=False, topk=-1, +def test_get_valid_counts(): + def verify_get_valid_counts(dshape, score_threshold): + dtype = "float32" + batch_size, num_anchor, elem_length = dshape + np_data = np.random.uniform(size=dshape).astype(dtype) + np_out1 = np.zeros(shape=(batch_size,)) + np_out2 = np.zeros(shape=dshape).astype(dtype) + for i in range(batch_size): + np_out1[i] = 0 + inter_idx = 0 + for j in range(num_anchor): + score = np_data[i, j, 1] + if score >= score_threshold: + for k in range(elem_length): + np_out2[i, inter_idx, k] = np_data[i, j, k] + np_out1[i] += 1 + inter_idx += 1 + if j >= np_out1[i]: + for k in range(elem_length): + np_out2[i, j, k] = -1 + + x = relay.var("x", relay.ty.TensorType(dshape, dtype)) + z = relay.vision.get_valid_counts(x, score_threshold) + assert "score_threshold" in z.astext() + func = relay.Function([x], z.astuple()) + func = relay.ir_pass.infer_type(func) + ctx_list = [("llvm", tvm.cpu(0))] + for target, ctx in ctx_list: + intrp = relay.create_executor("debug", ctx=ctx, target=target) + out = intrp.evaluate(func)(np_data) + tvm.testing.assert_allclose(out[0].asnumpy(), np_out1, rtol=1e-3) + tvm.testing.assert_allclose(out[1].asnumpy(), np_out2, rtol=1e-3) + + verify_get_valid_counts((1, 2500, 6), 0) + verify_get_valid_counts((1, 2500, 6), -1) + verify_get_valid_counts((3, 1000, 6), 0.55) + verify_get_valid_counts((16, 500, 6), 0.95) + + +def test_non_max_suppression(): + def verify_nms(x0_data, x1_data, dshape, ref_res, ref_indices_res, + iou_threshold=0.5, force_suppress=False, top_k=-1, check_type_only=False): x0 = relay.var("x0", relay.ty.TensorType(dshape, "float32")) x1 = relay.var("x1", relay.ty.TensorType((dshape[0],), "int")) - z = relay.vision.nms(x0, x1, overlap_threshold, force_suppress, topk) - assert "overlap_threshold" in z.astext() + z = relay.vision.non_max_suppression(x0, x1, -1, iou_threshold, force_suppress, top_k, return_indices=False) + z_indices = relay.vision.non_max_suppression(x0, x1, -1, iou_threshold, force_suppress, top_k) + assert "iou_threshold" in z.astext() + assert "iou_threshold" in z_indices.astext() zz = relay.ir_pass.infer_type(z) + zz_indices = relay.ir_pass.infer_type(z_indices) assert zz.checked_type == relay.ty.TensorType(dshape, "float32") + assert zz_indices.checked_type == relay.ty.TensorType((dshape[0], dshape[1]), "int32") if check_type_only: return func = relay.Function([x0, x1], z) func = relay.ir_pass.infer_type(func) + func_indices = relay.Function([x0, x1], z_indices) + func_indices = relay.ir_pass.infer_type(func_indices) ctx_list = [("llvm", tvm.cpu(0))] for target, ctx in ctx_list: intrp1 = relay.create_executor("graph", ctx=ctx, target=target) op_res1 = 
intrp1.evaluate(func)(x0_data, x1_data) + op_indices_res1 = intrp1.evaluate(func_indices)(x0_data, x1_data) tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5) + tvm.testing.assert_allclose(op_indices_res1.asnumpy(), ref_indices_res, rtol=1e-5) intrp2 = relay.create_executor("debug", ctx=ctx, target=target) op_res2 = intrp2.evaluate(func)(x0_data, x1_data) + op_indices_res2 = intrp2.evaluate(func_indices)(x0_data, x1_data) tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-5) + tvm.testing.assert_allclose(op_indices_res2.asnumpy(), ref_indices_res, rtol=1e-5) np_data = np.array([[[0, 0.8, 1, 20, 25, 45], [1, 0.7, 30, 60, 50, 80], [0, 0.4, 4, 21, 19, 40], [2, 0.9, 35, 61, 52, 79], [1, 0.5, 100, 60, 70, 110]]]).astype("float32") np_valid_count = np.array([4]).astype("int32") np_result = np.array([[[2, 0.9, 35, 61, 52, 79], [0, 0.8, 1, 20, 25, 45], - [0, 0.4, 4, 21, 19, 40], [-1, 0.9, 35, 61, 52, 79], + [-1, -1, -1, -1, -1, -1], [-1, -1, -1, -1, -1, -1], [-1, -1, -1, -1, -1, -1]]]) + np_indices_result = np.array([[3, 0, -1, -1, -1]]) num_anchors = 5 dshape = (tvm.var("n"), num_anchors, 6) - verify_nms(np_data, np_valid_count, dshape, np_result, dshape[0], - force_suppress=True, topk=2, check_type_only=True) + verify_nms(np_data, np_valid_count, dshape, np_result, np_indices_result, + force_suppress=True, top_k=2, check_type_only=True) dshape = (1, num_anchors, 6) - verify_nms(np_data, np_valid_count, dshape, np_result, dshape[0], - force_suppress=True, topk=2, check_type_only=False) + verify_nms(np_data, np_valid_count, dshape, np_result, np_indices_result, + force_suppress=True, top_k=2, check_type_only=False) np_result = np.array([[[2, 0.9, 35, 61, 52, 79], [0, 0.8, 1, 20, 25, 45], - [1, 0.7, 30, 60, 50, 80], [-1, 0.9, 35, 61, 52, 79], + [1, 0.7, 30, 60, 50, 80], [-1, -1, -1, -1, -1, -1], [-1, -1, -1, -1, -1, -1]]]) + np_indices_result = np.array([[3, 0, 1, -1, -1]]) dshape = (tvm.var("n"), num_anchors, 6) - verify_nms(np_data, np_valid_count, dshape, np_result, dshape[0], - check_type_only=True) + verify_nms(np_data, np_valid_count, dshape, np_result, + np_indices_result, check_type_only=True) dshape = (1, num_anchors, 6) - verify_nms(np_data, np_valid_count, dshape, np_result, dshape[0], - topk=3) + verify_nms(np_data, np_valid_count, dshape, np_result, + np_indices_result, top_k=3) def test_multibox_transform_loc(): @@ -227,7 +277,7 @@ def test_default_value(): assert ret.checked_type == ref_type - nms = relay.vision.nms(mtl[0], mtl[1]) + nms = relay.vision.non_max_suppression(mtl[0], mtl[1], return_indices=False) func = relay.Function([cls_prob, loc_pred, anchors], nms) func = relay.ir_pass.infer_type(func) ctx_list = [("llvm", tvm.cpu(0))] @@ -307,10 +357,114 @@ def verify_roi_align(data_shape, rois_shape, pooled_size, spatial_scale, sample_ verify_roi_align((4, 4, 16, 16), (32, 5), pooled_size=7, spatial_scale=0.5, sample_ratio=2) +def test_proposal(): + def verify_proposal(np_cls_prob, np_bbox_pred, np_im_info, np_out, attrs): + cls_prob = relay.var("cls_prob", relay.ty.TensorType(np_cls_prob.shape, "float32")) + bbox_pred = relay.var("bbox_pred", relay.ty.TensorType(np_bbox_pred.shape, "float32")) + im_info = relay.var("im_info", relay.ty.TensorType(np_im_info.shape, "float32")) + z = relay.vision.proposal(cls_prob, bbox_pred, im_info, **attrs) + zz = relay.ir_pass.infer_type(z) + + assert zz.checked_type == relay.ty.TensorType(np_out.shape, "float32") + + func = relay.Function([cls_prob, bbox_pred, im_info], z) + func = relay.ir_pass.infer_type(func) + 
for target in ['cuda']: + if not tvm.module.enabled(target): + print("Skip test because %s is not enabled." % target) + continue + ctx = tvm.context(target, 0) + intrp1 = relay.create_executor("graph", ctx=ctx, target=target) + op_res1 = intrp1.evaluate(func)(np_cls_prob, np_bbox_pred, np_im_info) + tvm.testing.assert_allclose(op_res1.asnumpy(), np_out, rtol=1e-4) + intrp2 = relay.create_executor("debug", ctx=ctx, target=target) + op_res2 = intrp2.evaluate(func)(np_cls_prob, np_bbox_pred, np_im_info) + tvm.testing.assert_allclose(op_res2.asnumpy(), np_out, rtol=1e-4) + + attrs = { + 'scales': (0.5,), + 'ratios': (0.5,), + 'feature_stride': 16, + 'iou_loss': False, + 'rpn_min_size': 16, + 'threshold': 0.7, + 'rpn_pre_nms_top_n': 200, + 'rpn_post_nms_top_n': 4, + } + + np_cls_prob = np.array([[ + [[0.3, 0.6, 0.2], [0.4, 0.7, 0.5], [0.1, 0.4, 0.3]], + [[0.7, 0.5, 0.3], [0.6, 0.4, 0.8], [0.9, 0.2, 0.5]] + ]], dtype='float32') + np_bbox_pred = np.array([[ + [[0.5, 1.0, 0.6], [0.8, 1.2, 2.0], [0.9, 1.0, 0.8]], + [[0.5, 1.0, 0.7], [0.8, 1.2, 1.6], [2.1, 1.5, 0.7]], + [[1.0, 0.5, 0.7], [1.5, 0.9, 1.6], [1.4, 1.5, 0.8]], + [[1.0, 0.5, 0.6], [1.5, 0.9, 2.0], [1.8, 1.0, 0.9]], + ]], dtype='float32') + np_im_info = np.array([[48., 48., 1.]], dtype='float32') + np_out = np.array([ + [0., 0., 2.8451548,28.38012, 18.154846], + [0., 0., 15.354933, 41.96971, 41.245064], + [0., 18.019852, 1.0538368, 51.98015, 25.946163], + [0., 27.320923, -1.266357, 55., 24.666357] + ], dtype='float32') + + + verify_proposal(np_cls_prob, np_bbox_pred, np_im_info, np_out, attrs) + + np_out = np.array([ + [ 0., -5.25, -2.5, 21.75, 19.], + [ 0., 11.25, -2., 37.25, 18.5], + [ 0., 26.849998, -2.3000002, 53.45, 18.6], + [ 0., -4.95, 13.799999, 22.25, 35.5] + ], dtype='float32') + attrs['iou_loss'] = True + verify_proposal(np_cls_prob, np_bbox_pred, np_im_info, np_out, attrs) + + +def test_yolo_reorg_infer_shape(): + def verify_yolo_reorg(shape, stride, out_shape): + x = relay.var("x", relay.TensorType(shape, "float32")) + z = relay.vision.yolo_reorg(x, stride=stride) + zz = relay.ir_pass.infer_type(z) + assert "stride=" in z.astext() + assert zz.checked_type == relay.ty.TensorType(out_shape, "float32") + + n, c, h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), tvm.var("w") + verify_yolo_reorg((n, c, 20, 20), 10, (n, c*10*10, 2, 2)) + verify_yolo_reorg((n, c, h, w), 2, (n, c*2*2, h/2, w/2)) + +def test_yolo_reorg(): + def verify_yolo_reorg(shape, stride): + x_data = np.random.uniform(low=-1, high=1, size=shape).astype("float32") + ref_res = topi.testing.reorg_python(x_data, stride) + + x = relay.var("x", relay.TensorType(shape, "float32")) + z = relay.vision.yolo_reorg(x, stride=stride) + zz = relay.ir_pass.infer_type(z) + assert "stride=" in z.astext() + assert zz.checked_type == relay.ty.TensorType(ref_res.shape, "float32") + + func = relay.Function([x], z) + + for target, ctx in ctx_list(): + for kind in ["graph", "debug"]: + intrp = relay.create_executor(kind, ctx=ctx, target=target) + op_res = intrp.evaluate(func)(x_data) + tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5) + + verify_yolo_reorg((1, 100, 20, 20), 10) + verify_yolo_reorg((1, 4, 6, 6), 2) + if __name__ == "__main__": test_resize_infer_type() test_resize() test_multibox_prior() test_multibox_transform_loc() - test_nms() + test_get_valid_counts() test_roi_align() + test_proposal() + test_yolo_reorg_infer_shape() + test_yolo_reorg() + test_non_max_suppression() diff --git a/tests/python/relay/test_param_dict.py 
b/tests/python/relay/test_param_dict.py new file mode 100644 index 000000000000..b398ea8ba2f5 --- /dev/null +++ b/tests/python/relay/test_param_dict.py @@ -0,0 +1,78 @@ +import os +import numpy as np +import tvm +import json +import base64 +from tvm._ffi.base import py_str +from tvm.relay.op import add +from tvm import relay +from tvm import rpc +from tvm.contrib import util, graph_runtime + + +def test_save_load(): + x = np.ones((10, 2)).astype("float32") + y = np.ones((1, 2, 3)).astype("float32") + params = {"x": x, "y": y} + param_bytes = relay.save_param_dict(params) + assert isinstance(param_bytes, bytearray) + param2 = relay.load_param_dict(param_bytes) + assert len(param2) == 2 + np.testing.assert_equal(param2["x"].asnumpy(), x) + np.testing.assert_equal(param2["y"].asnumpy(), y) + + +def test_ndarray_reflection(): + # Make two `NDArrayWrapper`s that point to the same underlying array. + np_array = np.random.uniform(size=(10, 2)).astype("float32") + tvm_array = tvm.nd.array(np_array) + param_dict = {'x': tvm_array, 'y': tvm_array} + assert param_dict['x'].same_as(param_dict['y']) + # Serialize then deserialize `param_dict`. + deser_param_dict = relay.load_param_dict(relay.save_param_dict(param_dict)) + # Make sure the data matches the original data and `x` and `y` contain the same data. + np.testing.assert_equal(deser_param_dict['x'].asnumpy(), tvm_array.asnumpy()) + # Make sure `x` and `y` contain the same data. + np.testing.assert_equal(deser_param_dict['x'].asnumpy(), deser_param_dict['y'].asnumpy()) + + +def test_bigendian_rpc_param(): + """Test big endian rpc when there is a PowerPC RPC server available""" + host = os.environ.get("TVM_POWERPC_TEST_HOST", None) + port = os.environ.get("TVM_POWERPC_TEST_PORT", 9090) + if host is None: + return + + def verify_graph_runtime(remote, target, shape, dtype): + x = relay.var('x') + y = relay.const(1) + z = relay.add(x, y) + func = relay.Function([x], z) + + x_in = np.ones(shape).astype(dtype) + params = {'x': x_in} + graph, lib, params = relay.build(func, target=target, params=params) + + temp = util.tempdir() + path_dso = temp.relpath("dev_lib.o") + lib.save(path_dso) + remote.upload(path_dso) + lib = remote.load_module("dev_lib.o") + ctx = remote.cpu(0) + mod = graph_runtime.create(graph, lib, ctx) + mod.load_params(relay.save_param_dict(params)) + mod.run() + out = mod.get_output(0, tvm.nd.empty(shape, dtype=dtype, ctx=ctx)) + tvm.testing.assert_allclose(x_in + 1, out.asnumpy()) + + print("Test RPC connection to PowerPC...") + remote = rpc.connect(host, port) + target = "llvm -mtriple=powerpc-linux-gnu" + for dtype in ["float32", "float64", "int32", "int8"]: + verify_graph_runtime(remote, target, (10,), dtype) + + +if __name__ == "__main__": + test_save_load() + test_ndarray_reflection() + test_bigendian_rpc_param() diff --git a/tests/python/relay/test_pass_eliminate_common_subexpr.py b/tests/python/relay/test_pass_eliminate_common_subexpr.py new file mode 100644 index 000000000000..381a54a3d324 --- /dev/null +++ b/tests/python/relay/test_pass_eliminate_common_subexpr.py @@ -0,0 +1,63 @@ +"""Test eliminate common subexpr pass""" +from tvm import relay +from tvm.relay.op import register_alter_op_layout +from tvm.relay import ir_pass + + +def test_simple(): + def before(): + x = relay.var("x", shape=(1, 16)) + y1 = relay.nn.relu(x) + y2 = relay.nn.relu(x) + y1 = relay.add(y1, relay.const(1.0, "float32")) + y2 = relay.add(y2, relay.const(1.0, "float32")) + y = relay.add(y1, y2) + f = relay.Function([x], y) + return f + + def expected(): 
+ x = relay.var("x", shape=(1, 16)) + y = relay.nn.relu(x) + y = relay.add(y, relay.const(1.0, "float32")) + y = relay.add(y, y) + f = relay.Function([x], y) + return f + + z = before() + z = ir_pass.eliminate_common_subexpr(z) + assert ir_pass.alpha_equal(z, expected()) + + +def test_callback(): + def before(): + x = relay.var("x", shape=(1, 16)) + y1 = relay.nn.relu(x) + y2 = relay.nn.relu(x) + y1 = relay.add(y1, relay.const(1.0, "float32")) + y2 = relay.add(y2, relay.const(1.0, "float32")) + y = relay.add(y1, y2) + f = relay.Function([x], y) + return f + + def expected(): + x = relay.var("x", shape=(1, 16)) + y = relay.nn.relu(x) + y1 = relay.add(y, relay.const(1.0, "float32")) + y2 = relay.add(y, relay.const(1.0, "float32")) + y = relay.add(y1, y2) + f = relay.Function([x], y) + return f + + def fskip(expr): + if isinstance(expr, relay.expr.Call) and expr.op.name == 'add': + return True + return False + + z = before() + z = ir_pass.eliminate_common_subexpr(z, fskip) + assert ir_pass.alpha_equal(z, expected()) + + +if __name__ == "__main__": + test_simple() + test_callback() diff --git a/tests/python/relay/test_pass_fuse_ops.py b/tests/python/relay/test_pass_fuse_ops.py index 1d926a325b1a..634d69bae823 100644 --- a/tests/python/relay/test_pass_fuse_ops.py +++ b/tests/python/relay/test_pass_fuse_ops.py @@ -251,6 +251,42 @@ def expected(dshape): assert relay.ir_pass.alpha_equal(z, after) +def test_fuse_myia_regression(): + def before(dshape, dtype): + x = relay.var('x', shape=dshape, dtype=dtype) + y = relay.var('y', shape=dshape, dtype=dtype) + sb = relay.ScopeBuilder() + with sb.if_scope(relay.op.greater(x, y)): + sb.ret(relay.Function([], x)) + with sb.else_scope(): + sb.ret(relay.Function([], y)) + return relay.Function([x, y], + relay.Call(sb.get(), [])) + + def expected(dshape, dtype): + x = relay.var('x', shape=dshape, dtype=dtype) + y = relay.var('y', shape=dshape, dtype=dtype) + sb = relay.ScopeBuilder() + p1 = relay.var('p1', shape=dshape, dtype=dtype) + p2 = relay.var('p2', shape=dshape, dtype=dtype) + fused_gt = relay.Function([p1, p2], + relay.op.greater(p1, p2)) + with sb.if_scope(fused_gt(x, y)): + sb.ret(relay.Function([], x)) + with sb.else_scope(): + sb.ret(relay.Function([], y)) + return relay.Function([x, y], + relay.Call(sb.get(), [])) + + dshape = () + dtype = 'int64' + f = before(dshape, dtype) + f = relay.ir_pass.infer_type(f) + f = relay.ir_pass.fuse_ops(f) + after = relay.ir_pass.infer_type(expected(dshape, dtype)) + assert relay.ir_pass.alpha_equal(f, after) + + if __name__ == "__main__": test_fuse_simple() test_conv2d_fuse() @@ -258,3 +294,4 @@ def expected(dshape): test_tuple_root() test_tuple_strided_slice() test_stop_fusion() + test_fuse_myia_regression() diff --git a/tests/python/relay/test_pass_gradient.py b/tests/python/relay/test_pass_gradient.py index 6b5d0e776934..400941f12617 100644 --- a/tests/python/relay/test_pass_gradient.py +++ b/tests/python/relay/test_pass_gradient.py @@ -2,6 +2,7 @@ from tvm import relay from tvm.relay.ir_pass import free_vars, free_type_vars, gradient from tvm.relay import create_executor +from tvm.relay.prelude import Prelude import numpy as np @@ -123,6 +124,72 @@ def test_broadcast_subtract(): -np.ones_like(expected_forward).sum(axis=(0, 1), keepdims=True).squeeze(axis=0)) +def test_tuple(): + shape = (10, 10) + dtype = 'float32' + t = relay.TensorType(shape, dtype) + x = relay.var("x", t) + y = relay.var("y", t) + z = relay.var("z", t) + tup = relay.Var("tup") + func = relay.Function([x, y, z], relay.Let(tup, 
relay.Tuple([x, y, z]), + relay.TupleGetItem(tup, 0) + + relay.TupleGetItem(tup, 1) - + relay.TupleGetItem(tup, 2))) + back_func = relay.ir_pass.infer_type(gradient(func)) + assert back_func.checked_type == relay.FuncType([t, t, t], relay.TupleType([t, relay.TupleType([t, t, t])])) + x_nd = rand(dtype, *shape) + y_nd = rand(dtype, *shape) + z_nd = rand(dtype, *shape) + x_np = x_nd.asnumpy() + y_np = y_nd.asnumpy() + z_np = z_nd.asnumpy() + expected_forward = x_np + y_np - z_np + ex = create_executor() + forward, (grad_x, grad_y, grad_z) = ex.evaluate(back_func)(x_nd, y_nd, z_nd) + np.testing.assert_allclose(forward.asnumpy(), expected_forward) + np.testing.assert_allclose(grad_x.asnumpy(), np.ones_like(grad_x.asnumpy())) + np.testing.assert_allclose(grad_y.asnumpy(), np.ones_like(grad_y.asnumpy())) + np.testing.assert_allclose(grad_z.asnumpy(), -1 * np.ones_like(grad_z.asnumpy())) + + +def test_pow(): + mod = relay.Module() + p = Prelude(mod) + shape = (10, 10) + dtype = 'float32' + t = relay.TensorType(shape, dtype) + x = relay.var("x", t) + double = relay.Function([x], x + x) + i = relay.var("i", t) + func = relay.Function([i], relay.Call(p.iterate(double, p.s(p.s(p.s(p.z())))), [i])) + back_func = relay.ir_pass.infer_type(gradient(func, mod=mod), mod=mod) + assert back_func.checked_type == relay.FuncType([t], relay.TupleType([t, relay.TupleType([t])])) + i_nd = rand(dtype, *shape) + ex = create_executor(mod=mod) + forward, (grad_i,) = ex.evaluate(back_func)(i_nd) + np.testing.assert_allclose(forward.asnumpy(), 8 * i_nd.asnumpy()) + np.testing.assert_allclose(grad_i.asnumpy(), 8 * np.ones_like(grad_i.asnumpy())) + +def test_ref(): + shape = (10, 10) + dtype = 'float32' + t = relay.TensorType(shape, dtype) + x = relay.var("x", t) + r = relay.Var("r") + u = relay.Var("u") + body = relay.RefRead(r) + body = relay.Let(u, relay.RefWrite(r, relay.RefRead(r) + relay.RefRead(r)), body) + body = relay.Let(r, relay.RefCreate(x), body) + func = relay.Function([x], body) + back_func = relay.ir_pass.infer_type(gradient(func)) + assert back_func.checked_type == relay.FuncType([t], relay.TupleType([t, relay.TupleType([t])])) + x_nd = rand(dtype, *shape) + ex = create_executor() + forward, (grad_x,) = ex.evaluate(back_func)(x_nd) + np.testing.assert_allclose(forward.asnumpy(), 2 * x_nd.asnumpy()) + np.testing.assert_allclose(grad_x.asnumpy(), 2 * np.ones_like(grad_x.asnumpy())) + if __name__ == "__main__": test_id() test_add() @@ -130,3 +197,6 @@ def test_broadcast_subtract(): test_sub() test_broadcast_add() test_broadcast_subtract() + test_tuple() + test_pow() + test_ref() diff --git a/tests/python/relay/test_pass_mac_count.py b/tests/python/relay/test_pass_mac_count.py index 56a0f5490cac..0c0144e246d3 100644 --- a/tests/python/relay/test_pass_mac_count.py +++ b/tests/python/relay/test_pass_mac_count.py @@ -1,7 +1,6 @@ """Unit tests for MAC counter.""" import tvm from tvm import relay -import sys def test_gemm(): n = 512 diff --git a/tests/python/relay/test_pass_quantize.py b/tests/python/relay/test_pass_quantize.py index 6d65d7b2d9ee..2e2389d16244 100644 --- a/tests/python/relay/test_pass_quantize.py +++ b/tests/python/relay/test_pass_quantize.py @@ -75,7 +75,7 @@ def make_qgraph(data, weight): graph = relay.create_executor('graph') res0 = graph.evaluate(qgraph0)(dataset[0]['data']) res1 = graph.evaluate(qgraph1)(dataset[0]['data']) - tvm.testing.assert_allclose(res0.asnumpy(), res1.asnumpy()) + tvm.testing.assert_allclose(res0.asnumpy(), res1.asnumpy(), rtol=1e-3) if __name__ == "__main__": diff 
--git a/tests/python/relay/test_to_anf.py b/tests/python/relay/test_to_a_normal_form.py similarity index 64% rename from tests/python/relay/test_to_anf.py rename to tests/python/relay/test_to_a_normal_form.py index 5da7e38a81f5..392e1769e57d 100644 --- a/tests/python/relay/test_to_anf.py +++ b/tests/python/relay/test_to_a_normal_form.py @@ -1,9 +1,10 @@ import numpy as np import tvm from tvm import relay -from tvm.relay.ir_pass import to_anf, alpha_equal, infer_type +from tvm.relay.ir_pass import to_a_normal_form, alpha_equal, infer_type from tvm.relay import op, create_executor -from tvm.relay.backend.interpreter import Value, TupleValue +from tvm.relay.backend.interpreter import Value, TupleValue, ConstructorValue +from tvm.relay.prelude import Prelude def check_eval(expr, expected_result, mod=None, rtol=1e-07): @@ -20,7 +21,7 @@ def test_explicit_bound(): z = op.add(y, y) f = relay.Function([], op.add(z, z)) assert not "let" in f.astext() # assert the values are implicitly bounded - anf = to_anf(f) + anf = to_a_normal_form(f) assert "let" in anf.astext() # assert the values are explicitly bounded check_eval(f(), 8.0) check_eval(anf(), 8.0) @@ -34,7 +35,7 @@ def test_order(): x = relay.const(1) val = x + y * z check_eval(val, 7.0) - anf = infer_type(to_anf(val)) + anf = infer_type(to_a_normal_form(val)) a = relay.Var('a', relay.IncompleteType()) b = relay.Var('b', relay.IncompleteType()) c = relay.Var('c', relay.IncompleteType()) @@ -53,7 +54,7 @@ def test_if(): cond = relay.const(True) x = relay.If(cond, relay.const(2), relay.const(3)) - anf = infer_type(to_anf(x)) + anf = infer_type(to_a_normal_form(x)) a = relay.Var('a', relay.IncompleteType()) b = relay.Var('b', relay.IncompleteType()) c = relay.Var('c', relay.IncompleteType()) @@ -95,12 +96,62 @@ def test_recursion(): mod[f] = value check_eval(f(relay.const(5, 'int64')), 30.0, mod=mod) old_f = mod[f] - f = to_anf(f, mod=mod) + f = to_a_normal_form(f, mod=mod) check_eval(f(relay.const(5, 'int64')), 30.0, mod=mod) +def test_ref(): + i = relay.Var('i') + iv = relay.Var('iv') + u = relay.Var('u') + uv = relay.Var('uv') + body = relay.add(iv, uv) + body = relay.Let(uv, relay.RefRead(i), body) + body = relay.Let(u, relay.RefWrite(i, relay.const(2)), body) + body = relay.Let(iv, relay.RefRead(i), body) + body = relay.Let(i, relay.RefCreate(relay.const(1)), body) + check_eval(body, 3) + check_eval(to_a_normal_form(body), 3) + + +# this is an example of using an ADT value on the Python side +def count(n): + assert isinstance(n, ConstructorValue) + if n.constructor.name_hint == 's': + return 1 + count(n.fields[0]) + else: + assert n.constructor.name_hint == 'z' + return 0 + + +def test_add(): + mod = relay.Module() + p = Prelude(mod) + nat = p.nat + add = p.add + s = p.s + z = p.z + ctx = tvm.context("llvm", 0) + intrp = create_executor(mod=mod, ctx=ctx, target="llvm") + assert mod[add].checked_type == relay.FuncType([nat(), nat()], nat()) + assert count(intrp.evaluate(add(s(z()), s(z())))) == 2 + assert count(intrp.evaluate(to_a_normal_form(add(s(z()), s(z())), mod))) == 2 + assert "let" in mod[add].astext() + +def test_let(): + x = relay.Var("x") + y = relay.Var("y") + d = relay.const(4.0, 'float32') + body = relay.Let(y, x, x + y) + body = relay.Let(x, d, body) + check_eval(body, 8) + check_eval(to_a_normal_form(body), 8) + if __name__ == '__main__': test_explicit_bound() test_order() test_if() test_recursion() + test_ref() + test_add() + test_let() diff --git a/tests/python/relay/test_to_graph_normal_form.py 
b/tests/python/relay/test_to_graph_normal_form.py new file mode 100644 index 000000000000..ac86799b6b8c --- /dev/null +++ b/tests/python/relay/test_to_graph_normal_form.py @@ -0,0 +1,51 @@ +import numpy as np +import tvm +from tvm import relay +from tvm.relay.ir_pass import to_graph_normal_form, to_a_normal_form, alpha_equal +from tvm.relay import op, create_executor +from tvm.relay.backend.interpreter import Value, TupleValue + + +def check_eval(expr, args, expected_result, mod=None, rtol=1e-07): + if mod is None: + mod = relay.Module() + + ctx = tvm.context("llvm", 0) + intrp = create_executor(mod=mod, ctx=ctx, target="llvm") + + result = intrp.evaluate(expr)(*args) + np.testing.assert_allclose(result.asnumpy(), expected_result, rtol=rtol) + + +def test_implicit_share(): + x = relay.Var('x') + y = relay.Var('y') + z = relay.Var('z') + body = relay.Let(z, op.add(y, y), op.add(z, z)) + body = relay.Let(y, op.add(x, x), body) + f = relay.Function([], relay.Let(x, relay.const(1), body)) + g = to_graph_normal_form(f) + assert "let" in f.astext() + assert not "let" in g.astext() + check_eval(f, [], 8.0) + check_eval(g, [], 8.0) + + +def test_round_trip(): + x = relay.Var('x') + y = relay.Var('y') + z = relay.Var('z') + body = relay.Let(z, op.add(y, y), op.add(z, z)) + body = relay.Let(y, op.add(x, x), body) + f = relay.Function([], relay.Let(x, relay.const(1), body)) + g = to_graph_normal_form(f) + h = to_a_normal_form(g) + assert "let" in f.astext() + assert not "let" in g.astext() + check_eval(f, [], 8.0) + check_eval(g, [], 8.0) + check_eval(h, [], 8.0) + +if __name__ == '__main__': + test_implicit_share() + test_round_trip() diff --git a/tests/python/relay/test_type_infer.py b/tests/python/relay/test_type_infer.py index 05f8b8fd22f9..8c8e7dfd1fcc 100644 --- a/tests/python/relay/test_type_infer.py +++ b/tests/python/relay/test_type_infer.py @@ -133,6 +133,58 @@ def test_incomplete_call(): assert ft.checked_type == relay.FuncType([tt, f_type], tt) +def test_higher_order_argument(): + a = relay.TypeVar('a') + x = relay.Var('x', a) + id_func = relay.Function([x], x, a, [a]) + + b = relay.TypeVar('b') + f = relay.Var('f', relay.FuncType([b], b)) + y = relay.Var('y', b) + ho_func = relay.Function([f, y], f(y), b, [b]) + + # id func should be an acceptable argument to the higher-order + # function even though id_func takes a type parameter + ho_call = ho_func(id_func, relay.const(0, 'int32')) + + hc = relay.ir_pass.infer_type(ho_call) + expected = relay.scalar_type('int32') + assert hc.checked_type == expected + + +def test_higher_order_return(): + a = relay.TypeVar('a') + x = relay.Var('x', a) + id_func = relay.Function([x], x, a, [a]) + + b = relay.TypeVar('b') + nested_id = relay.Function([], id_func, relay.FuncType([b], b), [b]) + + ft = relay.ir_pass.infer_type(nested_id) + assert ft.checked_type == relay.FuncType([], relay.FuncType([b], b), [b]) + + +def test_higher_order_nested(): + a = relay.TypeVar('a') + x = relay.Var('x', a) + id_func = relay.Function([x], x, a, [a]) + + choice_t = relay.FuncType([], relay.scalar_type('bool')) + f = relay.Var('f', choice_t) + + b = relay.TypeVar('b') + z = relay.Var('z') + top = relay.Function( + [f], + relay.If(f(), id_func, relay.Function([z], z)), + relay.FuncType([b], b), + [b]) + + expected = relay.FuncType([choice_t], relay.FuncType([b], b), [b]) + ft = relay.ir_pass.infer_type(top) + assert ft.checked_type == expected + + def test_tuple(): tp = relay.TensorType((10,)) x = relay.var("x", tp) diff --git 
a/tests/python/unittest/test_arith_const_int_bound.py b/tests/python/unittest/test_arith_const_int_bound.py new file mode 100644 index 000000000000..968692208f5d --- /dev/null +++ b/tests/python/unittest/test_arith_const_int_bound.py @@ -0,0 +1,219 @@ +import tvm + +def test_dtype_bound(): + analyzer = tvm.arith.Analyzer() + + x = tvm.var("x", dtype="int64") + bd = analyzer.const_int_bound(x) + assert bd.min_value == bd.NEG_INF + assert bd.max_value == bd.POS_INF + + x = tvm.var("x", dtype="int8") + bd = analyzer.const_int_bound(x) + assert bd.min_value == -128 + assert bd.max_value == 127 + + x = tvm.var("x", dtype="uint8") + bd = analyzer.const_int_bound(x) + assert bd.min_value == 0 + assert bd.max_value == 255 + + +def test_cast_bound(): + analyzer = tvm.arith.Analyzer() + x = tvm.var("x", dtype="int8") + bd = analyzer.const_int_bound((x % 3).astype("uint32")) + assert bd.min_value == 0 + assert bd.max_value == 2 + + bd = analyzer.const_int_bound( + (x % 3).astype("float32").astype("int32")) + assert bd.min_value == -2 + assert bd.max_value == 2 + + +def test_add_sub_bound(): + analyzer = tvm.arith.Analyzer() + x, y = tvm.var("x", "int64"), tvm.var("y", "int64") + bd = analyzer.const_int_bound(x + y) + assert bd.min_value == bd.NEG_INF + assert bd.max_value == bd.POS_INF + + analyzer.update(x, tvm.arith.ConstIntBound(0, 4)) + analyzer.update(y, tvm.arith.ConstIntBound(1, 10)) + bd = analyzer.const_int_bound(x + y) + assert bd.min_value == 1 + assert bd.max_value == 14 + + bd = analyzer.const_int_bound(x - y) + assert bd.min_value == -10 + assert bd.max_value == 3 + + analyzer.update(x, tvm.arith.ConstIntBound(0, bd.POS_INF), override=True) + bd = analyzer.const_int_bound(x - y) + assert bd.min_value == -10 + assert bd.max_value == bd.POS_INF + + bd = analyzer.const_int_bound(1 - x) + assert bd.min_value == bd.NEG_INF + assert bd.max_value == 1 + + +def test_mul_bound(): + analyzer = tvm.arith.Analyzer() + x, y = tvm.var("x"), tvm.var("y") + + analyzer.update(x, tvm.arith.ConstIntBound(-2, 4)) + analyzer.update(y, tvm.arith.ConstIntBound(4, 10)) + bd = analyzer.const_int_bound(x * y + 20) + assert bd.min_value == 0 + assert bd.max_value == 60 + + analyzer.update(x, tvm.arith.ConstIntBound(-3, 4), override=True) + analyzer.update(y, tvm.arith.ConstIntBound(-8, 2), override=True) + bd = analyzer.const_int_bound(x * y) + assert bd.min_value == -32 + assert bd.max_value == 24 + + analyzer.update(x, tvm.arith.ConstIntBound(bd.NEG_INF, 4), override=True) + analyzer.update(y, tvm.arith.ConstIntBound(-8, 2), override=True) + bd = analyzer.const_int_bound(x * y) + assert bd.min_value == bd.NEG_INF + assert bd.max_value == bd.POS_INF + + +def test_div_bound(): + analyzer = tvm.arith.Analyzer() + x, y = tvm.var("x"), tvm.var("y") + + analyzer.update(x, tvm.arith.ConstIntBound(-9, 4)) + analyzer.update(y, tvm.arith.ConstIntBound(4, 10)) + bd = analyzer.const_int_bound(x / y) + assert bd.min_value == -2 + + analyzer.update(x, tvm.arith.ConstIntBound(-9, 4), override=True) + analyzer.update(y, tvm.arith.ConstIntBound(-2, 0), override=True) + bd = analyzer.const_int_bound(x / y) + assert bd.min_value == -4 + assert bd.max_value == 9 + + analyzer.update(x, tvm.arith.ConstIntBound(bd.NEG_INF, 4), override=True) + analyzer.update(y, tvm.arith.ConstIntBound(-2, 1), override=True) + bd = analyzer.const_int_bound(x / y) + assert bd.min_value == bd.NEG_INF + assert bd.max_value == bd.POS_INF + + +def test_mod_bound(): + analyzer = tvm.arith.Analyzer() + x, y = tvm.var("x"), tvm.var("y") + + 
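# NOTE: the expected bounds below assume truncated (C-style) modulo, + # i.e. the remainder takes the sign of the dividend. +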
analyzer.update(x, tvm.arith.ConstIntBound(-9, 4)) + analyzer.update(y, tvm.arith.ConstIntBound(4, 10)) + bd = analyzer.const_int_bound(x % y) + assert bd.min_value == -9 + assert bd.max_value == 4 + + analyzer.update(x, tvm.arith.ConstIntBound(bd.NEG_INF, bd.POS_INF), override=True) + analyzer.update(y, tvm.arith.ConstIntBound(4, 10), override=True) + bd = analyzer.const_int_bound(x % y) + assert bd.min_value == -9 + assert bd.max_value == 9 + + analyzer.update(x, tvm.arith.ConstIntBound(1, bd.POS_INF), override=True) + analyzer.update(y, tvm.arith.ConstIntBound(4, 10), override=True) + bd = analyzer.const_int_bound(x % y) + assert bd.min_value == 0 + assert bd.max_value == 9 + + +def test_min_max_bound(): + analyzer = tvm.arith.Analyzer() + x, y = tvm.var("x"), tvm.var("y") + + analyzer.update(x, tvm.arith.ConstIntBound(-9, 11)) + analyzer.update(y, tvm.arith.ConstIntBound(4, 10)) + bd = analyzer.const_int_bound(tvm.min(x, y)) + assert bd.min_value == -9 + assert bd.max_value == 10 + + analyzer.update(x, tvm.arith.ConstIntBound(bd.NEG_INF, bd.POS_INF), override=True) + analyzer.update(y, tvm.arith.ConstIntBound(4, 10), override=True) + bd = analyzer.const_int_bound(tvm.min(x, y)) + assert bd.min_value == bd.NEG_INF + assert bd.max_value == 10 + + bd = analyzer.const_int_bound(tvm.max(x, y)) + assert bd.min_value == 4 + assert bd.max_value == bd.POS_INF + + analyzer.update(x, tvm.arith.ConstIntBound(1, bd.POS_INF), override=True) + analyzer.update(y, tvm.arith.ConstIntBound(4, 10), override=True) + bd = analyzer.const_int_bound(tvm.max(x, y)) + assert bd.min_value == 4 + assert bd.max_value == bd.POS_INF + + +def test_select_bound(): + analyzer = tvm.arith.Analyzer() + x, y = tvm.var("x"), tvm.var("y") + + analyzer.update(x, tvm.arith.ConstIntBound(-9, 11)) + analyzer.update(y, tvm.arith.ConstIntBound(4, 10)) + + bd = analyzer.const_int_bound( + tvm.expr.Select(x > 1, (y < 0).astype("int32"), y + 1)) + assert bd.min_value == 0 + assert bd.max_value == 11 + + +def test_shift_and_bound(): + analyzer = tvm.arith.Analyzer() + x, y = tvm.var("x"), tvm.var("y") + + analyzer.update(x, tvm.arith.ConstIntBound(-9, 11)) + analyzer.update(y, tvm.arith.ConstIntBound(2, 10)) + + bd = analyzer.const_int_bound(x >> y) + assert bd.min_value == -3 + assert bd.max_value == 2 + + bd = analyzer.const_int_bound(x & y) + assert bd.min_value == 0 + assert bd.max_value == 10 + + analyzer.update(x, tvm.arith.ConstIntBound(10, 11), override=True) + bd = analyzer.const_int_bound(x & y) + assert bd.min_value == 0 + assert bd.max_value == 10 + + +def test_mix_index_bound(): + analyzer = tvm.arith.Analyzer() + x, y = tvm.var("x"), tvm.var("y") + analyzer.update(x, tvm.arith.ConstIntBound(0, 24 - 1)) + analyzer.update(y, tvm.arith.ConstIntBound(0, 3 - 1)) + bd = analyzer.const_int_bound((x % 8) + (x / 8) * 8) + assert bd.min_value == 0 + assert bd.max_value == 24 - 1 + + bd = analyzer.const_int_bound(y + x * 3) + assert bd.min_value == 0 + assert bd.max_value == 24 * 3 - 1 + + bd = analyzer.const_int_bound((x % 7) + (x / 7) * 7) + assert bd.min_value == 0 + assert bd.max_value == (23 // 7) * 7 + 6 + + +if __name__ == "__main__": + test_dtype_bound() + test_cast_bound() + test_add_sub_bound() + test_mul_bound() + test_div_bound() + test_mod_bound() + test_min_max_bound() + test_select_bound() + test_shift_and_bound() + test_mix_index_bound() diff --git a/tests/python/unittest/test_arith_modular.py b/tests/python/unittest/test_arith_modular.py deleted file mode 100644 index 58b5d3115d5e..000000000000 --- 
a/tests/python/unittest/test_arith_modular.py +++ /dev/null @@ -1,32 +0,0 @@ -import tvm - -def test_basic(): - a = tvm.var() - b = tvm.var() - m = tvm.arith.EvalModular(a * 4 + b * 6 + 7) - assert m.coeff == 2 - assert m.base == 1 - - m = tvm.arith.EvalModular((a * 4 + 1) * (b * 8 + 3)) - assert m.coeff == 4 - assert m.base == 3 - - m = tvm.arith.EvalModular((a * 4 + 1) / (b * 8 + 3)) - assert m.coeff == 1 - assert m.base == 0 - - m = tvm.arith.EvalModular((a * 4 + 1) * (b * 8 / 4)) - assert m.coeff == 2 - assert m.base == 0 - - m = tvm.arith.EvalModular((a * 12 + 1) - (b * 3 * 7 + 2)) - assert m.coeff == 3 - assert m.base == 2 - - - m = tvm.arith.EvalModular(a * 12 + tvm.min(b * 3 * 7, 2)) - assert m.coeff == 1 - assert m.base == 0 - -if __name__ == "__main__": - test_basic() diff --git a/tests/python/unittest/test_arith_modular_set.py b/tests/python/unittest/test_arith_modular_set.py new file mode 100644 index 000000000000..06ae5197b974 --- /dev/null +++ b/tests/python/unittest/test_arith_modular_set.py @@ -0,0 +1,128 @@ +import tvm + + +def test_cast(): + analyzer = tvm.arith.Analyzer() + x = tvm.var("x", dtype="int8") + m = analyzer.modular_set((x * 3).astype("uint32")) + assert m.coeff == 3 + assert m.base == 0 + m = analyzer.modular_set( + (x * 3 + 1).astype("float32").astype("int32")) + assert m.coeff == 3 + assert m.base == 1 + + +def test_add_sub(): + analyzer = tvm.arith.Analyzer() + x, y = tvm.var("x", "int64"), tvm.var("y", "int64") + m = analyzer.modular_set(x * 6 + y * 4) + assert m.coeff == 2 + assert m.base == 0 + + analyzer.bind(y, x * 4 + 1) + m = analyzer.modular_set(1 - y) + assert m.coeff == 4 + assert m.base == 0 + + +def test_mul(): + analyzer = tvm.arith.Analyzer() + x, y = tvm.var("x"), tvm.var("y") + m = analyzer.modular_set((x * 4 + 2) * (y * 6 + 1)) + assert m.coeff == 4 + assert m.base == 2 + + +def test_div_shift(): + analyzer = tvm.arith.Analyzer() + x, y = tvm.var("x"), tvm.var("y") + # the analyzer cannot prove x is non-negative here, so the division is not simplified + m = analyzer.modular_set((x * 4 + 2) / 2) + assert m.coeff == 1 + assert m.base == 0 + # right shift always rounds down, so it is fine + m = analyzer.modular_set((x * 4 + 2) >> 1) + assert m.coeff == 2 + assert m.base == 1 + # x is non-negative + analyzer.update(x, tvm.arith.ConstIntBound(0, 100)) + m = analyzer.modular_set((x * 4 + 2) / 2) + assert m.coeff == 2 + assert m.base == 1 + + +def test_min_max_select(): + analyzer = tvm.arith.Analyzer() + x, y = tvm.var("x"), tvm.var("y") + m = analyzer.modular_set(tvm.min(x * 3, y * 9)) + assert m.coeff == 3 + assert m.base == 0 + + m = analyzer.modular_set(tvm.max(x * 3 + 1, y * 9 + 4)) + assert m.coeff == 3 + assert m.base == 1 + + m = analyzer.modular_set(tvm.expr.Select(x > 0, x * 3 + 1, y * 9 + 2)) + assert m.coeff == 1 + assert m.base == 0 + + +def test_mix_index(): + a = tvm.var("a") + b = tvm.var("b") + analyzer = tvm.arith.Analyzer() + m = analyzer.modular_set(a * 4 + b * 6 + 7) + assert m.coeff == 2 + assert m.base == 1 + + m = analyzer.modular_set((a * 4 + 1) * (b * 8 + 3)) + assert m.coeff == 4 + assert m.base == 3 + + m = analyzer.modular_set((a * 4 + 1) / (b * 8 + 3)) + assert m.coeff == 1 + assert m.base == 0 + + m = analyzer.modular_set((a * 4 + 1) * (b * 8 / 4)) + assert m.coeff == 2 + assert m.base == 0 + + m = analyzer.modular_set((a * 12 + 1) - (b * 3 * 7 + 2)) + assert m.coeff == 3 + assert m.base == 2 + + m = analyzer.modular_set(a * 12 + tvm.min(b * 3 * 7, 2)) + assert m.coeff == 1 + assert m.base == 0 + + +def test_constraint_scope(): + a = tvm.var("a") + b = tvm.var("b") + 
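# constraint scopes nest: each constraint holds only inside its "with" block + # and is dropped again on exit, which the checks below exercise. +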
analyzer = tvm.arith.Analyzer() + with analyzer.constraint_scope(b % 4 == 2): + m = analyzer.modular_set(b + 1) + assert m.coeff == 4 + assert m.base == 3 + with analyzer.constraint_scope(a % 2 == 1): + m = analyzer.modular_set(b + a * 2) + assert m.coeff == 4 + assert m.base == 0 + m = analyzer.modular_set(b + a * 2) + assert m.coeff == 2 + assert m.base == 0 + + m = analyzer.modular_set(b + 1) + assert m.coeff == 1 + assert m.base == 0 + + +if __name__ == "__main__": + test_cast() + test_add_sub() + test_mul() + test_div_shift() + test_min_max_select() + test_mix_index() + test_constraint_scope() diff --git a/tests/python/unittest/test_arith_rewrite_simplify.py b/tests/python/unittest/test_arith_rewrite_simplify.py new file mode 100644 index 000000000000..bbfddddd41da --- /dev/null +++ b/tests/python/unittest/test_arith_rewrite_simplify.py @@ -0,0 +1,252 @@ +import tvm + +class RewriteChecker: + def __init__(self): + self.analyzer = tvm.arith.Analyzer() + + def verify(self, data, expected): + res = self.analyzer.rewrite_simplify(data) + assert tvm.ir_pass.Equal(res, expected), "data={}, res={}, expected={}".format( + data, res, expected) + + +def test_vector_simplify(): + ck = RewriteChecker() + x, y, z = tvm.var("x"), tvm.var("y"), tvm.var("z") + # Add rules + ck.verify(tvm.expr.Ramp(x, 1, 4) + tvm.expr.Ramp(y, 2, 4), + tvm.expr.Ramp(x + y, 3, 4)) + ck.verify(tvm.expr.Ramp(x, 1, 2) + y, + tvm.expr.Ramp(x + y, 1, 2)) + ck.verify(y + tvm.expr.Ramp(x, 1, 2) , + tvm.expr.Ramp(y + x, 1, 2)) + ck.verify(y.astype("int32x2") + x.astype("int32x2"), + (y + x).astype("int32x2")) + # Sub rules + ck.verify(tvm.expr.Ramp(x, 4, 4) - tvm.expr.Ramp(y, 2, 4), + tvm.expr.Ramp(x - y, 2, 4)) + ck.verify(tvm.expr.Ramp(x, 1, 2) - y, + tvm.expr.Ramp(x - y, 1, 2)) + ck.verify(y - tvm.expr.Ramp(x, 1, 2) , + tvm.expr.Ramp(y - x, -1, 2)) + ck.verify(y.astype("int32x2") - x.astype("int32x2"), + (y - x).astype("int32x2")) + + # Mul rules + ck.verify(y.astype("int32x2") * x.astype("int32x2"), + (y * x).astype("int32x2")) + ck.verify(tvm.expr.Ramp(x, 4, 4) * 2, + tvm.expr.Ramp(x * 2, 8, 4)) + ck.verify(2 * tvm.expr.Ramp(x, 4, 4), + tvm.expr.Ramp(x * 2, 8, 4)) + + ## Div rules + ck.verify(y.astype("int32x2") / x.astype("int32x2"), + (y / x).astype("int32x2")) + ck.verify(tvm.expr.Ramp(x, 4, 4) / 2, + tvm.expr.Ramp(x/ 2, 2, 4)) + ck.analyzer.update(x, tvm.arith.ConstIntBound(0, 1000), override=True) + ck.verify(tvm.expr.Ramp(x * 8 + 1, 1, 4) / 8, + (x).astype("int32x4")) + ck.verify(tvm.expr.Ramp(x * 8 + 15, 1, 4) / 8, + tvm.expr.Ramp(x * 8 + 15, 1, 4) / 8) + + ## Mod rules + ck.verify(y.astype("int32x2") % x.astype("int32x2"), + (y % x).astype("int32x2")) + ck.verify(tvm.expr.Ramp(x, 4, 4) % 2, + tvm.expr.Broadcast(x % 2, 4)) + ck.analyzer.update(x, tvm.arith.ConstIntBound(0, 1000), override=True) + ck.verify(tvm.expr.Ramp(x * 8 + 1, 1, 4) % 8, + tvm.expr.Ramp(1, 1, 4)) + ck.verify(tvm.expr.Ramp(x * 8 + 1, 15, 4) % 8, + tvm.expr.Ramp(1, 15, 4) % 8) + + + +def test_select_simplify(): + ck = RewriteChecker() + x, y, z = tvm.var("x"), tvm.var("y"), tvm.var("z") + # Add rules + ck.verify(tvm.expr.Select(x > 0, y, 0) + tvm.expr.Select(x > 0, 1, z), + tvm.expr.Select(x > 0, y + 1, z)) + ck.verify(tvm.expr.Select(x > 0, y, 1) - tvm.expr.Select(x > 0, 1, z), + tvm.expr.Select(x > 0, y + (-1), 1 - z)) + ck.verify(tvm.expr.Select(x > 0, y, z) - y, + tvm.expr.Select(x > 0, 0, z - y)) + ck.verify(tvm.expr.Select(x > 0, y, z) - z, + tvm.expr.Select(x > 0, y - z, 0)) + + +def test_add_index_simplify(): + ck = RewriteChecker() + x, 
y, z = tvm.var("x"), tvm.var("y"), tvm.var("z") + + ck.verify(x + (y - x), y) + ck.verify(x - (y + 1) + (y + 1), x) + ck.verify((x - 10) + (10 - z), x - z) + ck.verify((x - y) + (z - x), z - y) + + ck.verify(tvm.min(x, y - z) + z, tvm.min(x + z, y)) + ck.verify(tvm.min(x - z, y) + z, tvm.min(x, y + z)) + ck.verify(tvm.max(x, y - 10) + 10, tvm.max(x + 10, y)) + ck.verify(tvm.max(x - 11, y) + 11, tvm.max(x, y + 11)) + + ck.verify(tvm.max(x, y * 2) + tvm.min(x, y * 2), x + y * 2); + ck.verify(tvm.min(x, y * 2) + tvm.max(x, y * 2), x + y * 2); + + ck.verify(tvm.max(x, y + 2) + (-2), tvm.max(x + (-2), y)); + ck.verify(tvm.min(x, y + 2) + (-2), tvm.min(x + (-2), y)); + ck.verify(tvm.min(x + 2, y + 3) + (-2), tvm.min(x, y + 1)); + + ck.verify(x * y + x * 10, x * (y + 10)) + ck.verify(y * x + x * 10, x * (y + 10)) + ck.verify(y * x + 10 * x, x * (y + 10)) + ck.verify(x * y + 10 * x, x * (y + 10)) + + ck.verify(y * (x % 8) + 10 * (x % 8), (x % 8) * (y + 10)) + ck.analyzer.update(x, tvm.arith.ConstIntBound(0, 1000), override=True) + ck.verify((x / 8) * 8 + x % 8, x) + + # canonicalization + ck.verify(x + 2 + 3 + 4 + x, x * 2 + 9); + ck.verify(x + 2 + 3 + 4 + x * 3, x * 4 + 9); + + # conservative bound + try: + ck.analyzer.update(x, tvm.arith.ConstIntBound(-1, 1000), override=True) + ck.verify((x / 8) * 8 + x % 8, x) + raise RuntimeError("bad") + except AssertionError: + pass + + +def test_sub_index_simplify(): + ck = RewriteChecker() + x, y, z = tvm.var("x"), tvm.var("y"), tvm.var("z") + + ck.verify(x + y - y, x) + ck.verify(x + y - x, y) + ck.verify(x - (y + x), 0 - y) + ck.verify(x - (x + y), 0 - y) + + ck.verify(tvm.min(x, y) - x, tvm.min(0, y - x)) + ck.verify(tvm.min(x, y) - y, tvm.min(x - y, 0)) + ck.verify(tvm.max(x, y) - x, tvm.max(0, y - x)) + ck.verify(tvm.max(x, y) - y, tvm.max(x - y, 0)) + + ck.verify(x - tvm.min(x, y), tvm.max(0, x - y)) + ck.verify(y - tvm.min(x, y), tvm.max(y - x, 0)) + ck.verify(x - tvm.max(x, y), tvm.min(0, x - y)) + ck.verify(y - tvm.max(x, y), tvm.min(y - x, 0)) + + # mul co-efficient foldng + ck.verify(x - x, 0) + ck.verify(x * y - x, x * (y + (-1))) + ck.verify(x * y - 10 * x, x * (y + (-10))) + ck.verify(y * x - x * z, x * (y - z)) + ck.verify(y * x - z * x, x * (y - z)) + + ck.verify(x + 10 - 20, x + (-10)) + + # 4-operands pattern + ck.verify((x + y) - (x + z), y - z) + ck.verify((y + x) - (x + z), y - z) + ck.verify((x + y) - (z + x), y - z) + ck.verify((y + x) - (z + x), y - z) + + ck.verify(tvm.min(x + y, z) - x, tvm.min(y, z - x)) + ck.verify(tvm.min(y + x, z) - x, tvm.min(y, z - x)) + ck.verify(tvm.min(z, x + y) - x, tvm.min(z - x, y)) + ck.verify(tvm.min(z, y + x) - x, tvm.min(z - x, y)) + + ck.verify(x - tvm.min(x + y, z), tvm.max(0 - y, x - z)) + ck.verify(x - tvm.min(y + x, z), tvm.max(0 - y, x - z)) + ck.verify(x - tvm.min(z, x + y), tvm.max(x - z, 0 - y)) + ck.verify(x - tvm.min(z, y + x), tvm.max(x - z, 0 - y)) + + ck.verify(tvm.min(x, y) - tvm.min(y, x), 0) + ck.verify(tvm.max(x, y) - tvm.max(y, x), 0) + ck.verify(tvm.min(x, y) - tvm.min(x + 10, y + 10), -10) + ck.verify(tvm.min(x + 10, y + 1) - tvm.min(x, y - 9), 10) + + # div pattern + ck.analyzer.update(x, tvm.arith.ConstIntBound(0, 1000), override=True) + ck.verify(x - (x / 3) * 3, x % 3) + ck.verify((x + 5) / 3 - x / 3, (((x + 2) % 3) + 5)/ 3) + + +def test_mul_index_simplify(): + ck = RewriteChecker() + x, y, z = tvm.var("x"), tvm.var("y"), tvm.var("z") + ck.verify((x + 2) * 3, x * 3 + 6) + ck.verify((x * 2) * 3, x * 6) + ck.verify(tvm.min(x, y) * tvm.max(x, y), x * y) + 
ck.verify(tvm.max(x, y) * tvm.min(x, y), x * y) + ck.verify((x - y) * (-2), (y - x) * 2) + + +def test_div_index_simplify(): + ck = RewriteChecker() + x, y, z = tvm.var("x"), tvm.var("y"), tvm.var("z") + ck.analyzer.update(x, tvm.arith.ConstIntBound(0, 1000), override=True) + ck.analyzer.update(y, tvm.arith.ConstIntBound(0, 1000), override=True) + ck.analyzer.update(z, tvm.arith.ConstIntBound(0, 1000), override=True) + + ck.verify(x / 2 / 3, x / 6) + ck.verify((x / 2 + 1) / 3, (x + 2) / 6) + ck.verify(x * 2 / 4, x / 2) + ck.verify(x * 4 / 2, x * 2) + + ck.verify((x * 4 + y) / 2, x * 2 + y / 2) + ck.verify(tvm.min(x * 6, y) / 2, tvm.min(x * 3, y / 2)) + ck.verify(tvm.max(x * 6, y) / 2, tvm.max(x * 3, y / 2)) + + ck.verify((y + x * 4) / 2, y / 2 + x * 2) + ck.verify(tvm.min(y, x * 6) / 2, tvm.min(y / 2, x * 3)) + ck.verify(tvm.max(y, x * 6) / 2, tvm.max(y / 2, x * 3)) + + # 3-operands + ck.verify((x * 6 + y + z) / 2, x * 3 + (y + z) / 2) + ck.verify((x * 6 - y + (y + 3)) / 2, x * 3 + 1) + ck.verify((x * 6 + (y + 3) - y) / 2, x * 3 + 1) + ck.verify((y + x * 6 + z) / 2, x * 3 + (y + z) / 2) + ck.verify((x + 4) / 2, x / 2 + 2) + + ck.verify((x + y) / x, y / x + 1) + ck.verify((y + x) / x, y / x + 1) + ck.verify(((x + y) + z) / x, (y + z) / x + 1) + ck.verify(((y + x) + z) / x, (y + z) / x + 1) + ck.verify((y + (x + z)) / x, (y + z) / x + 1) + ck.verify((y + (z + x)) / x, (y + z) / x + 1) + + ck.verify((x * y) / y, x) + ck.verify((y * x) / y, x) + + ck.verify((x * z + y) / z, x + y / z) + ck.verify((z * x + y) / z, x + y / z) + ck.verify((y + x * z) / z, y / z + x) + ck.verify((y + z * x) / z, y / z + x) + + +def test_mod_index_simplify(): + ck = RewriteChecker() + x, y, z = tvm.var("x"), tvm.var("y"), tvm.var("z") + ck.analyzer.update(x, tvm.arith.ConstIntBound(0, 1000), override=True) + ck.analyzer.update(y, tvm.arith.ConstIntBound(0, 1000), override=True) + + ck.verify(x * 10 % 2, 0) + ck.verify((x * 10 + y) % 2, y % 2) + ck.verify((x + 10) % 2, x % 2) + ck.verify((x + y * 10) % 2, x % 2) + ck.verify((x* 10 + 1 + y * 2 + 2) % 2, 1) + + +if __name__ == "__main__": + test_mod_index_simplify() + test_vector_simplify() + test_add_index_simplify() + test_sub_index_simplify() + test_mul_index_simplify() + test_div_index_simplify() + test_select_simplify() diff --git a/tests/python/unittest/test_autotvm_flop_calculator.py b/tests/python/unittest/test_autotvm_flop_calculator.py index 27bd49fe14df..c5c046894f0c 100644 --- a/tests/python/unittest/test_autotvm_flop_calculator.py +++ b/tests/python/unittest/test_autotvm_flop_calculator.py @@ -5,11 +5,17 @@ from tvm.autotvm.task.task import compute_flop +def random_dtypes(): + """Return pair of (input, accumulator) dtypes""" + candidates = [("float32", "float32"), ("float16", "float32"), ("int8", "int32")] + return candidates[np.random.choice(len(candidates))] + def test_conv(): for i in range(5): N, H, W, CO, CI, KH, KW = [np.random.randint(10, 32) for _ in range(7)] - D = tvm.placeholder((N, CI, H, W)) - K = tvm.placeholder((CO, CI, KH, KW)) + (input_dtype, acc_dtype) = random_dtypes() + D = tvm.placeholder((N, CI, H, W), dtype=input_dtype) + K = tvm.placeholder((CO, CI, KH, KW), dtype=input_dtype) KH = min(H, KH) KW = min(W, KW) @@ -22,7 +28,8 @@ def test_conv(): OW = (W - KW) + 1 C = tvm.compute((N, CO, OH, OW), lambda n, co, h, w: - tvm.sum(D[n][ci][h][w] * K[co][ci][h][w], axis=[ci, kh, kw])) + tvm.sum(D[n][ci][h][w].astype(acc_dtype) * K[co][ci][h][w].astype(acc_dtype), + axis=[ci, kh, kw])) s = tvm.create_schedule([C.op]) @@ -31,15 +38,16 @@ def 
test_conv(): def test_pack_gemm(): for i in range(5): N, L, M = [np.random.randint(10, 128) * 4 for _ in range(3)] - A = tvm.placeholder((N, L)) - B = tvm.placeholder((M, L)) + (input_dtype, acc_dtype) = random_dtypes() + A = tvm.placeholder((N, L), dtype=input_dtype) + B = tvm.placeholder((M, L), dtype=input_dtype) k = tvm.reduce_axis((0, L)) bn = 4 A_pack = tvm.compute((N // bn, L, bn), lambda i, j, k: A[i * bn + k][j]) B_pack = tvm.compute((M // bn, L, bn), lambda i, j, k: B[i * bn + k][j]) C_pack = tvm.compute((N // bn, M // bn, bn, bn), lambda i, j, ii, jj: - tvm.sum(A_pack[i, k, ii] * B_pack[j, k, jj], axis=[k])) + tvm.sum(A_pack[i, k, ii].astype(acc_dtype) * B_pack[j, k, jj].astype(acc_dtype), axis=[k])) C = tvm.compute((N, M), lambda i, j: C_pack[i // bn][j // bn][i % bn][j % bn]) s = tvm.create_schedule([C.op]) @@ -48,14 +56,61 @@ def test_pack_gemm(): def test_outer_dot(): for i in range(5): N, M = [np.random.randint(10, 128) * 4 for _ in range(2)] - A = tvm.placeholder((N,)) - B = tvm.placeholder((M,)) + (input_dtype, acc_dtype) = random_dtypes() + A = tvm.placeholder((N,), dtype=input_dtype) + B = tvm.placeholder((M,), dtype=input_dtype) - C = tvm.compute((N, M), lambda i, j: A[i] * B[j]) + C = tvm.compute((N, M), lambda i, j: A[i].astype(acc_dtype) * B[j].astype(acc_dtype)) s = tvm.create_schedule([C.op]) assert compute_flop(s) == N * M +def test_max_pool(): + for i in range(5): + N, H, W, CO, CI, KH, KW = [np.random.randint(10, 32) for _ in range(7)] + (input_dtype, _) = random_dtypes() + D = tvm.placeholder((N, CI, H, W), dtype=input_dtype) + + KH = min(H, KH) + KW = min(W, KW) + + kh = tvm.reduce_axis((0, KH)) + kw = tvm.reduce_axis((0, KW)) + + OH = (H - KH) + 1 + OW = (W - KW) + 1 + + C = tvm.compute( + (N, CO, OH, OW), + lambda n, co, h, w: tvm.max(D[n][co][h + kh][w + kw], axis=[kh, kw])) + + s = tvm.create_schedule([C.op]) + + assert compute_flop(s) == N * CO * OH * OW * KH * KW + +def test_average_pool(): + for i in range(5): + N, H, W, CO, CI, KH, KW = [np.random.randint(10, 32) for _ in range(7)] + (input_dtype, acc_dtype) = random_dtypes() + D = tvm.placeholder((N, CI, H, W), dtype=input_dtype) + + KH = min(H, KH) + KW = min(W, KW) + + kh = tvm.reduce_axis((0, KH)) + kw = tvm.reduce_axis((0, KW)) + + OH = (H - KH) + 1 + OW = (W - KW) + 1 + + C = tvm.compute( + (N, CO, OH, OW), + lambda n, co, h, w: tvm.sum(D[n][co][h + kh][w + kw].astype(acc_dtype) / (KW * KH), axis=[kh, kw])) + + s = tvm.create_schedule([C.op]) + + assert compute_flop(s) == 2 * N * CO * OH * OW * KH * KW + def test_move(): """No float number operation in simple move. 
So the estimator should raise an error. """ N = 1024 diff --git a/tests/python/unittest/test_codegen_arm.py b/tests/python/unittest/test_codegen_arm.py index 24240db72b26..049696f95135 100644 --- a/tests/python/unittest/test_codegen_arm.py +++ b/tests/python/unittest/test_codegen_arm.py @@ -26,5 +26,49 @@ def check_correct_assembly(type, elements, counts): check_correct_assembly('uint32', 2, 2) check_correct_assembly('uint64', 2, 3) +def test_vmlal_s16(): + target = 'llvm -target=armv7l-none-linux-gnueabihf -mcpu=cortex-a53 -mattr=+neon' + + def check_correct_assembly(N): + K = tvm.var("K") + A = tvm.placeholder((K, N), dtype="int8", name='A') + B = tvm.placeholder((K, N), dtype="int8", name='B') + k = tvm.reduce_axis((0, K)) + C = tvm.compute((N, ), lambda n: tvm.sum( + A[k, n].astype("int32") * B[k, n].astype("int32"), axis=[k]), name='C') + s = tvm.create_schedule(C.op) + s[C].vectorize(s[C].op.axis[0]) + f = tvm.build(s, [A, B, C], target) + + # Verify we see the correct number of vmlal.s16 instructions + assembly = f.get_source('asm') + matches = re.findall("vmlal.s16", assembly) + assert len(matches) == N // 4 + check_correct_assembly(4) + check_correct_assembly(8) + check_correct_assembly(16) + + def check_broadcast_correct_assembly(N): + K = tvm.var("K") + A = tvm.placeholder((K, N), dtype="int8", name='A') + B = tvm.placeholder((K,), dtype="int8", name='B') + k = tvm.reduce_axis((0, K)) + C = tvm.compute((N, ), lambda n: tvm.sum( + A[k, n].astype("int32") * B[k].astype("int32"), + axis=[k]), name='C') + s = tvm.create_schedule(C.op) + s[C].vectorize(s[C].op.axis[0]) + f = tvm.build(s, [A, B, C], target) + + # Verify we see the correct number of vmlal.s16 instructions + assembly = f.get_source('asm') + matches = re.findall("vmlal.s16", assembly) + assert len(matches) == N // 4 + check_broadcast_correct_assembly(8) + check_broadcast_correct_assembly(16) + check_broadcast_correct_assembly(32) + check_broadcast_correct_assembly(64) + if __name__ == "__main__": test_popcount() + test_vmlal_s16() diff --git a/tests/python/unittest/test_codegen_c_host.py b/tests/python/unittest/test_codegen_c_host.py index 00acbeb88fcf..f6a69c3a7b13 100644 --- a/tests/python/unittest/test_codegen_c_host.py +++ b/tests/python/unittest/test_codegen_c_host.py @@ -11,10 +11,7 @@ def test_add(): s = tvm.create_schedule(C.op) def check_c(): - f1 = tvm.lower(s, [A, B, C], name="fadd") - fsplits = [x for x in tvm.ir_pass.SplitHostDevice(f1)] - fsplits[0] = tvm.ir_pass.LowerTVMBuiltin(fsplits[0]) - mhost = tvm.codegen.build_module(fsplits[0], "c") + mhost = tvm.build(s, [A, B, C], "c", name="fadd") temp = util.tempdir() path_dso = temp.relpath("temp.so") mhost.export_library(path_dso) diff --git a/tests/python/unittest/test_hybrid_script.py b/tests/python/unittest/test_hybrid_script.py index 405577b05b3b..5bed58c8f617 100644 --- a/tests/python/unittest/test_hybrid_script.py +++ b/tests/python/unittest/test_hybrid_script.py @@ -300,6 +300,7 @@ def test_bind(): if not tvm.gpu(0).exist: print('[Warning] No GPU found! 
Skip bind test!') return + @script def vec_add(a, b): c = output_tensor((1000, ), 'float32') @@ -326,23 +327,45 @@ def raw(a, b): func, ins, outs = run_and_check(raw, [a, b], sch=sch, outs=[c], target='cuda') run_and_check(func, ins, outs=outs, target='cuda') - # Test loop binds + @tvm.hybrid.script - def goo(a, b): - c = output_tensor(a.shape, a.dtype) - len_b = len(b) - for i in const_range(len_b * 2): - if i < len_b: - c[i] = a[i] + b[i] - else: - c[i - len_b] = a[i - len_b] + b[i - len_b] + def foo(a): + c = output_tensor((a.shape[0],), a.dtype) + total = allocate((1,), a.dtype, 'local') + len_i = a.shape[0] + len_j = a.shape[1] + for i in bind('threadIdx.x', len_i): + total[0] = 0. + for k in const_range(len_j): + total[0] += a[i, k] + c[i] = total[0] + return c - a = tvm.placeholder((5, ), name='a', dtype='int32') - b = [1, 2, 3, 4, 5] - c = goo(a, tvm.convert(b)) - sch = tvm.create_schedule(c.op) - func, ins, outs = run_and_check(goo, [a, b], sch=sch, outs=[c]) - run_and_check(func, ins, outs=outs) + + a = tvm.placeholder((8, 4), 'float32') + c = foo(a) + s = tvm.create_schedule(c.op) + ir = tvm.lower(s, [a, c], simple_mode=True) + assert not isinstance(ir, tvm.stmt.AttrStmt) + func, ins, outs = run_and_check(foo, [a], target='cuda') + run_and_check(func, ins, outs=outs, target='cuda') + + @tvm.hybrid.script + def max_threads(a): + b = output_tensor(a.shape, a.dtype) + n = a.shape[0] + m = max_num_threads(True) + for i in bind('threadIdx.x', m): + for j in bind('blockIdx.x', ceil_div(n, m)): + if i * m + j < n: + b[i * m + j] = a[i * m + j] + a[i * m + j] + return b + + a = tvm.placeholder((10000, ), 'float32') + with tvm.target.create('cuda'): + func, ins, outs = run_and_check(max_threads, [a], target='cuda') + run_and_check(func, ins, outs=outs, target='cuda') + def test_math_intrin(): @script @@ -455,6 +478,7 @@ def share_vec_add(a, b): a = tvm.placeholder((256, ), dtype='float32', name='a') b = tvm.placeholder((256, ), dtype='float32', name='b') + c = share_vec_add(a, b) func, ins, outs = run_and_check(share_vec_add, [a, b], target='cuda') run_and_check(func, ins, outs=outs, target='cuda') else: diff --git a/tests/python/unittest/test_lang_data_layout.py b/tests/python/unittest/test_lang_data_layout.py new file mode 100644 index 000000000000..73d626e32fa7 --- /dev/null +++ b/tests/python/unittest/test_lang_data_layout.py @@ -0,0 +1,65 @@ +"""Test layout and bijective-layout node""" + +import tvm +from topi.util import get_const_tuple + +def test_layout(): + layout = tvm.layout("NCHW16c") + assert layout is not None + assert isinstance(layout, tvm.tensor.Layout) + + assert layout.factor_of("c") == 16 + assert layout.factor_of("C") == 16 + assert layout.factor_of("N") == -1 + + assert layout.index_of("N") == 0 + assert layout.index_of("C") == 1 + assert layout.index_of("H") == 2 + assert layout.index_of("W") == 3 + assert layout.index_of("c") == 4 + assert layout.index_of("O") == -1 + + assert "N" in layout + assert "C" in layout + assert "H" in layout + assert "W" in layout + assert "c" in layout + assert "O" not in layout + + assert layout[0] == "N" + assert layout[1] == "C" + assert layout[2] == "H" + assert layout[3] == "W" + assert layout[4] == "c" + assert layout[-1] == "c" + +def test_bilayout_convertible(): + # not convertible + assert tvm.bijective_layout("NCHW", "ABCD") is None + # convertible + assert tvm.bijective_layout("NCHW", "NCHW16c") is not None + +def test_bilayout_shape(): + bilayout = tvm.bijective_layout("NCHW", "NCHW16c") + assert isinstance(bilayout, 
tvm.tensor.BijectiveLayout) + + dst_shape = bilayout.forward_shape((1, 32, 7, 7)) + assert get_const_tuple(dst_shape) == (1, 2, 7, 7, 16) + + src_shape = bilayout.backward_shape(dst_shape) + assert get_const_tuple(src_shape) == (1, 32, 7, 7) + +def test_bilayout_index(): + bilayout = tvm.bijective_layout("NCHW", "NCHW16c") + + dst_index = bilayout.forward_index([0, 18, 6, 6]) + assert get_const_tuple(dst_index) == (0, 1, 6, 6, 2) + + src_index = bilayout.backward_index([0, 1, 6, 6, 2]) + assert get_const_tuple(src_index) == (0, 18, 6, 6) + +if __name__ == "__main__": + test_layout() + test_bilayout_convertible() + test_bilayout_shape() + test_bilayout_index() diff --git a/tests/python/unittest/test_runtime_graph_debug.py b/tests/python/unittest/test_runtime_graph_debug.py index b9d8b689cb9e..4bbe6509c40c 100644 --- a/tests/python/unittest/test_runtime_graph_debug.py +++ b/tests/python/unittest/test_runtime_graph_debug.py @@ -68,6 +68,9 @@ def check_verify(): out = mod.get_output(0, tvm.nd.empty((n,))) np.testing.assert_equal(out.asnumpy(), a + 1) + #test individual run + mod.run_individual(20, 2, 1) + mod.exit() #verify dump root delete after cleanup assert(not os.path.exists(directory)) @@ -94,6 +97,7 @@ def check_remote(): mod.run(x=tvm.nd.array(a, ctx)) out = tvm.nd.empty((n,), ctx=ctx) out = mod.get_output(0, out) + mod.run_individual(20, 2, 1) np.testing.assert_equal(out.asnumpy(), a + 1) check_verify() diff --git a/tests/python/unittest/test_schedule_tensorize.py b/tests/python/unittest/test_schedule_tensorize.py index ca5836143ef3..259c302eddd8 100644 --- a/tests/python/unittest/test_schedule_tensorize.py +++ b/tests/python/unittest/test_schedule_tensorize.py @@ -229,7 +229,85 @@ def intrin_func(ins, outs): s = s.normalize() tvm.lower(s, [A, B]) +# This test asserts that tensorize does not have any effect on +# TensorComputeOp operations +def test_tensorize_tensor_compute_op(): + # an intrinsic called "multivadd" whose definition (pattern) + # is a loop of another intrinsic called "vadd" + def intrin_multivadd(n): + n_a = tvm.var("n_a") + Ab = tvm.decl_buffer((n, ), tvm.float32, strides=[n_a]) + + n_b = tvm.var("n_b") + Bb = tvm.decl_buffer((n, ), tvm.float32, strides=[n_b]) + + n_c = tvm.var("n_c") + Cb = tvm.decl_buffer((n, ), tvm.float32, strides=[n_c]) + + z = tvm.compute((n,), lambda i: tvm.call_extern("float32", 'vadd', + Ab.access_ptr("w", offset=n_a*i), + Bb.access_ptr("r", offset=n_b*i), + Cb.access_ptr("r", offset=n_c*i))) + + # replace the pattern with the multivadd call. I need to figure out + # how to pass it the right parameters. 
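+ # until then, the lowered body below is just a bare packed call with no buffer arguments.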
+ def intrin_func(ins, outs): + return tvm.call_packed("multivadd") + + with tvm.build_config(): + return tvm.decl_tensor_intrin(z.op, intrin_func, name="multivadd") + + def intrin_vadd(n): + dtype = 'float32' + x = tvm.placeholder((n,), dtype=dtype, name='vx') + y = tvm.placeholder((n,), dtype=dtype, name='vy') + z = tvm.compute(x.shape, lambda i: x[i] + y[i], name='z') + s = tvm.create_schedule(z.op) + + def create_buffer(t): + return tvm.decl_buffer(t.shape, t.dtype, + name='W'+t.name, + offset_factor=16) + + def intrin_func(ins, outs): + ib = tvm.ir_builder.create() + ib.emit(tvm.call_extern("float32", 'vadd', + ins[0].access_ptr("r"), ins[1].access_ptr('r'), + outs[0].access_ptr('wr'))) + return ib.get() + + with tvm.build_config(offset_factor=16): + return tvm.decl_tensor_intrin(z.op, intrin_func, binds={x: create_buffer(x), + y: create_buffer(y), + z: create_buffer(z)}) + + # cache_read, cache_write + M = 1024 + factor = 16 + dtype = 'float32' + + A = tvm.placeholder((M//factor, factor), name="A", dtype=dtype) + B = tvm.placeholder((M//factor, factor), name="B", dtype=dtype) + + vadd = intrin_vadd(factor) + C = tvm.compute((M//factor, factor), + lambda i: vadd(A[i, 0:factor], B[i, 0:factor]), name='C') + + s = tvm.create_schedule(C.op) + multivadd = intrin_multivadd(64) + s[C].tensorize(C.op.axis[0], multivadd) + s = s.normalize() + dom_map = tvm.schedule.InferBound(s) + stmt = tvm.schedule.ScheduleOps(s, dom_map) + # The loop that we tried to tensorize still exists in the code + # That means tensorize didn't work as expected + assert isinstance(stmt.body.body.body, tvm.stmt.For) + assert stmt.body.body.body.loop_var.name == C.op.axis[0].var.name + + + if __name__ == "__main__": test_tensorize_vadd() test_tensorize_matmul() test_tensorize_op() + test_tensorize_tensor_compute_op() diff --git a/tests/scripts/task_lint.sh b/tests/scripts/task_lint.sh index 95f700172ed5..318671b082e6 100755 --- a/tests/scripts/task_lint.sh +++ b/tests/scripts/task_lint.sh @@ -1,17 +1,26 @@ #!/bin/bash + +set -e +set -u +set -o pipefail + +cleanup() +{ + rm -rf /tmp/$$.* +} +trap cleanup 0 + echo "Check codestyle of c++ code..." -make cpplint || exit -1 +make cpplint echo "Check codestyle of python code..." -make pylint || exit -1 +make pylint echo "Check codestyle of jni code..." -make jnilint || exit -1 +make jnilint echo "Check documentations of c++ code..." -make doc 2>log.txt -(cat log.txt| grep -v ENABLE_PREPROCESSING |grep -v "unsupported tag") > logclean.txt +make doc 2>/tmp/$$.log.txt + +grep -v -E "ENABLE_PREPROCESSING|unsupported tag" < /tmp/$$.log.txt > /tmp/$$.logclean.txt || true echo "---------Error Log----------" -cat logclean.txt +cat /tmp/$$.logclean.txt echo "----------------------------" -(cat logclean.txt|grep warning) && exit -1 -(cat logclean.txt|grep error) && exit -1 -rm logclean.txt -rm log.txt +grep -E "warning|error" < /tmp/$$.logclean.txt || true diff --git a/topi/include/topi/broadcast.h b/topi/include/topi/broadcast.h index ad1c04ae1327..88007ee94e85 100644 --- a/topi/include/topi/broadcast.h +++ b/topi/include/topi/broadcast.h @@ -93,6 +93,33 @@ inline tvm::Tensor broadcast_to(const tvm::Tensor& t, return topi::OpName(A, B); \ } +/*! + * \fn logical_and + * \brief Compute A && B with auto-broadcasting. + * + * \param A The first tensor, or Expr + * \param B The second tensor, or Expr + * \param name The name of the operation + * \param tag The tag to mark the operation + * + * \return The result. 
+ */ +TOPI_DEFINE_BCAST_OP(logical_and, { return a && b; }); +TOPI_DEFINE_OP_OVERLOAD(operator&&, logical_and); + +/*! + * \fn logical_or + * \brief Compute A || B with auto-broadcasting. + * + * \param A The first tensor, or Expr + * \param B The second tensor, or Expr + * \param name The name of the operation + * \param tag The tag to mark the operation + * + * \return The result. + */ +TOPI_DEFINE_BCAST_OP(logical_or, { return a || b; }); +TOPI_DEFINE_OP_OVERLOAD(operator||, logical_or); /*! * \fn add diff --git a/topi/include/topi/cuda/vision.h b/topi/include/topi/cuda/vision.h deleted file mode 100644 index 4dd8b7cee15d..000000000000 --- a/topi/include/topi/cuda/vision.h +++ /dev/null @@ -1,95 +0,0 @@ -/*! -* Copyright (c) 2018 by Contributors -* \file cuda/vision.h -* \brief CUDA schedule for vision operations -*/ -#ifndef TOPI_CUDA_VISION_H_ -#define TOPI_CUDA_VISION_H_ - -#include "tvm/tvm.h" -#include "tvm/build_module.h" -#include "topi/tags.h" -#include "topi/detail/array_utils.h" -#include "topi/contrib/cublas.h" -#include "topi/generic/extern.h" - -namespace topi { -using namespace tvm; -namespace cuda { -/*! -* \brief Create a CUDA schedule for region -* -* \param target The target to generate a schedule for. -* \param outs The output tensors. -* -* \return A schedule for the given ops. -*/ -inline Schedule schedule_region(const Target &target, const Array& outs) { - Array out_ops; - for (auto t : outs) { - out_ops.push_back(t->op); - } - auto s = create_schedule(out_ops); - auto output = outs[0]->op.output(0); - auto num_thread = 64; - - auto _schedule_softmax = [&](const Operation& softmax_op) { - auto softmax_inputs = softmax_op->InputTensors(); - auto softmax = softmax_inputs[0]; - auto max_elem = softmax_inputs[1]; - auto expsum = softmax_inputs[2]; - - auto block_x = tvm::thread_axis(Range(), "blockIdx.x"); - auto thread_x = tvm::thread_axis(Range(0, num_thread), "threadIdx.x"); - - s[max_elem].bind(max_elem->op.as()->axis[0], block_x); - auto k = expsum->op.as()->reduce_axis[0]; - IterVar ko, ki; - s[expsum].split(k, num_thread, &ko, &ki); - auto ef = s.rfactor(expsum, ki)[0]; - - s[expsum].bind(s[expsum]->op.as()->axis[0], block_x); - s[expsum].bind(s[expsum]->op.as()->reduce_axis[0], thread_x); - s[ef].compute_at(s[expsum], s[expsum]->op.as()->reduce_axis[0]); - - s[expsum].set_store_predicate(static_cast(thread_x) == 0); - IterVar tx, xi; - s[softmax_op].split_by_nparts(softmax_op.as()->axis[1], num_thread, &tx, &xi); - s[softmax_op].bind(tx, thread_x); - - return max_elem->op.as()->InputTensors()[0]; - }; - - std::function traverse; - traverse = [&](const Operation& op) { - // Inline all one-to-one-mapping operators except the last stage (output) - if (is_injective(op->tag)) { - if (!detail::contains(s->outputs, op)) { - s[op].compute_inline(); - } - for (auto tensor : op->InputTensors()) { - if (tensor->op->InputTensors().size() > 0) { - traverse(tensor->op); - } - } - } else if (op->tag == "softmax_output") { - auto tensor = _schedule_softmax(op); - if (tensor->op->InputTensors().size() > 0) { - traverse(tensor->op); - } - } else { - LOG(ERROR) << "Unsupported operator " << op->tag; - } - }; - - traverse(outs[0]->op); - auto k = output->op.as()->axis[0]; - IterVar bx, tx; - s[output].split(k, num_thread, &bx, &tx); - s[output].bind(bx, tvm::thread_axis(Range(), "blockIdx.x")); - s[output].bind(tx, tvm::thread_axis(Range(), "threadIdx.x")); - return s; -} -} // namespace cuda -} // namespace topi -#endif // TOPI_CUDA_VISION_H_ diff --git 
a/topi/include/topi/elemwise.h b/topi/include/topi/elemwise.h index 02bc51515159..40dffa09a9bf 100644 --- a/topi/include/topi/elemwise.h +++ b/topi/include/topi/elemwise.h @@ -71,6 +71,23 @@ inline Tensor negative(const Tensor& x, }, name, tag); } +/*! +* \brief Creates an operation that returns the logical NOT of a given tensor +* +* \param x The input tensor +* \param name The name of the operation +* \param tag The tag to mark the operation +* +* \return A Tensor whose op member is the logical NOT operation +*/ +inline Tensor logical_not(const Tensor& x, + std::string name = "tensor", + std::string tag = kElementWise) { + return compute(x->shape, [&](const Array& i) { + return !x(i); + }, name, tag); +} + /*! * \brief Creates an operation that clips each element of a tensor to * the interval [a_min, a_max] diff --git a/topi/include/topi/nn.h b/topi/include/topi/nn.h index 5f0b758c6424..00c3f999853d 100644 --- a/topi/include/topi/nn.h +++ b/topi/include/topi/nn.h @@ -450,28 +450,5 @@ inline tvm::Tensor group_conv2d_ngchw(const tvm::Tensor& I, return tvm::compute(output_shape, l, name, tag); } -using FLayoutIndicesTransform = std::function(const Array& indices)>; - -/*! - * \brief Transform the layout according to the mapping function \p to_src_indices. - * \param src the source input. - * \param dst_shape the output shape. - * \param to_src_indices the mapping function from input index to output index. - * \param name output tensor name. - * \param tag output tensor tag. - * \return A tensor with shape \p dst_shape. - */ -inline Tensor layout_transform(const Tensor& src, - const Array& dst_shape, - const FLayoutIndicesTransform& to_src_indices, - const std::string name = "layout_transform", - const std::string tag = kInjective) { - auto src_shape = src->shape; - return compute( - dst_shape, [&](const Array& dst_indices) { - return src(to_src_indices(dst_indices)); - }, name, tag); -} - } // namespace topi #endif // TOPI_NN_H_ diff --git a/topi/include/topi/nn/batch_matmul.h b/topi/include/topi/nn/batch_matmul.h new file mode 100644 index 000000000000..968e1b0c697c --- /dev/null +++ b/topi/include/topi/nn/batch_matmul.h @@ -0,0 +1,49 @@ +/*! + * Copyright (c) 2019 by Contributors + * \brief Batch matmul op constructions + * \file nn/batch_matmul.h + */ +#ifndef TOPI_NN_BATCH_MATMUL_H_ +#define TOPI_NN_BATCH_MATMUL_H_ + +#include + +#include "topi/tags.h" +#include "tvm/tvm.h" + +namespace topi { +namespace nn { +using namespace tvm; + +/*! +* \brief Creates an operation that calculates matrix multiplication in batch. 
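+* For example (shapes chosen only for illustration): x of shape [4, 32, 64] with y of shape [4, 16, 64] yields a [4, 32, 16] result; note that y is taken in the transposed [batch, N, K] form.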
+* +* \param x Tensor with shape [batch, M, K] +* \param y Tensor with shape [batch, N, K] +* +* \return Tensor with shape [batch, M, N] +*/ +inline tvm::Tensor batch_matmul(const tvm::Tensor& x, + const tvm::Tensor& y) { + CHECK_EQ(x->shape.size(), 3) << "batch_matmul requires 3-D data"; + CHECK_EQ(y->shape.size(), 3) << "batch_matmul requires 3-D data"; + + auto batch = x->shape[0]; + auto M = x->shape[1]; + auto K = x->shape[2]; + auto N = y->shape[1]; + + auto k = tvm::reduce_axis(Range(0, K), "k"); + auto result = tvm::compute( + { batch, M, N }, + [&](Var b, Var i, Var j) { + return tvm::sum(x(b, i, k) * y(b, j, k), { k }); + }, "tensor", "batch_matmul"); + + return result; +} + +} // namespace nn +} // namespace topi + +#endif // TOPI_NN_BATCH_MATMUL_H_ diff --git a/topi/include/topi/nn/l2_normalize.h b/topi/include/topi/nn/l2_normalize.h index 6d98a75ec157..4f9bdb61ab70 100644 --- a/topi/include/topi/nn/l2_normalize.h +++ b/topi/include/topi/nn/l2_normalize.h @@ -30,7 +30,12 @@ inline Tensor l2_normalize(const Tensor& data, const Array& axis, std::string name = "tensor", std::string tag = "l2_normalize") { - CHECK_EQ(data->shape.size(), 4) << "L2 normalization requires 4-D input"; + for (size_t i = 0; i < axis.size(); ++i) { + int ax = topi::detail::GetConstInt(axis[i]); + CHECK_LT(ax, data->shape.size()) << + "Axis " << ax << " exceeds input data dim " << + data->shape.size(); + } auto input_shape = data->shape; Tensor dot_value = topi::power(data, static_cast(2.0)); Tensor sum_value = topi::sum(dot_value, axis, true); @@ -39,7 +44,7 @@ inline Tensor l2_normalize(const Tensor& data, topi::sqrt(tvm::compute(expand_sum->shape, [&](const Array& i){ return (max(expand_sum(i), eps)); - }, name = name, tag = tag))); + }, name, tag))); } } // namespace nn } // namespace topi diff --git a/topi/include/topi/rocm/vision.h b/topi/include/topi/rocm/vision.h deleted file mode 100644 index 4178a180deb4..000000000000 --- a/topi/include/topi/rocm/vision.h +++ /dev/null @@ -1,33 +0,0 @@ -/*! -* Copyright (c) 2018 by Contributors -* \file rocm/vision.h -* \brief rocm schedule for region operation -*/ -#ifndef TOPI_ROCM_VISION_H_ -#define TOPI_ROCM_VISION_H_ - -#include "tvm/tvm.h" -#include "tvm/build_module.h" -#include "topi/tags.h" -#include "topi/detail/array_utils.h" -#include "topi/contrib/rocblas.h" -#include "topi/generic/extern.h" -#include "topi/cuda/vision.h" - -namespace topi { -using namespace tvm; -namespace rocm { -/*! -* \brief Create a rocm schedule for region -* -* \param target The target to generate a schedule for. -* \param outs The output tensors. -* -* \return A schedule for the given ops. -*/ -inline Schedule schedule_region(const Target &target, const Array& outs) { - return topi::cuda::schedule_region(target, outs); -} -} // namespace rocm -} // namespace topi -#endif // TOPI_ROCM_VISION_H_ diff --git a/topi/include/topi/transform.h b/topi/include/topi/transform.h index 04759a582a67..06327dac69f4 100644 --- a/topi/include/topi/transform.h +++ b/topi/include/topi/transform.h @@ -16,6 +16,7 @@ #include "topi/detail/ravel_unravel.h" #include "topi/detail/constant_utils.h" #include "tvm/tvm.h" +#include "tvm/data_layout.h" namespace topi { using namespace tvm; @@ -322,6 +323,56 @@ inline Tensor concatenate(const Array& inputs, }, name, tag); } +/*! +* \brief Join a sequence of tensors along a new axis. 
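+* All inputs must share the same shape; the result gains one extra dimension of size inputs.size() at position \p axis (cf. numpy.stack).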
+* +* \param inputs The input tensors +* \param axis The axis along which the tensors will be stacked +* \param name The name of the operation +* \param tag The tag to mark the operation +* +* \return A Tensor whose op member is the stack operation +*/ +inline Tensor stack(const Array& inputs, + int axis = 0, + std::string name = "tensor", + std::string tag = kInjective) { + int ndim = static_cast(inputs[0]->shape.size()); + CHECK(-ndim - 1 <= axis && axis <= ndim) + << "stack only accepts `axis` in [-ndim, ndim)" + << ", but got axis = " << axis + << ", and ndim = " << ndim; + if (axis < 0) { + axis += ndim + 1; + } + CHECK_LT(axis, inputs[0]->shape.size() + 1) << + "axis out of bounds"; + + const int stack_size = static_cast(inputs.size()); + Array out_shape; + for (size_t i = 0; i < static_cast(axis); ++i) + out_shape.push_back(inputs[0]->shape[i]); + out_shape.push_back(stack_size); + for (size_t i = static_cast(axis); i < static_cast(ndim); ++i) + out_shape.push_back(inputs[0]->shape[i]); + + return compute( + out_shape, [&](const Array& indices) { + Array idx; + for (size_t i = 0; i < indices.size(); ++i) + if (i != static_cast(axis)) + idx.push_back(indices[i]); + auto ind = indices[axis]; + auto ret = inputs[0](idx); + for (int i = 0; i < static_cast(inputs.size() - 1); ++i) { + ret = tvm::if_then_else(ind == i + 1, + inputs[i + 1](idx), + ret); + } + return ret; + }, name, tag); +} + /*! * \brief Split a tensor into multiple sub-tensors * @@ -668,6 +719,115 @@ inline Tensor where(const Tensor& condition, return out; } +/*! +* \brief Creates an operation to repeat elements of an array +* +* \param x The input tensor +* \param repeats The number of repetitions for each element +* \param axis The axis along which to repeat values (allows +* negative indices as offsets from the last dimension) +* \param name The name of the operation +* \param tag The tag to mark the operation +* +* \return A Tensor whose op member is the repeat operation +*/ +inline Tensor repeat(const Tensor& x, + int repeats, + int axis, + std::string name = "tensor", + std::string tag = kBroadcast) { + int ndim = static_cast(x->shape.size()); + CHECK(-ndim - 1 <= axis && axis <= ndim) + << "repeat only accepts `axis` in [-data.ndim - 1, data.ndim]" + << ", but got axis = " << axis + << ", and data.ndim = " << ndim; + CHECK(repeats >= 1) + << "repeat only accepts `repeats >= 1`" + << ", but got repeats = " << repeats; + if (axis < 0) { + // Calculate offset from last dimension + axis += ndim; + } + Array new_shape; + for (size_t i = 0; i < static_cast(axis); ++i) { + new_shape.push_back(x->shape[i]); + } + new_shape.push_back(repeats * x->shape[axis]); + for (size_t i = axis + 1; i < x->shape.size(); ++i) { + new_shape.push_back(x->shape[i]); + } + + return compute( + new_shape, [&](const Array& indices) { + Array idx; + for (size_t i = 0; i < static_cast(axis); ++i) { + idx.push_back(indices[i]); + } + idx.push_back(indices[axis] / repeats); + for (size_t i = axis + 1; i < indices.size(); ++i) { + idx.push_back(indices[i]); + } + return x(idx); + }, name, tag); +} + +/*! 
+* \brief Creates an operation to tile elements of an array +* +* \param x The input tensor +* \param reps The number of times for repeating the tensor +* \param name The name of the operation +* \param tag The tag to mark the operation +* +* \return A Tensor whose op member is the tile operation +*/ +inline Tensor tile(const Tensor& x, + Array reps, + std::string name = "tensor", + std::string tag = kBroadcast) { + size_t ndim = x->shape.size(); + size_t rdim = reps.size(); + size_t tdim = (ndim > rdim) ? ndim : rdim; + Array data_shape; + Array reps_shape; + Array new_shape; + if (ndim == rdim) { + for (size_t i = 0; i < ndim; ++i) { + data_shape.push_back(x->shape[i]); + reps_shape.push_back(reps[i]); + } + } else if (ndim > rdim) { + for (size_t i = 0; i < ndim; ++i) + data_shape.push_back(x->shape[i]); + for (size_t i = 0; i < (ndim - rdim); ++i) + reps_shape.push_back(1); + for (size_t i = 0; i < rdim; ++i) + reps_shape.push_back(reps[i]); + } else { + for (size_t i = 0; i < (rdim - ndim); ++i) + data_shape.push_back(1); + for (size_t i = 0; i < ndim; ++i) + data_shape.push_back(x->shape[i]); + for (size_t i = 0; i < rdim; ++i) + reps_shape.push_back(reps[i]); + } + for (size_t i = 0; i < tdim; ++i) + new_shape.push_back(data_shape[i] * reps_shape[i]); + + return compute( + new_shape, [&](const Array& indices) { + Array idx; + if (ndim >= rdim) { + for (size_t i = 0; i < ndim; ++i) + idx.push_back(indices[i] % x->shape[i]); + } else { + for (size_t i = 0; i < ndim; ++i) + idx.push_back(indices[rdim - ndim + i] % x->shape[i]); + } + return x(idx); + }, name, tag); +} + /*! * \brief Gather elements from a n-dimension array. * @@ -868,6 +1028,57 @@ inline Tensor tensordot(const Tensor& A, return compute(output_shape, func, name, tag); } +inline Tensor arange(const Expr start, + const Expr stop, + const Expr step, + Type dtype, + std::string name = "tensor", + std::string tag = kInjective) { + Expr num_elem = tvm::cast(tvm::Int(32), tvm::ceil( + tvm::cast(tvm::Float(32), stop - start) / step)); + Array shape; + return compute({num_elem}, [&](const Array& indices) { + return tvm::cast(dtype, start + step * indices[0]); + }, name, tag); +} + +/*! + * \brief Transform the layout according to \p src_layout and \p dst_layout + * \param src the source input. + * \param src_layout the source layout. + * \param dst_layout the destination layout. + * \param name output tensor name. + * \param tag output tensor tag. 
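+ * (both layouts must describe the same set of axes, e.g. "NCHW" to "NCHW16c"; otherwise the bijective-layout check below fails)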
+ * \return A tensor with shape in \p dst_layout + */ +inline Tensor layout_transform(const Tensor& src, + const std::string& src_layout, + const std::string& dst_layout, + const std::string name = "layout_transform", + const std::string tag = kInjective) { + Layout src_layout_struct = LayoutNode::make(src_layout); + Layout dst_layout_struct = LayoutNode::make(dst_layout); + + if (src_layout_struct.Equals(dst_layout_struct)) { + return src; + } + + CHECK(src_layout_struct.defined() && dst_layout_struct.defined()) + << "cannot convert from/to undefined layout"; + + auto layout_converter = BijectiveLayoutNode::make(src_layout_struct, dst_layout_struct); + CHECK(layout_converter.defined()) + << "cannot convert from " << src_layout << " to " << dst_layout; + + Array dst_shape = layout_converter.ForwardShape(src->shape); + + return compute( + dst_shape, [&](const Array& dst_indices) { + Array dst_indices_expr(dst_indices.begin(), dst_indices.end()); + Array src_indices = layout_converter.BackwardIndex(dst_indices_expr); + return src(src_indices); + }, name, tag); +} } // namespace topi #endif // TOPI_TRANSFORM_H_ diff --git a/topi/include/topi/vision/yolo/region.h b/topi/include/topi/vision/yolo/region.h deleted file mode 100644 index 7d303f445ac4..000000000000 --- a/topi/include/topi/vision/yolo/region.h +++ /dev/null @@ -1,81 +0,0 @@ -/*! - * Copyright (c) 2018 by Contributors - * \brief Region op constructions - * \file vision/yolo/region.h - */ -#ifndef TOPI_VISION_YOLO_REGION_H_ -#define TOPI_VISION_YOLO_REGION_H_ - -#include -#include - -#include "topi/detail/constant_utils.h" -#include "topi/reduction.h" -#include "topi/tags.h" -#include "topi/transform.h" -#include "topi/nn/softmax.h" -#include "tvm/tvm.h" - - -namespace topi { -namespace vision { -namespace yolo { -using namespace tvm; -using namespace nn; - -/*! -* \brief region operation -* -* \param data The input tensor. 
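For intuition about what the layout converter computes, here is a hand-rolled NCHW to NCHW16c mapping. The real operator derives both `ForwardShape` and `BackwardIndex` from the layout strings via `BijectiveLayoutNode`, so this is only an illustration of one concrete bijection:

```python
import numpy as np

def nchw_to_nchw16c(x):
    """One concrete instance of the layout bijection: NCHW -> NCHW16c."""
    n, c, h, w = x.shape
    assert c % 16 == 0, "the packed layout needs the channel count divisible by 16"
    out = np.empty((n, c // 16, h, w, 16), dtype=x.dtype)
    for idx in np.ndindex(*out.shape):
        nn, co, hh, ww, ci = idx
        out[idx] = x[nn, co * 16 + ci, hh, ww]  # the backward index of this pair
    return out

x = np.random.rand(1, 32, 4, 4).astype("float32")
ref = x.reshape(1, 2, 16, 4, 4).transpose(0, 1, 3, 4, 2)
assert (nchw_to_nchw16c(x) == ref).all()
```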
Can be any dimension -* \param num Darknet layer parameter n -* \param classes number of classes in the yolo model -* \param coords Darknet layer parameter coords -* \param background Darknet layer parameter background -* \param l_softmax if true apply softmax -* \param name The name of the operation -* \param tag The tag to mark the operation -* -* \return A Tensor whose op member is the region operation -*/ -inline Tensor region(const Tensor &data, - int num, - int classes, - int coords, - int background, - int l_softmax, - std::string name = "tensor", - std::string tag = "region_output") { - auto input_shape = data->shape; - int split_size = classes + coords + 1; - Array intermediate_shape = {input_shape[0], - num, - split_size, - input_shape[2], - input_shape[3]}; - auto data_block = reshape(data, intermediate_shape); - Array split_indices; - for (int i = 1; i < split_size; ++i) { - split_indices.push_back(i); - } - Array split_res = split(data_block, split_indices, 2); - split_res.Set(0, sigmoid(split_res[0])); - split_res.Set(1, sigmoid(split_res[1])); - if (!background) { - split_res.Set(coords, sigmoid(split_res[coords])); - } - - if (l_softmax) { - int offset = coords + static_cast(!background); - Array softmax_input(split_res.begin() + offset, split_res.end()); - auto softmax_output = softmax(concatenate(softmax_input, 2), 2); - Array data_block_1(split_res.begin(), split_res.begin() + offset); - data_block_1.push_back(softmax_output); - split_res = data_block_1; - } - Tensor out = concatenate(split_res, 2); - return reshape(out, input_shape); -} -} // namespace yolo -} // namespace vision -} // namespace topi -#endif // TOPI_VISION_YOLO_REGION_H_ diff --git a/topi/python/topi/arm_cpu/bitserial_conv2d.py b/topi/python/topi/arm_cpu/bitserial_conv2d.py index cd6810af8177..ffef3ce81b98 100644 --- a/topi/python/topi/arm_cpu/bitserial_conv2d.py +++ b/topi/python/topi/arm_cpu/bitserial_conv2d.py @@ -61,7 +61,7 @@ def _declaration_bitserial_conv2d(data, kernel, stride, padding, activation_bits if out_dtype is None: out_dtype = data.dtype assert data.shape[0].value == 1, "only support batch size=1 convolution on rasp" - assert layout == "NCHW" or layout == "NHWC", "only support layouts NCHW and NHWC" + assert layout in ("NCHW", "NHWC"), "only support layouts NCHW and NHWC" if dorefa: assert layout == "NCHW", "Cannot support dorea with NHWC layout yet" wkl = _get_workload(data, kernel, stride, padding, out_dtype, layout) diff --git a/topi/python/topi/arm_cpu/conv2d.py b/topi/python/topi/arm_cpu/conv2d.py index e402d808096a..fe77762b3ce9 100644 --- a/topi/python/topi/arm_cpu/conv2d.py +++ b/topi/python/topi/arm_cpu/conv2d.py @@ -562,7 +562,7 @@ def _alter_conv2d_layout_arm(attrs, inputs, tinfos, F): data_layout_key = "data_layout" if "data_layout" in new_attrs else "layout" layout = attrs[data_layout_key] out_dtype = attrs["out_dtype"] - if out_dtype == "" or out_dtype == "same": + if out_dtype in ("same", ""): out_dtype = tinfos[0].dtype if layout != 'NCHW': diff --git a/topi/python/topi/cuda/__init__.py b/topi/python/topi/cuda/__init__.py index 91c2235fcf70..ba577cd944f0 100644 --- a/topi/python/topi/cuda/__init__.py +++ b/topi/python/topi/cuda/__init__.py @@ -14,6 +14,7 @@ from .pooling import schedule_pool, schedule_global_pool from .extern import schedule_extern from .nn import schedule_lrn, schedule_l2_normalize +from .batch_matmul import schedule_batch_matmul from .vision import * from . 
import ssd from .ssd import * diff --git a/topi/python/topi/cuda/batch_matmul.py b/topi/python/topi/cuda/batch_matmul.py new file mode 100644 index 000000000000..a1fa256028da --- /dev/null +++ b/topi/python/topi/cuda/batch_matmul.py @@ -0,0 +1,89 @@ +# pylint: disable=invalid-name,too-many-locals,unused-variable +"""cuda batch_matmul operators""" +from __future__ import absolute_import as _abs +import tvm + +from .. import generic +from ..util import traverse_inline, get_const_tuple, get_max_power2_factor + + +@generic.schedule_batch_matmul.register(["cuda", "gpu"]) +def schedule_batch_matmul(outs): + """Schedule for batch_matmul + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of batch_matmul + in the format of an array of tensors. + + Returns + ------- + s: Schedule + The computation schedule for the op. + """ + s = tvm.create_schedule([x.op for x in outs]) + + def _schedule(op): + C = op.output(0) + A, B = s[C].op.input_tensors + _, M, N = get_const_tuple(C.shape) + AA = s.cache_read(A, "shared", [C]) + AL = s.cache_read(AA, "local", [C]) + BB = s.cache_read(B, "shared", [C]) + BL = s.cache_read(BB, "local", [C]) + CC = s.cache_write(C, "local") + + b, y, x = s[C].op.axis + y_bn = get_max_power2_factor(M, 64) + x_bn = get_max_power2_factor(N, 64) + by, y = s[C].split(y, y_bn) + bx, x = s[C].split(x, x_bn) + y_nthreads = min(y_bn, 8) + x_nthreads = min(x_bn, 8) + ty, yi = s[C].split(y, nparts=y_nthreads) + tx, xi = s[C].split(x, nparts=x_nthreads) + thread_x = tvm.thread_axis((0, x_nthreads), "threadIdx.x") + thread_y = tvm.thread_axis((0, y_nthreads), "threadIdx.y") + + s[C].reorder(b, by, bx, ty, tx, yi, xi) + s[C].bind(b, tvm.thread_axis("blockIdx.z")) + s[C].bind(by, tvm.thread_axis("blockIdx.y")) + s[C].bind(bx, tvm.thread_axis("blockIdx.x")) + s[C].bind(ty, thread_y) + s[C].bind(tx, thread_x) + s[C].pragma(yi, "auto_unroll_max_step", 16) + + s[CC].compute_at(s[C], tx) + _, yi, xi = s[CC].op.axis + k, = s[CC].op.reduce_axis + ko, ki = s[CC].split(k, 8) + s[CC].reorder(ko, ki, yi, xi) + s[CC].pragma(ki, "auto_unroll_max_step", 16) + + s[AA].compute_at(s[CC], ko) + s[AL].compute_at(s[CC], ki) + s[BB].compute_at(s[CC], ko) + s[BL].compute_at(s[CC], ki) + _, y, k = s[AA].op.axis + ty, yi = s[AA].split(y, nparts=y_nthreads) + tx, ki = s[AA].split(k, nparts=x_nthreads) + s[AA].reorder(ty, tx, yi, ki) + s[AA].bind(ty, thread_y) + s[AA].bind(tx, thread_x) + s[AA].pragma(yi, "auto_unroll_max_step", 16) + + _, x, k = s[BB].op.axis + ty, xi = s[BB].split(x, nparts=y_nthreads) + tx, ki = s[BB].split(k, nparts=x_nthreads) + s[BB].bind(ty, thread_y) + s[BB].bind(tx, thread_x) + s[BB].reorder(ty, tx, xi, ki) + s[BB].pragma(xi, "auto_unroll_max_step", 16) + + def _callback(op): + if "batch_matmul" in op.tag: + _schedule(op) + + traverse_inline(s, outs[0].op, _callback) + return s diff --git a/topi/python/topi/cuda/conv2d.py b/topi/python/topi/cuda/conv2d.py index 2b0f59ab8510..ca456134a6ce 100644 --- a/topi/python/topi/cuda/conv2d.py +++ b/topi/python/topi/cuda/conv2d.py @@ -92,10 +92,9 @@ def conv2d_cuda(cfg, data, kernel, strides, padding, dilation, layout='NCHW', ou if layout == 'NCHW': return nn.conv2d_nchw(data, kernel, strides, padding, dilation, out_dtype) - elif layout == 'HWCN': + if layout == 'HWCN': return nn.conv2d_hwcn(data, kernel, strides, padding, dilation, out_dtype) - else: - raise ValueError("not support this layout {} yet".format(layout)) + raise ValueError("not support this layout {} yet".format(layout)) 
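Returning to the new CUDA `batch_matmul` schedule above: a usage sketch under the TVM API of this patch (assumes a CUDA-enabled build; shapes are arbitrary):

```python
import numpy as np
import tvm
import topi

batch, M, N, K = 4, 64, 64, 32
x = tvm.placeholder((batch, M, K), name="x")
y = tvm.placeholder((batch, N, K), name="y")

with tvm.target.create("cuda"):
    out = topi.nn.batch_matmul(x, y)
    s = topi.generic.schedule_batch_matmul([out])

f = tvm.build(s, [x, y, out], "cuda")
ctx = tvm.gpu(0)
a = tvm.nd.array(np.random.uniform(size=(batch, M, K)).astype("float32"), ctx)
b = tvm.nd.array(np.random.uniform(size=(batch, N, K)).astype("float32"), ctx)
c = tvm.nd.array(np.zeros((batch, M, N), dtype="float32"), ctx)
f(a, b, c)
# compare against the numpy reference this patch adds under topi.testing
np.testing.assert_allclose(c.asnumpy(),
                           topi.testing.batch_matmul(a.asnumpy(), b.asnumpy()),
                           rtol=1e-5)
```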
@autotvm.register_topi_schedule(generic.schedule_conv2d_nchw, ["cuda", "gpu"], diff --git a/topi/python/topi/cuda/conv2d_winograd.py b/topi/python/topi/cuda/conv2d_winograd.py index 2f2d0deab69d..a8d961ae062e 100644 --- a/topi/python/topi/cuda/conv2d_winograd.py +++ b/topi/python/topi/cuda/conv2d_winograd.py @@ -370,7 +370,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, F): data_layout_key = "data_layout" if "data_layout" in new_attrs else "layout" layout = attrs[data_layout_key] out_dtype = attrs["out_dtype"] - if out_dtype == "" or out_dtype == "same": + if out_dtype in ("", "same"): out_dtype = tinfos[0].dtype data, kernel = tinfos[0:2] @@ -436,7 +436,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, F): ) dispatch_ctx.update(target, new_workload, cfg) return F.nn.contrib_conv2d_winograd_without_weight_transform(*copy_inputs, **new_attrs) - elif groups != CI: + if groups != CI: workload = autotvm.task.args_to_workload( [tinfos[0], tinfos[1], strides, padding, dilation, groups, out_dtype], group_conv2d_nchw) diff --git a/topi/python/topi/cuda/nms.py b/topi/python/topi/cuda/nms.py index 3cdc02e58aec..5f79de25e835 100644 --- a/topi/python/topi/cuda/nms.py +++ b/topi/python/topi/cuda/nms.py @@ -1,10 +1,10 @@ -# pylint: disable=invalid-name, no-member, too-many-locals, too-many-arguments, too-many-statements, singleton-comparison +# pylint: disable=invalid-name, no-member, too-many-locals, too-many-arguments, too-many-statements, singleton-comparison, unused-argument """Non-maximum suppression operator""" import math import tvm from tvm import api -from topi.vision import nms +from topi.vision import non_max_suppression from ..util import get_const_tuple def sort_ir(data, index, output): @@ -35,7 +35,7 @@ def sort_ir(data, index, output): p_index = ib.buffer_ptr(index) p_out = ib.buffer_ptr(output) nthread_tx = max_threads - nthread_bx = (num_anchors + 1) // 2 // max_threads + 1 + nthread_bx = num_anchors // max_threads + 1 tx = tvm.thread_axis("threadIdx.x") bx = tvm.thread_axis("vthread") ib.scope_attr(tx, "thread_extent", nthread_tx) @@ -46,10 +46,8 @@ def sort_ir(data, index, output): with ib.for_range(0, batch, for_type="unroll") as b: start = b * num_anchors - for i in range(2): - bbox_id = tid * 2 + i - with ib.if_scope(bbox_id < num_anchors): - p_out[start + bbox_id] = bbox_id + with ib.if_scope(tid < num_anchors): + p_out[start + tid] = tid # OddEvenTransposeSort with ib.for_range(0, p_index[b]) as k: with ib.if_scope(tid < (p_index[b] + 1) // 2): @@ -183,13 +181,14 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx): return body -@nms.register(["cuda", "gpu"]) -def nms_gpu(data, valid_count, nms_threshold=0.5, force_suppress=False, nms_topk=-1): +@non_max_suppression.register(["cuda", "gpu"]) +def nms_gpu(data, valid_count, return_indices, iou_threshold=0.5, force_suppress=False, + topk=-1, id_index=0, invalid_to_bottom=False): """Non-maximum suppression operator for object detection. Parameters ---------- - data: tvm.Tensor + data : tvm.Tensor 3-D tensor with shape [batch_size, num_anchors, 6]. The last dimension should be in format of [class_id, score, box_left, box_top, box_right, box_bottom]. @@ -197,15 +196,24 @@ def nms_gpu(data, valid_count, nms_threshold=0.5, force_suppress=False, nms_topk valid_count : tvm.Tensor 1-D tensor for valid number of boxes. - nms_threshold : float + return_indices : boolean + Whether to return box indices in input data. + + iou_threshold : optional, float Non-maximum suppression threshold. 
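On the `OddEvenTransposeSort` loop in `sort_ir`: each thread owns one compare-exchange per pass, which is why `(p_index[b] + 1) // 2` workers suffice. A serial Python emulation of the pass structure, sorting indices by descending score as nms expects:

```python
def odd_even_transpose_sort(vals):
    """Serial emulation of the parallel odd-even transposition sort."""
    n = len(vals)
    idx = list(range(n))
    for k in range(n):                  # n passes guarantee a sorted result
        for tid in range((n + 1) // 2):
            i = 2 * tid + (k % 2)       # even passes: (0,1),(2,3)...; odd: (1,2),(3,4)...
            if i + 1 < n and vals[idx[i]] < vals[idx[i + 1]]:
                idx[i], idx[i + 1] = idx[i + 1], idx[i]
    return idx

scores = [0.2, 0.9, 0.4, 0.7]
assert [scores[i] for i in odd_even_transpose_sort(scores)] == [0.9, 0.7, 0.4, 0.2]
```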
- force_suppress : boolean + force_suppress : optional, boolean Whether to suppress all detections regardless of class_id. - nms_topk : int + topk : optional, int Keep maximum top k detections before nms, -1 for no limit. + id_index : optional, int + index of the class categories, -1 to disable. + + invalid_to_bottom : optional, boolean + Whether to move all valid bounding boxes to the top. + Returns ------- out : tvm.Tensor @@ -218,14 +226,13 @@ def nms_gpu(data, valid_count, nms_threshold=0.5, force_suppress=False, nms_topk # An example to use nms dshape = (1, 5, 6) data = tvm.placeholder(dshape, name="data") - valid_count = tvm.placeholder( - (dshape[0],), dtype="int32", name="valid_count") - nms_threshold = 0.7 + valid_count = tvm.placeholder((dshape[0],), dtype="int32", name="valid_count") + iou_threshold = 0.7 force_suppress = True - nms_topk = -1 - out = nms(data, valid_count, nms_threshold, force_suppress, nms_topk) - np_data = np.random.uniform(size=dshape).astype("float32") - np_valid_count = np.array([4]).astype("int32") + topk = -1 + out = nms(data, valid_count, iou_threshold, force_suppress, topk) + np_data = np.random.uniform(dshape) + np_valid_count = np.array([4]) s = topi.generic.schedule_nms(out) f = tvm.build(s, [data, valid_count, out], "llvm") ctx = tvm.cpu() @@ -265,8 +272,8 @@ def nms_gpu(data, valid_count, nms_threshold=0.5, force_suppress=False, nms_topk tvm.extern(data.shape, [data, sort_tensor, valid_count], lambda ins, outs: nms_ir( - ins[0], ins[1], ins[2], outs[0], nms_threshold, - force_suppress, nms_topk), + ins[0], ins[1], ins[2], outs[0], iou_threshold, + force_suppress, topk), dtype="float32", in_buffers=[data_buf, sort_tensor_buf, valid_count_buf], tag="nms") diff --git a/topi/python/topi/cuda/reduction.py b/topi/python/topi/cuda/reduction.py index 4c5d1a507660..e8c029d4a871 100644 --- a/topi/python/topi/cuda/reduction.py +++ b/topi/python/topi/cuda/reduction.py @@ -96,7 +96,7 @@ def traverse_before_reduce(operator): """Internal travserse function""" if isinstance(operator, tvm.tensor.PlaceholderOp): return - elif tag.is_injective(operator.tag): + if tag.is_injective(operator.tag): sch[operator].compute_inline() for tensor in operator.input_tensors: if tensor.op not in scheduled_ops: diff --git a/topi/python/topi/cuda/ssd/multibox.py b/topi/python/topi/cuda/ssd/multibox.py index 746be092ebbe..11062824deb0 100644 --- a/topi/python/topi/cuda/ssd/multibox.py +++ b/topi/python/topi/cuda/ssd/multibox.py @@ -11,7 +11,7 @@ from topi.vision.ssd import multibox_prior from topi.vision.ssd import multibox_detection from topi.vision.ssd import multibox_transform_loc -from ..nms import nms +from ..nms import non_max_suppression def multibox_prior_ir(data, out, sizes, ratios, steps, offsets): @@ -437,6 +437,6 @@ def multibox_detection_gpu(cls_prob, loc_pred, anchor, clip=True, threshold=0.01 """ inter_out = multibox_transform_loc(cls_prob, loc_pred, anchor, clip, threshold, variances) - out = nms( + out = non_max_suppression( inter_out[0], inter_out[1], nms_threshold, force_suppress, nms_topk) return out diff --git a/topi/python/topi/cuda/vision.py b/topi/python/topi/cuda/vision.py index abcbdb50074e..e3bc0fb9d547 100644 --- a/topi/python/topi/cuda/vision.py +++ b/topi/python/topi/cuda/vision.py @@ -61,24 +61,6 @@ def schedule_reorg(outs): cpp_target = cpp.TEST_create_target(target.target_name) return cpp.cuda.schedule_injective(cpp_target, outs) -@generic.schedule_region.register(["cuda", "gpu"]) -def schedule_region(outs): - """Schedule for region operator. 
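One caveat on the docstring example above: `np.random.uniform(dshape)` passes the shape as `low`, and the `astype` calls were dropped. A variant closer to runnable form, written against the generic `non_max_suppression` signature introduced by this patch:

```python
import numpy as np
import tvm
import topi

dshape = (1, 5, 6)
data = tvm.placeholder(dshape, name="data")
valid_count = tvm.placeholder((dshape[0],), dtype="int32", name="valid_count")
out = topi.vision.non_max_suppression(data, valid_count, iou_threshold=0.7,
                                      force_suppress=True, top_k=-1,
                                      return_indices=False)
s = topi.generic.schedule_nms(out)
f = tvm.build(s, [data, valid_count, out], "llvm")
ctx = tvm.cpu()
np_data = np.random.uniform(size=dshape).astype("float32")
np_valid_count = np.array([4]).astype("int32")
tvm_data = tvm.nd.array(np_data, ctx)
tvm_valid_count = tvm.nd.array(np_valid_count, ctx)
tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data.dtype), ctx)
f(tvm_data, tvm_valid_count, tvm_out)
```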
- Parameters - ---------- - outs: Array of Tensor - The computation graph description of region - in the format of an array of tensors. - - Returns - ------- - s: Schedule - The computation schedule for region. - """ - target = tvm.target.current_target(allow_none=False) - cpp_target = cpp.TEST_create_target(target.target_name) - return cpp.cuda.schedule_region(cpp_target, outs) - @generic.schedule_nms.register(["cuda", "gpu"]) def schedule_nms(outs): """Schedule for non-maximum suppression @@ -180,3 +162,20 @@ def traverse(op): scheduled_ops.append(op) traverse(outs[0].op) return s + +@generic.schedule_get_valid_counts.register(["cuda", "gpu"]) +def schedule_get_valid_counts(outs): + """Schedule for get_valid_counts operator. + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of get_valid_counts + in the format of an array of tensors. + + Returns + ------- + s: Schedule + The computation schedule for the op. + """ + return _default_schedule(outs) diff --git a/topi/python/topi/generic/nn.py b/topi/python/topi/generic/nn.py index 8c303e5be182..00b742f24e64 100644 --- a/topi/python/topi/generic/nn.py +++ b/topi/python/topi/generic/nn.py @@ -410,3 +410,9 @@ def schedule_l2_normalize(outs): target = tvm.target.current_target(allow_none=False) cpp_target = cpp.TEST_create_target(target.target_name) return cpp.generic.default_schedule(cpp_target, outs, False) + +@tvm.target.generic_func +def schedule_batch_matmul(outs): + target = tvm.target.current_target(allow_none=False) + cpp_target = cpp.TEST_create_target(target.target_name) + return cpp.generic.default_schedule(cpp_target, outs, False) diff --git a/topi/python/topi/generic/vision.py b/topi/python/topi/generic/vision.py index 9a1e06aa30e8..bfd6c55d533a 100644 --- a/topi/python/topi/generic/vision.py +++ b/topi/python/topi/generic/vision.py @@ -17,23 +17,6 @@ def _default_schedule(outs, auto_inline): s[x].fuse(s[x].op.axis) return s -@tvm.target.generic_func -def schedule_shortcut(outs): - """Schedule for shortcut - - Parameters - ---------- - outs: Array of Tensor - The computation graph description of shortcut - in the format of an array of tensors. - - Returns - ------- - s: Schedule - The computation schedule for the op. - """ - return _default_schedule(outs, False) - @tvm.target.generic_func def schedule_reorg(outs): """Schedule for reorg @@ -54,13 +37,13 @@ def schedule_reorg(outs): return cpp.generic.default_schedule(cpp_target, outs, False) @tvm.target.generic_func -def schedule_region(outs): - """Schedule for region +def schedule_get_valid_counts(outs): + """Schedule for get_valid_counts Parameters ---------- outs: Array of Tensor - The computation graph description of region + The computation graph description of nms in the format of an array of tensors. Returns @@ -68,9 +51,7 @@ def schedule_region(outs): s: Schedule The computation schedule for the op. """ - target = tvm.target.current_target(allow_none=False) - cpp_target = cpp.TEST_create_target(target.target_name) - return cpp.generic.default_schedule(cpp_target, outs, False) + return _default_schedule(outs, False) @tvm.target.generic_func def schedule_nms(outs): diff --git a/topi/python/topi/intel_graphics/conv2d.py b/topi/python/topi/intel_graphics/conv2d.py index e5a4983b455e..554deaad35a3 100644 --- a/topi/python/topi/intel_graphics/conv2d.py +++ b/topi/python/topi/intel_graphics/conv2d.py @@ -3,7 +3,6 @@ from __future__ import absolute_import as _abs -import warnings import tvm from .. 
import generic @@ -40,10 +39,6 @@ def tile_and_bind3d(s, tensor, z, y, x, z_factor=2, y_factor=None, x_factor=None @conv2d_alter_layout.register(["intel_graphics"]) def _alter_conv2d_layout(attrs, inputs, tinfos, F): import nnvm.symbol as sym - if F != sym: - warnings.warn("Only support alter layout for intel graphics in NNVM now. " - "This pass is ignored in relay.") - return None copy_inputs = [s for s in inputs] @@ -51,8 +46,8 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, F): kernel = tinfos[1] import ast - padding = ast.literal_eval(attrs['padding']) - stride = ast.literal_eval(attrs['strides']) + padding = ast.literal_eval(str(attrs['padding'])) + stride = ast.literal_eval(str(attrs['strides'])) wkl = _get_workload(data, kernel, stride, padding, data.dtype) oc_bn = 1 @@ -69,7 +64,12 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, F): if "target" in new_attrs: del new_attrs["target"] - return sym.contrib.conv2d_NCHWc(*copy_inputs, **new_attrs) + if F == sym: + out = F.contrib.conv2d_NCHWc(*copy_inputs, **new_attrs) + else: + out = F.nn.contrib_conv2d_nchwc(*copy_inputs, **new_attrs) + + return out @conv2d_NCHWc.register(["intel_graphics"]) def _decl_conv2d(data, kernel, stride, padding, dilation, layout, out_layout, out_dtype='float32'): diff --git a/topi/python/topi/nn/__init__.py b/topi/python/topi/nn/__init__.py index cfb9e566279a..941fec91a6bd 100644 --- a/topi/python/topi/nn/__init__.py +++ b/topi/python/topi/nn/__init__.py @@ -17,3 +17,4 @@ from .local_response_norm import * from .bitserial_conv2d import * from .l2_normalize import * +from .batch_matmul import * diff --git a/topi/python/topi/nn/batch_matmul.py b/topi/python/topi/nn/batch_matmul.py new file mode 100644 index 000000000000..07e363868b05 --- /dev/null +++ b/topi/python/topi/nn/batch_matmul.py @@ -0,0 +1,35 @@ +"""Binary Neural Network (BNN) Operators""" +# pylint: disable=invalid-name +from __future__ import absolute_import as _abs +import tvm +from ..util import get_const_tuple + + +def batch_matmul(x, y): + """Computes batch matrix multiplication of `x` and `y` when `x` and `y` are + data in batch. 
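The `str(...)` wrapping added around the attribute values above matters because `ast.literal_eval` only accepts strings, while Relay hands this pass structured attribute objects rather than raw strings. A minimal sketch with a stand-in attribute class (hypothetical, for illustration only):

```python
import ast

class FakeAttr:
    """Stand-in for a non-str attribute object such as a Relay array attribute."""
    def __init__(self, value):
        self.value = value
    def __str__(self):
        return str(self.value)

padding = FakeAttr((1, 1))
try:
    ast.literal_eval(padding)           # rejected: not a string
except (TypeError, ValueError):
    pass
assert ast.literal_eval(str(padding)) == (1, 1)
```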
+ + Parameters + ---------- + x : tvm.Tensor + 3-D with shape [batch, M, K] + + y : tvm.TEnsor + 3-D with shape [batch, N, K] + + Returns + ------- + output : tvm.Tensor + 3-D with shape [batch, M, N] + """ + assert len(x.shape) == 3 and len(y.shape) == 3, "only support 3-dim batch_matmul" + x_shape = get_const_tuple(x.shape) + y_shape = get_const_tuple(y.shape) + assert x_shape[0] == y_shape[0], "batch dimension doesn't match" + assert x_shape[2] == y_shape[2], "shapes of x and y is inconsistant" + batch, M, K = x.shape + N = y.shape[1] + k = tvm.reduce_axis((0, K), name='k') + return tvm.compute((batch, M, N), + lambda b, i, j: tvm.sum(x[b, i, k] * y[b, j, k], axis=k), + tag='batch_matmul') diff --git a/topi/python/topi/nn/bitserial_conv2d.py b/topi/python/topi/nn/bitserial_conv2d.py index 545ad2f38ae5..d41a99a04a9d 100644 --- a/topi/python/topi/nn/bitserial_conv2d.py +++ b/topi/python/topi/nn/bitserial_conv2d.py @@ -92,14 +92,14 @@ def bitserial_conv2d(data, kernel, stride, padding, activation_bits, weight_bits if layout == 'NCHW': return spatial_pack_nchw(data, kernel, stride, padding, activation_bits, weight_bits, pack_dtype=pack_dtype, out_dtype=out_dtype, dorefa=dorefa) - elif layout == 'NHWC': + if layout == 'NHWC': return spatial_pack_nhwc(data, kernel, stride, padding, activation_bits, weight_bits, pack_dtype=pack_dtype, out_dtype=out_dtype, dorefa=dorefa) raise ValueError("not support this layout {} yet".format(layout)) def _get_workload(data, kernel, stride, padding, out_dtype, layout): """ Get the workload structure. """ - assert layout == "NCHW" or layout == "NHWC", \ + assert layout in ("NCHW", "NHWC"), \ "Only support layouts NCHW and NHWC" if layout == "NCHW": _, CI, IH, IW = [x.value for x in data.shape] diff --git a/topi/python/topi/nn/conv2d.py b/topi/python/topi/nn/conv2d.py index 977b80678524..559f132f19c2 100644 --- a/topi/python/topi/nn/conv2d.py +++ b/topi/python/topi/nn/conv2d.py @@ -48,12 +48,11 @@ def conv2d(input, filter, strides, padding, dilation, layout='NCHW', out_dtype=N # default declaration if layout == 'NCHW': return conv2d_nchw(input, filter, strides, padding, dilation, out_dtype) - elif layout == 'HWCN': + if layout == 'HWCN': return conv2d_hwcn(input, filter, strides, padding, dilation, out_dtype) - elif layout == 'NHWC': + if layout == 'NHWC': return conv2d_nhwc(input, filter, strides, padding, dilation, out_dtype) - else: - raise ValueError("not support this layout {} yet".format(layout)) + raise ValueError("not support this layout {} yet".format(layout)) @tvm.target.generic_func diff --git a/topi/python/topi/rocm/__init__.py b/topi/python/topi/rocm/__init__.py index 96a04794c680..9440b5c94bda 100644 --- a/topi/python/topi/rocm/__init__.py +++ b/topi/python/topi/rocm/__init__.py @@ -4,5 +4,4 @@ from .conv2d import * from .dense import * -from .vision import * from .nn import * diff --git a/topi/python/topi/rocm/vision.py b/topi/python/topi/rocm/vision.py deleted file mode 100644 index 84ae436e3531..000000000000 --- a/topi/python/topi/rocm/vision.py +++ /dev/null @@ -1,25 +0,0 @@ -# pylint: disable=invalid-name, unused-variable -"""Schedule for vision operator""" -from __future__ import absolute_import as _abs -import tvm -from .. import generic -from .. import cpp - -@generic.schedule_region.register(["rocm"]) -def schedule_region(outs): - """Schedule for region operator. - - Parameters - ---------- - outs: Array of Tensor - The computation graph description of region - in the format of an array of tensors. 
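The compute above contracts the last axis of both operands, i.e. `out[b, i, j] = sum_k x[b, i, k] * y[b, j, k]`; note the second operand is laid out `[batch, N, K]`, not `[batch, K, N]`. In NumPy terms:

```python
import numpy as np

x = np.random.rand(2, 3, 4).astype("float32")
y = np.random.rand(2, 5, 4).astype("float32")
ref = np.einsum("bik,bjk->bij", x, y)   # same contraction as the tvm.compute lambda
np.testing.assert_allclose(ref, x @ y.transpose(0, 2, 1), rtol=1e-5)
```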
- - Returns - ------- - s: Schedule - The computation schedule for region. - """ - target = tvm.target.current_target(allow_none=False) - cpp_target = cpp.TEST_create_target(target.target_name) - return cpp.rocm.schedule_region(cpp_target, outs) diff --git a/topi/python/topi/testing/__init__.py b/topi/python/topi/testing/__init__.py index 5ea9683f72ef..1743de13fd85 100644 --- a/topi/python/topi/testing/__init__.py +++ b/topi/python/topi/testing/__init__.py @@ -14,10 +14,10 @@ from .upsampling_python import upsampling_python from .bilinear_resize_python import bilinear_resize_python from .reorg_python import reorg_python -from .region_python import region_python from .roi_align_python import roi_align_nchw_python -from .shortcut_python import shortcut_python from .lrn_python import lrn_python from .l2_normalize_python import l2_normalize_python from .gather_nd_python import gather_nd_python from .strided_slice_python import strided_slice_python +from .batch_matmul import batch_matmul +from .slice_axis_python import slice_axis_python diff --git a/topi/python/topi/testing/batch_matmul.py b/topi/python/topi/testing/batch_matmul.py new file mode 100644 index 000000000000..a7b2f9344f29 --- /dev/null +++ b/topi/python/topi/testing/batch_matmul.py @@ -0,0 +1,26 @@ +# pylint: disable=invalid-name +"""Batch matmul in python""" +import numpy as np + +def batch_matmul(x, y): + """batch_matmul operator implemented in numpy. + + Parameters + ---------- + x : numpy.ndarray + 3-D with shape [batch, M, K] + + y : numpy.ndarray + 3-D with shape [batch, N, K] + + Returns + ------- + out : numpy.ndarray + 3-D with shape [batch, M, N] + """ + batch, M, _ = x.shape + N = y.shape[1] + out = np.zeros((batch, M, N)).astype(x.dtype) + for i in range(batch): + out[i] = np.dot(x[i], y[i].T) + return out diff --git a/topi/python/topi/testing/region_python.py b/topi/python/topi/testing/region_python.py deleted file mode 100644 index 3bab53892607..000000000000 --- a/topi/python/topi/testing/region_python.py +++ /dev/null @@ -1,69 +0,0 @@ -# pylint: disable=invalid-name, line-too-long, unused-variable, too-many-locals -"""Region in python""" -import numpy as np - -def entry_index(batch, w, h, outputs, classes, coords, location, entry): - n = int(location/(w*h)) - loc = location%(w*h) - return batch*outputs + n*w*h*(coords+classes+1) + entry*w*h + loc - -def region_python(a_np, N, classes, coords, background, softmax): - """Region operator - Parameters - ---------- - a_np : numpy.ndarray - 4-D with shape [batch, in_channel, in_height, in_width] - - N : int - Darknet layer parameter n - - classes : int - Darknet layer parameter classes - - coords : int - Darknet layer parameter coords - - background : int - Darknet layer parameter background - - softmax : int - Darknet layer parameter softmax - - Returns - ------- - b_np : np.ndarray - 4-D with shape [batch, out_channel, out_height, out_width] - """ - - batch, in_channel, in_height, in_width = a_np.shape - a_np_temp = np.reshape(a_np, batch*in_channel*in_height*in_width) - outputs = batch*in_channel*in_height*in_width - b_np = np.zeros(batch*in_channel*in_height*in_width) - for i in range(batch*in_channel*in_height*in_width): - b_np[i] = a_np_temp[i] - for b in range(batch): - for n in range(N): - index = entry_index(b, in_width, in_height, outputs, classes, coords, n*in_width*in_height, 0) - b_np[index: index+2*in_width*in_height] = 1/(1+np.exp(-1*b_np[index: index+2*in_width*in_height])) - index = entry_index(b, in_width, in_height, outputs, classes, coords, 
n*in_width*in_height, coords) - if not background: - b_np[index: index+in_width*in_height] = 1/(1+np.exp(-1*b_np[index: index+in_width*in_height])) - - b_np = np.reshape(b_np, (batch, in_channel, in_height, in_width)) - def local_softmax(data_in): - data_c, data_h, data_w = data_in.shape - largest = np.max(data_in, axis=1) - data_out = np.zeros((data_c, data_h, data_w)) - for i in range(data_h): - for j in range(data_w): - data_out[:, i, j] = np.exp(data_in[:, i, j] - largest[i, j]) - return data_out/data_out.sum(axis=0) - - if softmax: - index = coords + int(not background) - for b in range(batch): - for i in range(N): - b_np_index = int(i*(in_channel/N) + index) - b_np[b, b_np_index: b_np_index + classes+background, :, :] = local_softmax(b_np[b, b_np_index:b_np_index + classes+background, :, :]) - - return b_np diff --git a/topi/python/topi/testing/shortcut_python.py b/topi/python/topi/testing/shortcut_python.py deleted file mode 100644 index 575c28b61c2c..000000000000 --- a/topi/python/topi/testing/shortcut_python.py +++ /dev/null @@ -1,47 +0,0 @@ -# pylint: disable=invalid-name, line-too-long, unused-variable, too-many-locals -"""Shortcut in python""" -import numpy as np - -def shortcut_python(a_np1, a_np2): - """Reorg operator - - Parameters - ---------- - a_np1 : numpy.ndarray - 4-D with shape [batch1, in_channel1, in_height1, in_width1] - - a_np2 : numpy.ndarray - 4-D with shape [batch2, in_channel2, in_height2, in_width2] - - Returns - ------- - b_np : np.ndarray - 4-D with shape [batch1, out_channel1, out_height1, out_width1] - """ - - batch1, in_channel1, in_height1, in_width1 = a_np1.shape - batch2, in_channel2, in_height2, in_width2 = a_np2.shape - a_np1_temp = np.reshape(a_np1, batch1*in_channel1*in_height1*in_width1) - a_np2_temp = np.reshape(a_np2, batch2*in_channel2*in_height2*in_width2) - b_np = np.zeros(batch1*in_channel1*in_height1*in_width1) - stride = int(in_width1/in_width2) - sample = int(in_width2/in_width1) - if stride < 1: - stride = 1 - if sample < 1: - sample = 1 - minw = min(in_width1, in_width2) - minh = min(in_height1, in_height2) - minc = min(in_channel1, in_channel2) - - for i in range((batch1*in_channel1*in_height1*in_width1)): - b_np[i] = a_np1_temp[i] - for b in range(batch1): - for k in range(minc): - for j in range(minh): - for i in range(minw): - out_index = i*sample + in_width2*(j*sample + in_height2*(k + in_channel2*b)) - add_index = i*stride + in_width1*(j*stride + in_height1*(k + in_channel1*b)) - b_np[out_index] = a_np1_temp[out_index] + a_np2_temp[add_index] - b_np = np.reshape(b_np, (batch1, in_channel1, in_height1, in_width1)) - return b_np diff --git a/topi/python/topi/testing/slice_axis_python.py b/topi/python/topi/testing/slice_axis_python.py new file mode 100644 index 000000000000..589e5914a36c --- /dev/null +++ b/topi/python/topi/testing/slice_axis_python.py @@ -0,0 +1,34 @@ +"""Slice axis in python""" + +def slice_axis_python(data, axis, begin, end=None): + """Slice input array along specific axis. + + Parameters + ---------- + data : numpy.ndarray + The source array to be sliced. + + axis : int + Axis to be sliced. + + begin: int + The index to begin with in the slicing. + + end: int, optional + The index indicating end of the slice. + + Returns + ------- + ret : numpy.ndarray + The computed result. 
+ """ + dshape = data.shape + if axis < 0: + axis += len(dshape) + if begin < 0: + begin += dshape[axis] + if end <= 0: + end += dshape[axis] + slc = [slice(None)] * len(dshape) + slc[axis] = slice(begin, end) + return data[tuple(slc)] diff --git a/topi/python/topi/testing/upsampling_python.py b/topi/python/topi/testing/upsampling_python.py index fc4ad652f900..341dd8f6ceb0 100644 --- a/topi/python/topi/testing/upsampling_python.py +++ b/topi/python/topi/testing/upsampling_python.py @@ -17,12 +17,11 @@ def upsampling_python(data, scale, layout='NCHW'): for c in range(oshape[1]): output_np[b, c, :, :] = upsample_nearest(data[b, c, :, :], scale) return output_np - elif layout == 'NHWC': + if layout == 'NHWC': oshape = (ishape[0], ishape[1]*scale, ishape[1]*scale, ishape[3]) output_np = np.zeros(oshape, dtype=data.dtype) for b in range(oshape[0]): for c in range(oshape[3]): output_np[b, :, :, c] = upsample_nearest(data[b, :, :, c], scale) return output_np - else: - raise ValueError("not support this layout {} yet".format(layout)) + raise ValueError("not support this layout {} yet".format(layout)) diff --git a/topi/python/topi/transform.py b/topi/python/topi/transform.py index b9a7bd4f2992..063556852d26 100644 --- a/topi/python/topi/transform.py +++ b/topi/python/topi/transform.py @@ -191,6 +191,25 @@ def concatenate(a_tuple, axis=0): return cpp.concatenate(a_tuple, axis) +def stack(a, axis): + """Repeats the whole array multiple times. + + Parameters + ---------- + a : tvm.Tensor + The tensor to be stacked. + + axis : int, optional + The axis in the result array along which the input arrays are stacked. + + + Returns + ------- + ret : tvm.Tensor + """ + return cpp.stack(a, axis) + + def split(ary, indices_or_sections, axis=0): """Split an array into multiple sub-arrays. @@ -289,3 +308,88 @@ def tensordot(a, b, axes): if isinstance(axes[0], int): return cpp.tensordot(a, b, (axes[0],), (axes[1],)) return cpp.tensordot(a, b, axes[0], axes[1]) + + +def arange(start, stop=None, step=1, dtype="float32"): + """Creates a tensor with evenly spaced values within a given interval. + + Parameters + ---------- + start : tvm.Expr, optional + Start of interval. The interval includes this value. The default start + value is 0. + + stop : tvm.Expr + Stop of interval. The interval does not include this value. + + step : tvm.Expr, optional + Spacing between values. The default step size is 1. + + dtype : str, optional + The target data type. + + Returns + ------- + result : tvm.Tensor + The resulting tensor. + """ + if stop is None: + stop = start + start = 0 + return cpp.arange(start, stop, step, dtype) + + +def repeat(a, repeats, axis): + """Repeats elements of an array. + + Parameters + ---------- + a : tvm.Tensor + The tensor to be repeated. + + repeats: int, required + Number of repetitions for each element + + axis: int, optional + The axis along which to repeat values + + Returns + ------- + ret : tvm.Tensor + """ + return cpp.repeat(a, repeats, axis) + + +def tile(a, reps): + """Repeats the whole array multiple times. + + Parameters + ---------- + a : tvm.Tensor + The tensor to be tiled. + + reps: tuple of ints, required + The number of times for repeating the tensor + + Returns + ------- + ret : tvm.Tensor + """ + return cpp.tile(a, reps) + + +def layout_transform(array, src_layout, dst_layout): + """Transform the layout according to src_layout and dst_layout + + Parameters + ---------- + array : tvm.Tensor + The source array. + + src_layout : str + the source layout. 
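Two small checks on the wrappers above. `stack`'s summary line ("Repeats the whole array multiple times.") was copied from `tile`; `stack` actually joins tensors along a new axis. And `arange` materializes `ceil((stop - start) / step)` elements, matching the C++ kernel:

```python
import math
import numpy as np

a = np.zeros((2, 3))
# stack inserts a new axis of length len(inputs); it does not repeat data
assert np.stack([a, a], axis=1).shape == (2, 2, 3)

# arange produces ceil((stop - start) / step) elements
start, stop, step = 1.0, 10.0, 2.5
num_elem = int(math.ceil((stop - start) / step))
assert num_elem == len(np.arange(start, stop, step)) == 4
```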
+ + dst_layout : str + the destination layout. + """ + return cpp.layout_transform(array, src_layout, dst_layout) diff --git a/topi/python/topi/util.py b/topi/python/topi/util.py index 6d7326580f6d..d630628b4379 100644 --- a/topi/python/topi/util.py +++ b/topi/python/topi/util.py @@ -255,3 +255,29 @@ def select_array(i, j): return now return tvm.compute(matrix.shape, select_array, name=name) + + +def get_max_power2_factor(n, max_value=None): + """Get max factor of n in power of 2. If max_value is specificed, max factor + value will be no more max_value, + + Parameter + --------- + n : int + The input value + + max_value : int, optional + The max value for the factor + + Returns + ------- + factor : int + The max factor in power of 2. + """ + x = 1 + while n % 2 == 0: + if max_value is not None and max_value < x * 2: + break + x *= 2 + n /= 2 + return x diff --git a/topi/python/topi/vision/__init__.py b/topi/python/topi/vision/__init__.py index e3aa847972ac..c10f7c68bf36 100644 --- a/topi/python/topi/vision/__init__.py +++ b/topi/python/topi/vision/__init__.py @@ -2,8 +2,7 @@ """VISION network operators""" from __future__ import absolute_import as _abs -from . import yolo, ssd -from .shortcut import * +from . import ssd from .reorg import * from .nms import * from .rcnn import * diff --git a/topi/python/topi/vision/nms.py b/topi/python/topi/vision/nms.py index a41ee5b50089..169daea2d4d3 100644 --- a/topi/python/topi/vision/nms.py +++ b/topi/python/topi/vision/nms.py @@ -1,118 +1,247 @@ -# pylint: disable=invalid-name, no-member, too-many-locals, too-many-arguments +# pylint: disable=invalid-name, no-member, too-many-locals, too-many-arguments, undefined-variable, too-many-nested-blocks, too-many-branches, too-many-statements """Non-maximum suppression operator""" import tvm -from tvm import api +from tvm import api, hybrid -def nms_ir(data, sort_result, valid_count, out, nms_threshold, force_suppress, nms_topk): - """Low level IR routing for transform location in multibox_detection operator. +@hybrid.script +def hybrid_rearrange_out(data): + """Hybrid routine to rearrange nms output to + move all valid entries to top. Parameters ---------- - data: Buffer - Buffer of output boxes with class and score. + data : tvm.Tensor or numpy NDArray + NMS output. 3-D tensor with shape + [batch_size, num_anchors, 6]. - sort_result : Buffer - Buffer of output box indexes sorted by score. + Returns + ------- + output : tvm.Tensor or numpy NDArray + Transformed NMS output. 3-D tensor with shape + [batch_size, num_anchors, 6]. + """ + batch_size = data.shape[0] + num_anchors = data.shape[1] + elem_length = data.shape[2] + output = output_tensor((batch_size, + num_anchors, + elem_length), + data.dtype) - valid_count : Buffer - Buffer of number of valid output boxes. + for i in parallel(batch_size): + valid_idx = 0 + for j in range(num_anchors): + if data[i, j, 0] >= 0: + for k in range(elem_length): + output[i, valid_idx, k] = data[i, j, k] + valid_idx += 1 + if j >= valid_idx: + for k in range(elem_length): + output[i, j, k] = -1.0 + return output - out : Buffer - Output buffer. - nms_threshold : float - Non-maximum suppression threshold. +@hybrid.script +def hybrid_get_valid_counts(data, score_threshold): + """Hybrid routine to get valid count of bounding boxes + given a score threshold. Also moves valid boxes to the + top of input data. + + Parameters + ---------- + data : tvm.Tensor or numpy NDArray + Input data. 3-D tensor with shape [batch_size, num_anchors, 6]. 
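Sanity checks for `get_max_power2_factor` (used by the CUDA batch_matmul schedule to pick tile sizes). Note that `n /= 2` in the patch leaves `n` a float under Python 3; the loop still terminates correctly, but `n //= 2` is tidier, as in this sketch:

```python
def get_max_power2_factor(n, max_value=None):
    """Largest power-of-two factor of n, optionally capped at max_value."""
    x = 1
    while n % 2 == 0:
        if max_value is not None and max_value < x * 2:
            break
        x *= 2
        n //= 2  # integer division; equivalent in effect to the patch's n /= 2
    return x

assert get_max_power2_factor(48) == 16      # 48 = 16 * 3
assert get_max_power2_factor(64, 8) == 8    # capped by max_value
assert get_max_power2_factor(7) == 1        # odd input
```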
+ + score_threshold : tvm.const + Lower limit of score for valid bounding boxes. + + Returns + ------- + out_tensor : tvm.Tensor or numpy NDArray + Rearranged data tensor. + + valid_count : tvm.Tensor or numpy NDArray + 1-D tensor for valid number of boxes. + """ + batch_size = data.shape[0] + num_anchors = data.shape[1] + box_data_length = data.shape[2] + valid_count = output_tensor((batch_size,), "int32") + out_tensor = output_tensor((batch_size, + num_anchors, + box_data_length), + data.dtype) + for i in parallel(batch_size): + valid_count[i] = 0 + for j in range(num_anchors): + score = data[i, j, 1] + if score > score_threshold: + for k in range(box_data_length): + out_tensor[i, valid_count[i], k] = data[i, j, k] + valid_count[i] += 1 + if j >= valid_count[i]: + for k in range(box_data_length): + out_tensor[i, j, k] = -1.0 + return valid_count, out_tensor + +@tvm.target.generic_func +def get_valid_counts(data, score_threshold=0): + """Get valid count of bounding boxes given a score threshold. + Also moves valid boxes to the top of input data. + + Parameters + ---------- + data : tvm.Tensor + Input data. 3-D tensor with shape [batch_size, num_anchors, 6]. + + score_threshold : optional, float + Lower limit of score for valid bounding boxes. + + Returns + ------- + out_tensor : tvm.Tensor + Rearranged data tensor. + + valid_count : tvm.Tensor + 1-D tensor for valid number of boxes. + """ + score_threshold_const = tvm.const(score_threshold, "float") + return hybrid_get_valid_counts(data, score_threshold_const) + + +@hybrid.script +def hybrid_nms(data, sorted_index, valid_count, + max_output_size, iou_threshold, force_suppress, + top_k, id_index): + """Hybrid routing for non-maximum suppression. + + Parameters + ---------- + data: tvm.Tensor or numpy NDArray + Bounding boxes with class and score. 3-D tensor with shape + [batch_size, num_anchors, 6]. + + sorted_index : tvm.Tensor or numpy NDArray + Bounding box indexes sorted by score, with shape + [batch_size, num_anchors]. + + valid_count : tvm.Tensor or numpy NDArray + 1-D tensor for valid number of boxes. - force_suppress : boolean + max_output_size : tvm.const + Max number of output valid boxes for each instance. + By default all valid boxes are returned. + + iou_threshold : tvm.const + Overlapping(IoU) threshold to suppress object with smaller score. + + force_suppress : tvm.const Whether to suppress all detections regardless of class_id. - nms_topk : int + top_k : tvm.const Keep maximum top k detections before nms, -1 for no limit. + id_index : tvm.const + index of the class categories, -1 to disable. + Returns ------- - stmt : Stmt - The result IR statement. + output : tvm.Tensor + 3-D tensor with shape [batch_size, num_anchors, 6]. + + box_indices: tvm.Tensor + 2-D tensor with shape [batch_size, num_anchors]. """ - def calculate_overlap(out_tensor, box_a_idx, box_b_idx): - """Calculate overlap of two boxes. 
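The compaction contract of `hybrid_get_valid_counts`, restated in plain NumPy (scores sit in column 1; surviving rows are packed to the top and everything past the count becomes `-1`):

```python
import numpy as np

def get_valid_counts_ref(data, score_threshold=0.0):
    """Reference behaviour for hybrid_get_valid_counts."""
    batch = data.shape[0]
    out = np.full_like(data, -1.0)
    valid_count = np.zeros(batch, dtype="int32")
    for i in range(batch):
        for row in data[i]:
            if row[1] > score_threshold:
                out[i, valid_count[i]] = row
                valid_count[i] += 1
    return valid_count, out

data = np.array([[[0, 0.8, 0, 0, 1, 1],
                  [1, 0.1, 0, 0, 1, 1],
                  [0, 0.6, 0, 0, 1, 1]]], dtype="float32")
valid_count, out = get_valid_counts_ref(data, 0.5)
assert valid_count[0] == 2 and (out[0, 2] == -1.0).all()
```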
- """ - w = tvm.make.Max(0.0, tvm.make.Min(out_tensor[box_a_idx + 2], out_tensor[box_b_idx + 2]) - - tvm.make.Max(out_tensor[box_a_idx], out_tensor[box_b_idx])) - h = tvm.make.Max(0.0, tvm.make.Min(out_tensor[box_a_idx + 3], out_tensor[box_b_idx + 3]) - - tvm.make.Max(out_tensor[box_a_idx + 1], out_tensor[box_b_idx + 1])) - i = w * h - u = (out_tensor[box_a_idx + 2] - out_tensor[box_a_idx]) * \ - (out_tensor[box_a_idx + 3] - out_tensor[box_a_idx + 1]) + \ - (out_tensor[box_b_idx + 2] - out_tensor[box_b_idx]) * \ - (out_tensor[box_b_idx + 3] - out_tensor[box_b_idx + 1]) - i - return tvm.expr.Select(u <= 0.0, 0.0, i / u) - - ib = tvm.ir_builder.create() - p_data = ib.buffer_ptr(data) - p_sort_result = ib.buffer_ptr(sort_result) - p_valid_count = ib.buffer_ptr(valid_count) - p_out = ib.buffer_ptr(out) - batch_size = out.shape[0] - num_anchors = out.shape[1] - - nms_threshold_node = tvm.make.node("FloatImm", dtype="float32", value=nms_threshold) - nms_topk_node = tvm.make.node("IntImm", dtype="int32", value=nms_topk) - force_suppress_node = tvm.make.node("IntImm", dtype="int32", value=1 if force_suppress else 0) - with ib.for_range(0, batch_size, for_type="parallel", name="n") as n: - with ib.if_scope(tvm.all(nms_threshold_node > 0, nms_threshold_node < 1, - p_valid_count[0] > 0)): - # Reorder output - nkeep = tvm.if_then_else( - tvm.all(nms_topk_node > 0, nms_topk < p_valid_count[n]), - nms_topk, p_valid_count[n]) - with ib.for_range(0, nkeep, name="l") as l: - with ib.for_range(0, 6, name="m") as m: - p_out[(n * num_anchors * 6 - + l * 6 + m)] = p_data[(n * num_anchors * 6 - + p_sort_result[n * num_anchors + l] * 6 + m)] - with ib.if_scope(tvm.all(nms_topk_node > 0, nms_topk < p_valid_count[n])): - with ib.for_range(0, p_valid_count[n] - nkeep, name="l") as l: - with ib.for_range(0, 6, name="m") as m: - p_out[(n * num_anchors * 6 - + (l + nkeep) * 6 + m)] = p_data[(n * num_anchors * 6 - + (l + nkeep) * 6 + m)] + batch_size = data.shape[0] + num_anchors = data.shape[1] + box_data_length = data.shape[2] + box_indices = output_tensor((batch_size, num_anchors), "int32") + output = output_tensor((batch_size, + num_anchors, + box_data_length,), + data.dtype) + + for i in parallel(batch_size): + if iou_threshold > 0: + if valid_count[i] > 0: + # Reorder output + nkeep = valid_count[i] + if 0 < top_k < nkeep: + nkeep = top_k + for j in range(nkeep): + for k in range(box_data_length): + output[i, j, k] = data[i, sorted_index[i, j], k] + box_indices[i, j] = sorted_index[i, j] + if 0 < top_k < valid_count[i]: + for j in range(valid_count[i] - nkeep): + for k in range(box_data_length): + output[i, j + nkeep, k] = -1.0 + box_indices[i, j + nkeep] = -1 # Apply nms - with ib.for_range(0, p_valid_count[n], name="l") as l: - offset_l = l * 6 - with ib.if_scope(p_out[n * num_anchors * 6 + offset_l] >= 0): - with ib.for_range(0, p_valid_count[n], name="m") as m: - offset_m = m * 6 - with ib.if_scope(tvm.all(m > l, p_out[n * num_anchors * 6 - + offset_m] >= 0)): - with ib.if_scope(tvm.any(force_suppress_node > 0, - p_out[n * num_anchors * 6 + offset_l] == - p_out[n * num_anchors * 6 + offset_m])): - # When force_suppress == True or class_id equals - iou = calculate_overlap(p_out, n * num_anchors * 6 + offset_l + 2, - n * num_anchors * 6 + offset_m + 2) - with ib.if_scope(iou >= nms_threshold): - p_out[n * num_anchors * 6 + offset_m] = -1.0 - with ib.else_scope(): - with ib.for_range(0, p_valid_count[n], name="l") as l: - with ib.for_range(0, 6, name="m") as m: - p_out[(n * num_anchors * 6 - + l * 6 + m)] = 
p_data[n * num_anchors * 6 + l * 6 + m] + for j in range(valid_count[i]): + if output[i, j, 0] >= 0: + for k in range(valid_count[i]): + check_iou = 0 + if k > j and output[i, k, 0] >= 0: + if force_suppress: + check_iou = 1 + elif id_index < 0 or output[i, j, 0] == output[i, k, 0]: + check_iou = 1 + if check_iou > 0: + batch_idx = i + box_a_idx = j + box_b_idx = k + box_start_idx = 2 + a_t = output[batch_idx, box_a_idx, box_start_idx + 1] + a_b = output[batch_idx, box_a_idx, box_start_idx + 3] + a_l = output[batch_idx, box_a_idx, box_start_idx] + a_r = output[batch_idx, box_a_idx, box_start_idx + 2] + b_t = output[batch_idx, box_b_idx, box_start_idx + 1] + b_b = output[batch_idx, box_b_idx, box_start_idx + 3] + b_l = output[batch_idx, box_b_idx, box_start_idx] + b_r = output[batch_idx, box_b_idx, box_start_idx + 2] + w = max(0.0, min(a_r, b_r) - max(a_l, b_l)) + h = max(0.0, min(a_b, b_b) - max(a_t, b_t)) + area = h * w + u = (a_r - a_l) * (a_b - a_t) + (b_r - b_l) * (b_b - b_t) - area + iou = 0.0 if u <= 0.0 else area / u + if iou >= iou_threshold: + output[i, k, 0] = -1.0 + box_indices[i, k] = -1 + else: + for j in range(valid_count[i]): + for k in range(box_data_length): + output[i, j, k] = data[i, j, k] + box_indices[i, j] = j # Set invalid entry to be -1 - with ib.for_range(0, num_anchors - p_valid_count[n], name="l") as l: - with ib.for_range(0, 6, name="m") as m: - p_out[n * num_anchors * 6 + (l + p_valid_count[n]) * 6 + m] = -1.0 - return ib.get() + for j in range(num_anchors - valid_count[i]): + for k in range(box_data_length): + output[i, j + valid_count[i], k] = -1.0 + box_indices[i, j + valid_count[i]] = -1 + # Only return max_output_size valid boxes + num_valid_boxes = 0 + if max_output_size > 0: + for j in range(valid_count[i]): + if output[i, j, 0] >= 0: + if num_valid_boxes == max_output_size: + for k in range(box_data_length): + output[i, j, k] = -1.0 + box_indices[i, j] = -1 + else: + num_valid_boxes += 1 + return output, box_indices @tvm.target.generic_func -def nms(data, valid_count, nms_threshold=0.5, force_suppress=False, nms_topk=-1): +def non_max_suppression(data, valid_count, max_output_size=-1, + iou_threshold=0.5, force_suppress=False, top_k=-1, + id_index=0, return_indices=True, invalid_to_bottom=False): """Non-maximum suppression operator for object detection. Parameters ---------- - data: tvm.Tensor + data : tvm.Tensor 3-D tensor with shape [batch_size, num_anchors, 6]. The last dimension should be in format of [class_id, score, box_left, box_top, box_right, box_bottom]. @@ -120,15 +249,28 @@ def nms(data, valid_count, nms_threshold=0.5, force_suppress=False, nms_topk=-1) valid_count : tvm.Tensor 1-D tensor for valid number of boxes. - nms_threshold : float + max_output_size : optional, int + Max number of output valid boxes for each instance. + By default all valid boxes are returned. + + iou_threshold : optional, float Non-maximum suppression threshold. - force_suppress : boolean + force_suppress : optional, boolean Whether to suppress all detections regardless of class_id. - nms_topk : int + top_k : optional, int Keep maximum top k detections before nms, -1 for no limit. + id_index : optional, int + index of the class categories, -1 to disable. + + return_indices : optional, boolean + Whether to return box indices in input data. + + invalid_to_bottom : optional, boolean + Whether to move all valid bounding boxes to the top. 
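The overlap test inlined in `hybrid_nms` is standard intersection-over-union. A worked check, with boxes as `(left, top, right, bottom)`:

```python
def iou(box_a, box_b):
    """IoU exactly as computed inline in hybrid_nms."""
    a_l, a_t, a_r, a_b = box_a
    b_l, b_t, b_r, b_b = box_b
    w = max(0.0, min(a_r, b_r) - max(a_l, b_l))
    h = max(0.0, min(a_b, b_b) - max(a_t, b_t))
    area = w * h
    u = (a_r - a_l) * (a_b - a_t) + (b_r - b_l) * (b_b - b_t) - area
    return 0.0 if u <= 0.0 else area / u

# two unit squares overlapping in a 0.5 x 1.0 strip: 0.5 / (1 + 1 - 0.5) = 1/3
assert abs(iou((0, 0, 1, 1), (0.5, 0, 1.5, 1)) - 1.0 / 3.0) < 1e-9
```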
+ Returns ------- out : tvm.Tensor @@ -138,16 +280,17 @@ def nms(data, valid_count, nms_threshold=0.5, force_suppress=False, nms_topk=-1) -------- .. code-block:: python - # An example to use nms + # An example to use non_max_suppression dshape = (1, 5, 6) data = tvm.placeholder(dshape, name="data") valid_count = tvm.placeholder((dshape[0],), dtype="int32", name="valid_count") - nms_threshold = 0.7 + iou_threshold = 0.7 force_suppress = True - nms_topk = -1 - out = nms(data, valid_count, nms_threshold, force_suppress, nms_topk) - np_data = np.random.uniform(size=dshape).astype("float32") - np_valid_count = np.array([4]).astype("int32") + top_k = -1 + out = non_max_suppression(data, valid_count, iou_threshold=iou_threshold, + force_suppress=force_suppress, top_k=top_k) + np_data = np.random.uniform(dshape) + np_valid_count = np.array([4]) s = topi.generic.schedule_nms(out) f = tvm.build(s, [data, valid_count, out], "llvm") ctx = tvm.cpu() @@ -161,7 +304,6 @@ def nms(data, valid_count, nms_threshold=0.5, force_suppress=False, nms_topk=-1) valid_count_dtype = "int32" valid_count_buf = api.decl_buffer(valid_count.shape, valid_count_dtype, "valid_count_buf", data_alignment=4) - data_buf = api.decl_buffer(data.shape, data.dtype, "data_buf", data_alignment=8) score_axis = 1 score_shape = (batch_size, num_anchors) score_tensor = tvm.compute(score_shape, lambda i, j: data[i, j, score_axis]) @@ -180,13 +322,13 @@ def nms(data, valid_count, nms_threshold=0.5, force_suppress=False, nms_topk=-1) in_buffers=[score_tensor_buf, valid_count_buf], out_buffers=sort_tensor_buf, name="nms_sort") - out = \ - tvm.extern(data.shape, - [data, sort_tensor, valid_count], - lambda ins, outs: nms_ir( - ins[0], ins[1], ins[2], outs[0], nms_threshold, - force_suppress, nms_topk), - dtype="float32", - in_buffers=[data_buf, sort_tensor_buf, valid_count_buf], - tag="nms") - return out + out, box_indices = hybrid_nms(data, sort_tensor, valid_count, + tvm.const(max_output_size, dtype="int32"), + tvm.const(iou_threshold, dtype="float32"), + tvm.const(force_suppress, dtype="bool"), + tvm.const(top_k, dtype="int32"), + tvm.const(id_index, dtype="int32")) + if not return_indices and invalid_to_bottom: + out = hybrid_rearrange_out(out) + + return box_indices if return_indices else out diff --git a/topi/python/topi/vision/shortcut.py b/topi/python/topi/vision/shortcut.py deleted file mode 100644 index 529360190a4e..000000000000 --- a/topi/python/topi/vision/shortcut.py +++ /dev/null @@ -1,45 +0,0 @@ -"""Shortcut operators (short-cut connections).""" -from __future__ import absolute_import as _abs -import tvm -from .. import util -from .. import transform - -@tvm.target.generic_func -def shortcut(inp1, inp2): - """Shortcut forward operators. 
- - Parameters - ---------- - First Input : tvm.Tensor - 4-D with shape [batch, in_channel, in_height, in_width] - - Second Input : tvm.Tensor - 4-D with shape [batch, in_channel, in_height, in_width] - - Returns - ------- - Output : tvm.Tensor - 4-D with shape [batch, out_channel, out_height, out_width] - """ - - _, inp1_c, inp1_h, inp1_w = util.get_const_tuple(inp1.shape) - batch, inp2_c, inp2_h, inp2_w = util.get_const_tuple(inp2.shape) - - stride = int(max(inp2_w / inp1_w, 1)) - sample = int(max(inp1_w / inp2_w, 1)) - minc = min(inp2_c, inp1_c) - minh = min(inp2_h, inp1_h) - minw = min(inp2_w, inp1_w) - - out = tvm.compute((batch, minc, minh, minw), lambda b, c, h, w: - inp1[b, c, h * sample, w * sample] + - inp2[b, c, h * stride, w * stride], - tag="shortcut") - - split_indices = int(inp1_c / minc) - if split_indices > 1: - split_res = transform.split(inp1, split_indices, 1) - split_res[0] = out - out = transform.concatenate(split_res, 1) - - return out diff --git a/topi/python/topi/vision/ssd/multibox.py b/topi/python/topi/vision/ssd/multibox.py index f1de42430dd6..2de1723dbd7b 100644 --- a/topi/python/topi/vision/ssd/multibox.py +++ b/topi/python/topi/vision/ssd/multibox.py @@ -1,75 +1,76 @@ -# pylint: disable=invalid-name, no-member, too-many-locals, too-many-arguments +# pylint: disable=invalid-name, no-member, too-many-locals, too-many-arguments, undefined-variable """SSD multibox operators""" from __future__ import absolute_import as _abs -import math import tvm -from tvm import api +from tvm import hybrid +from tvm.intrin import exp, sqrt import topi -from ..nms import nms +from ..nms import non_max_suppression -def multibox_prior_ir(data, out, sizes, ratios, steps, offsets): - """Low level IR routing for multibox_prior operator. +@hybrid.script +def hybrid_multibox_prior(data, sizes, ratios, steps, offsets): + """Hybrid routing for multibox_prior operator. Parameters ---------- - data : Buffer - Input data buffer. + data : tvm.Tensor or numpy NDArray + 4-D tensor with shape [batch, channel, height, width]] - out : Buffer - Output buffer. + sizes : tvm ConsExpr + Sizes for anchor boxes. - sizes : tuple of float - Tuple of sizes for anchor boxes. - - ratios : tuple of float - Tuple of ratios for anchor boxes. + ratios : tvm ConsExpr + Ratios for anchor boxes. - steps : Tuple of float + steps : tvm ConsExpr Priorbox step across y and x, -1 for auto calculation. - offsets : tuple of int + offsets : tvm ConsExpr Priorbox center offsets, y and x respectively. Returns ------- - stmt : Stmt - The result IR statement. 
+ output : tvm.Tensor or numpy NDArray + 3-D tensor with shape [1, h_in * w_in * (num_sizes + num_ratios - 1), 4] """ - ib = tvm.ir_builder.create() - p_out = ib.buffer_ptr(out) in_height = data.shape[2] in_width = data.shape[3] num_sizes = len(sizes) num_ratios = len(ratios) - size_ratio_concat = sizes + ratios - steps_h = steps[0] if steps[0] > 0 else 1.0 / in_height - steps_w = steps[1] if steps[1] > 0 else 1.0 / in_width + num_boxes = in_height * in_width * (num_sizes + num_ratios - 1) + output = output_tensor((1, num_boxes, 4), "float32") + steps_h = steps[0] * 1.0 if steps[0] > 0 else 1.0 / in_height + steps_w = steps[1] * 1.0 if steps[1] > 0 else 1.0 / in_width offset_h = offsets[0] offset_w = offsets[1] - with ib.for_range(0, in_height, for_type="parallel", name="i") as i: + # Need to define var out of const_range + if + w = 0.0 + h = 0.0 + + for i in parallel(in_height): center_h = (i + offset_h) * steps_h - with ib.for_range(0, in_width, name="j") as j: + for j in range(in_width): center_w = (j + offset_w) * steps_w - for k in range(num_sizes + num_ratios - 1): - w = tvm.if_then_else(k < num_sizes, - size_ratio_concat[k] * in_height / in_width / 2.0, - size_ratio_concat[0] * in_height / in_width * - math.sqrt(size_ratio_concat[k + 1]) / 2.0) - h = tvm.if_then_else( - k < num_sizes, size_ratio_concat[k] / 2.0, - size_ratio_concat[0] / math.sqrt(size_ratio_concat[k + 1]) / 2.0) - count = (i * in_width * (num_sizes + num_ratios - 1) + - j * (num_sizes + num_ratios - 1) + k) * 4 - p_out[count] = center_w - w - p_out[count + 1] = center_h - h - p_out[count + 2] = center_w + w - p_out[count + 3] = center_h + h - - return ib.get() + for k in const_range(num_sizes + num_ratios - 1): + if k < num_sizes: + w = sizes[k] * in_height / in_width / 2.0 + h = sizes[k] / 2.0 + else: + w = sizes[0] * in_height / in_width \ + * sqrt(ratios[k - num_sizes + 1] * 1.0) / 2.0 + h = sizes[0] / sqrt(ratios[k - num_sizes + 1] * 1.0) / 2.0 + count = i * in_width * (num_sizes + num_ratios - 1) \ + + j * (num_sizes + num_ratios - 1) + k + output[0, count, 0] = center_w - w + output[0, count, 1] = center_h - h + output[0, count, 2] = center_w + w + output[0, count, 3] = center_h + h + + return output @tvm.target.generic_func @@ -101,115 +102,120 @@ def multibox_prior(data, sizes=(1,), ratios=(1,), steps=(-1, -1), offsets=(0.5, out : tvm.Tensor 3-D tensor with shape [1, h_in * w_in * (num_sizes + num_ratios - 1), 4] """ - num_sizes = len(sizes) - num_ratios = len(ratios) - oshape = (1, data.shape[2] * data.shape[3] * (num_sizes + num_ratios - 1), 4) - out = tvm.extern(oshape, [data], lambda ins, outs: - multibox_prior_ir(ins[0], outs[0], sizes, ratios, steps, offsets), - tag="multibox_prior") + out = hybrid_multibox_prior(data, tvm.convert(sizes), tvm.convert(ratios), + tvm.convert(steps), tvm.convert(offsets)) if clip: out = topi.clip(out, 0, 1) return out -def transform_loc_ir(cls_prob, loc_pred, anchor, valid_count, out, clip, threshold, variances): - """Low level IR routing for transform location in multibox_detection operator. +@hybrid.script +def _hybridy_transform_loc(box, pred_loc, variance, clip): + """Transform prior anchor box to output box through location predictions. 
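The anchor math in `hybrid_multibox_prior`, restated for a single `(i, j)` cell with the default `offsets = (0.5, 0.5)` and auto steps; the `in_height / in_width` factor keeps anchors square in pixel space:

```python
import math

def prior_boxes_for_cell(i, j, in_height, in_width, sizes=(1.0,), ratios=(1.0,)):
    """Boxes hybrid_multibox_prior emits for one spatial location (steps = -1)."""
    steps_h, steps_w = 1.0 / in_height, 1.0 / in_width
    center_h = (i + 0.5) * steps_h
    center_w = (j + 0.5) * steps_w
    boxes = []
    for k in range(len(sizes) + len(ratios) - 1):
        if k < len(sizes):
            w = sizes[k] * in_height / in_width / 2.0
            h = sizes[k] / 2.0
        else:
            r = math.sqrt(ratios[k - len(sizes) + 1])
            w = sizes[0] * in_height / in_width * r / 2.0
            h = sizes[0] / r / 2.0
        boxes.append((center_w - w, center_h - h, center_w + w, center_h + h))
    return boxes

# a single 0.5-sized anchor centred on cell (0, 0) of a 4x4 feature map
(box,) = prior_boxes_for_cell(0, 0, 4, 4, sizes=(0.5,))
assert box == (0.125 - 0.25, 0.125 - 0.25, 0.125 + 0.25, 0.125 + 0.25)
```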
+ """ + al = box[0] + at = box[1] + ar = box[2] + ab = box[3] + + px = pred_loc[0] + py = pred_loc[1] + pw = pred_loc[2] + ph = pred_loc[3] + + vx = variance[0] + vy = variance[1] + vw = variance[2] + vh = variance[3] + + output = output_tensor((4,), pred_loc.dtype) + + aw = ar - al + ah = ab - at + ax = (al + ar) / 2.0 + ay = (at + ab) / 2.0 + ox = px * vx * aw + ax + oy = py * vy * ah + ay + ow = exp(pw * vw) * aw / 2.0 + oh = exp(ph * vh) * ah / 2.0 + output[0] = max(0.0, min(1.0, ox - ow)) if clip else ox - ow + output[1] = max(0.0, min(1.0, oy - oh)) if clip else oy - oh + output[2] = max(0.0, min(1.0, ox + ow)) if clip else ox + ow + output[3] = max(0.0, min(1.0, oy + oh)) if clip else oy + oh + return output + +@hybrid.script +def hybrid_multibox_transform_loc(cls_prob, loc_pred, anchor, + clip, threshold, variances): + """Hybrid routing for transform location in multibox_detection operator. Parameters ---------- - cls_prob : Buffer - Buffer of class probabilities. + cls_prob : tvm.Tensor or numpy NDArray + 3-D tensor of class probabilities. - loc_pred : Buffer - Buffer of location regression predictions. + loc_pred : tvm.Tensor or numpy NDArray + 2-D tensor of location regression predictions. - anchor : Buffer - Buffer of prior anchor boxes. + anchor : tvm.Tensor or numpy NDArray + 3-D tensor of prior anchor boxes. - valid_count : Buffer - Buffer of number of valid output boxes. - - out : Buffer - Output buffer. - - clip : boolean + clip : tvm.const Whether to clip out-of-boundary boxes. - threshold : float + threshold : tvm.const Threshold to be a positive prediction. - variances : tuple of float + variances : tvm.ndarray Variances to be decoded from box regression output. Returns ------- - stmt : Stmt - The result IR statement. - """ - def transform_loc(loc, loc_base_idx, anchor, anchor_base_idx, clip, vx, vy, vw, vh): - """Transform prior anchor box to output box through location predictions. - """ - al = anchor[anchor_base_idx] - at = anchor[anchor_base_idx + 1] - ar = anchor[anchor_base_idx + 2] - ab = anchor[anchor_base_idx + 3] - aw = ar - al - ah = ab - at - ax = (al + ar) / 2.0 - ay = (at + ab) / 2.0 - px = loc[loc_base_idx] - py = loc[loc_base_idx + 1] - pw = loc[loc_base_idx + 2] - ph = loc[loc_base_idx + 3] - ox = px * vx * aw + ax - oy = py * vy * ah + ay - ow = tvm.exp(pw * vw) * aw / 2.0 - oh = tvm.exp(ph * vh) * ah / 2.0 - return tvm.if_then_else(clip, tvm.max(0, tvm.min(1, ox - ow)), ox - ow), \ - tvm.if_then_else(clip, tvm.max(0, tvm.min(1, oy - oh)), oy - oh), \ - tvm.if_then_else(clip, tvm.max(0, tvm.min(1, ox + ow)), ox + ow), \ - tvm.if_then_else(clip, tvm.max(0, tvm.min(1, oy + oh)), oy + oh) + out_loc : tvm.Tensor or numpy NDArray + 3-D tensor of transformed location. + valid_count : tvm.Tensor or numpy NDArray + 1_d tensor of valid counts for boxes. 
+ """ batch_size = cls_prob.shape[0] num_classes = cls_prob.shape[1] num_anchors = cls_prob.shape[2] - - ib = tvm.ir_builder.create() - p_cls_prob = ib.buffer_ptr(cls_prob) - p_loc_pred = ib.buffer_ptr(loc_pred) - p_anchor = ib.buffer_ptr(anchor) - p_valid_count = ib.buffer_ptr(valid_count) - p_out = ib.buffer_ptr(out) - with ib.for_range(0, batch_size, for_type="parallel", name="n") as n: - p_valid_count[n] = 0 - with ib.for_range(0, num_anchors, name="i") as i: + box_coord = allocate((4,), loc_pred.dtype) + pred_coord = allocate((4,), loc_pred.dtype) + out_loc = output_tensor((batch_size, num_anchors, 6), + loc_pred.dtype) + valid_count = output_tensor((batch_size,), "int32") + + for i in parallel(batch_size): + valid_count[i] = 0 + for j in range(num_anchors): # Find the predicted class id and probability - score = ib.allocate('float32', (1,), name="score", scope="local") - cls_id = ib.allocate('int32', (1,), name="id", scope="local") - score[0] = -1.0 - cls_id[0] = 0 - with ib.for_range(0, num_classes, name="j") as j: - with ib.if_scope(j > 0): - temp = p_cls_prob[n * num_anchors * num_classes + j * num_anchors + i] - cls_id[0] = tvm.if_then_else(temp > score[0], j, cls_id[0]) - score[0] = tvm.max(temp, score[0]) - with ib.if_scope(tvm.all(cls_id[0] > 0, score[0] < threshold)): - cls_id[0] = 0 + score = -1.0 + cls_id = 0 + for k in range(num_classes): + if k > 0: + temp = cls_prob[i, k, j] + cls_id = k if temp > score else cls_id + score = max(temp, score) + if cls_id > 0 and score < threshold: + cls_id = 0 # [id, prob, xmin, ymin, xmax, ymax] # Remove background, restore original id - with ib.if_scope(cls_id[0] > 0): - out_base_idx = n * num_anchors * 6 + p_valid_count[n] * 6 - p_out[out_base_idx] = cls_id[0] - 1.0 - p_out[out_base_idx + 1] = score[0] - offset = i * 4 - p_out[out_base_idx + 2], p_out[out_base_idx + 3], p_out[out_base_idx + 4], \ - p_out[out_base_idx + 5] = transform_loc(p_loc_pred, n * num_anchors * 4 + offset, - p_anchor, offset, clip, variances[0], - variances[1], variances[2], variances[3]) - p_valid_count[n] += 1 - - return ib.get() - + if cls_id > 0: + out_loc[i, valid_count[i], 0] = cls_id - 1.0 + out_loc[i, valid_count[i], 1] = score + for l in range(4): + box_coord[l] = anchor[0, j, l] + pred_coord[l] = loc_pred[i, j * 4 + l] + out_coord = _hybridy_transform_loc(box_coord, pred_coord, + variances, clip) + out_loc[i, valid_count[i], 2] = out_coord[0] + out_loc[i, valid_count[i], 3] = out_coord[1] + out_loc[i, valid_count[i], 4] = out_coord[2] + out_loc[i, valid_count[i], 5] = out_coord[3] + valid_count[i] += 1 + + return out_loc, valid_count @tvm.target.generic_func def multibox_transform_loc(cls_prob, loc_pred, anchor, clip=True, threshold=0.01, @@ -240,24 +246,10 @@ def multibox_transform_loc(cls_prob, loc_pred, anchor, clip=True, threshold=0.01 ------- ret : tuple of tvm.Tensor """ - batch_size = cls_prob.shape[0] - num_anchors = anchor.shape[1] - oshape = (batch_size, num_anchors, 6) - # Define data alignment for intermediate buffer - valid_count_dtype = "int32" - valid_count_buf = api.decl_buffer((batch_size,), valid_count_dtype, - "valid_count_buf", data_alignment=4) - out_buf = api.decl_buffer(oshape, cls_prob.dtype, "out_buf", data_alignment=8) - valid_count, out = \ - tvm.extern([(batch_size,), oshape], - [cls_prob, loc_pred, anchor], - lambda ins, outs: transform_loc_ir( - ins[0], ins[1], ins[2], outs[0], outs[1], clip, threshold, variances), - dtype=[valid_count_dtype, cls_prob.dtype], - out_buffers=[valid_count_buf, out_buf], - 
tag="multibox_transform_loc") - return [out, valid_count] - + return hybrid_multibox_transform_loc(cls_prob, loc_pred, anchor, + tvm.const(clip, "bool"), + tvm.const(threshold, "float32"), + tvm.convert(variances)) @tvm.target.generic_func def multibox_detection(cls_prob, loc_pred, anchor, clip=True, threshold=0.01, nms_threshold=0.5, @@ -300,5 +292,7 @@ def multibox_detection(cls_prob, loc_pred, anchor, clip=True, threshold=0.01, nm """ inter_out = multibox_transform_loc(cls_prob, loc_pred, anchor, clip, threshold, variances) - out = nms(inter_out[0], inter_out[1], nms_threshold, force_suppress, nms_topk) + out = non_max_suppression(inter_out[0], inter_out[1], -1, + nms_threshold, force_suppress, nms_topk, + return_indices=False) return out diff --git a/topi/python/topi/vision/yolo/__init__.py b/topi/python/topi/vision/yolo/__init__.py deleted file mode 100644 index c0e9899a41aa..000000000000 --- a/topi/python/topi/vision/yolo/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -# pylint: disable=wildcard-import -"""VISION network operators""" -from __future__ import absolute_import as _abs - -from .region import * diff --git a/topi/python/topi/vision/yolo/region.py b/topi/python/topi/vision/yolo/region.py deleted file mode 100644 index 77c1c86a8d06..000000000000 --- a/topi/python/topi/vision/yolo/region.py +++ /dev/null @@ -1,39 +0,0 @@ -# pylint: disable=invalid-name, unused-variable -""" -REGION Operator -==================== -Region operator, used in darknet. -""" -from __future__ import absolute_import as _abs -import tvm -from ... import cpp - -@tvm.target.generic_func -def region(data, num, classes, coords, background, softmax=True): - """Region forward operators. - Parameters - ---------- - data : tvm.Tensor - 4-D with shape [batch, c_in, h_in, w_in] - - num : int - Darknet layer parameter n - - classes : int - Darknet layer parameter classes - - coords : int - Darknet layer parameter coords - - background : int - Darknet layer parameter background - - softmax : boolean - Darknet layer parameter softmax - - Returns - ------- - out : tvm.Tensor - 4-D with shape [batch, c_in, h_in, w_in] - """ - return cpp.yolo.region(data, num, classes, coords, background, softmax) diff --git a/topi/python/topi/x86/batch_matmul.py b/topi/python/topi/x86/batch_matmul.py new file mode 100644 index 000000000000..37890e389366 --- /dev/null +++ b/topi/python/topi/x86/batch_matmul.py @@ -0,0 +1,53 @@ +# pylint: disable=invalid-name,too-many-locals,unused-variable +"""x86 batch_matmul operators""" +from __future__ import absolute_import as _abs +import tvm + +from .. import generic +from ..util import traverse_inline, get_const_tuple, get_max_power2_factor + + +@generic.schedule_batch_matmul.register(["cpu"]) +def schedule_batch_matmul(outs): + """Schedule for batch_matmul + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of batch_matmul + in the format of an array of tensors. + + Returns + ------- + sch: Schedule + The computation schedule for the op. 
+ """ + s = tvm.create_schedule([x.op for x in outs]) + + def _callback(op): + if "batch_matmul" in op.tag: + C = op.output(0) + A, B = s[C].op.input_tensors + _, M, N = get_const_tuple(C.shape) + k, = s[C].op.reduce_axis + ko, ki = s[C].split(k, 16) + CC = s.rfactor(C, ki) + + b, y, x = s[C].op.axis + y_bn = get_max_power2_factor(M, 8) + x_bn = get_max_power2_factor(N, 8) + yo, yi = s[C].split(y, y_bn) + xo, xi = s[C].split(x, x_bn) + s[C].reorder(b, yo, xo, yi, xi) + bxyo = s[C].fuse(b, yo, xo) + s[C].parallel(bxyo) + s[C].fuse(yi, xi) + + s[CC].compute_at(s[C], bxyo) + _, _, y, x = s[CC].op.axis + s[CC].fuse(y, x) + s[CC].vectorize(s[CC].op.axis[0]) + s[C].pragma(bxyo, 'auto_unroll_max_step', 16) + + traverse_inline(s, outs[0].op, _callback) + return s diff --git a/topi/python/topi/x86/bitserial_conv2d.py b/topi/python/topi/x86/bitserial_conv2d.py index 0b864c383ca4..327f15a49e07 100644 --- a/topi/python/topi/x86/bitserial_conv2d.py +++ b/topi/python/topi/x86/bitserial_conv2d.py @@ -59,7 +59,7 @@ def _declaration_bitserial_conv2d(data, kernel, stride, padding, activation_bits if out_dtype is None: out_dtype = data.dtype assert data.shape[0].value == 1, "only support batch size=1 convolution on rasp" - assert layout == "NCHW" or layout == "NHWC", "only support layouts NCHW and NHWC" + assert layout in ("NCHW", "NHWC"), "only support layouts NCHW and NHWC" wkl = _get_workload(data, kernel, stride, padding, out_dtype, layout) sch = _get_schedule(wkl, layout) diff --git a/topi/python/topi/x86/conv2d.py b/topi/python/topi/x86/conv2d.py index eefa5fec80df..7bad04ddcd46 100644 --- a/topi/python/topi/x86/conv2d.py +++ b/topi/python/topi/x86/conv2d.py @@ -71,12 +71,11 @@ def _declaration_conv(cfg, data, kernel, strides, padding, dilation, layout, out _get_default_config(cfg, data, kernel, strides, padding, out_dtype) return _declaration_conv_impl(cfg, data, kernel, strides, padding, dilation, layout, out_dtype) - elif layout == 'HWCN': + if layout == 'HWCN': return nn.conv2d_hwcn(data, kernel, strides, padding, dilation, out_dtype) - elif layout == 'NHWC': + if layout == 'NHWC': return nn.conv2d_nhwc(data, kernel, strides, padding, dilation, out_dtype) - else: - raise ValueError("not support this layout {} yet".format(layout)) + raise ValueError("not support this layout {} yet".format(layout)) def _declaration_conv_impl(cfg, data, kernel, strides, padding, dilation, layout, out_dtype): @@ -295,6 +294,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfo, F): padding = attrs.get_int_tuple("padding") strides = attrs.get_int_tuple("strides") dilation = attrs.get_int_tuple("dilation") + out_dtype = attrs["out_dtype"] layout_name = 'layout' if F == sym else 'data_layout' @@ -302,7 +302,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfo, F): kh, kw = attrs.get_int_tuple("kernel_size") dtype = data.dtype - out_dtype = dtype if attrs["out_dtype"] == "same" else attrs["out_dtype"] + out_dtype = dtype if out_dtype in ("same", "") else out_dtype is_depthwise = groups == in_channel and groups == out_channel # only optimize for NCHW diff --git a/topi/python/topi/x86/dense.py b/topi/python/topi/x86/dense.py new file mode 100644 index 000000000000..33575b4c399d --- /dev/null +++ b/topi/python/topi/x86/dense.py @@ -0,0 +1,208 @@ +# pylint: disable=invalid-name,too-many-locals,unused-variable +"""x86 dense operators""" +from __future__ import absolute_import as _abs +import tvm +from tvm import autotvm +from tvm.autotvm.task.space import SplitEntity + +from .util import get_fp32_len +from .. 
import generic, tag, nn +from ..util import traverse_inline, get_const_tuple + +@autotvm.register_topi_compute(nn.dense, "cpu", "direct") +def _declaration_dense(cfg, data, weight, bias=None): + batch, _ = get_const_tuple(data.shape) + + # For small batch sizes, don't pack weight into cache-friendly layout + # because of overhead in packing and limited reuse from batch dimension + # TODO(icemelon9): use a more systematic way to determine which schedule to use + if batch <= 16: + return _declaration_dense_nopack(cfg, data, weight, bias) + return _declaration_dense_pack(cfg, data, weight, bias) + + +# Declare dense compute with packing weight into cache-friendly layout +@autotvm.register_topi_compute(nn.dense, "cpu", "direct_pack") +def _declaration_dense_pack(cfg, data, weight, bias=None): + batch, in_dim = get_const_tuple(data.shape) + out_dim, _ = get_const_tuple(weight.shape) + # create tuning space + cfg.define_split("tile_y", batch, num_outputs=3) + cfg.define_split("tile_x", out_dim, num_outputs=3) + cfg.define_split("tile_k", in_dim, num_outputs=2) + if cfg.is_fallback: + _default_dense_pack_config(cfg, batch, out_dim, in_dim) + + packw_bn = cfg["tile_x"].size[-1] + packw_shape = (out_dim // packw_bn, in_dim, packw_bn) + packw = tvm.compute(packw_shape, + lambda z, y, x: weight[z * packw_bn + x, y], name="packed_weight") + + k = tvm.reduce_axis((0, in_dim), name="k") + C = tvm.compute((batch, out_dim), + lambda y, x: tvm.sum( + data[y, k] * packw[x // packw_bn, k, x % packw_bn], + axis=k), + tag="dense_pack") + if bias is not None: + C = tvm.compute((batch, out_dim), lambda i, j: C[i, j] + bias[j], + tag=tag.BROADCAST) + return C + + +# Declare dense compute without packing weight +@autotvm.register_topi_compute(nn.dense, "cpu", "direct_nopack") +def _declaration_dense_nopack(cfg, data, weight, bias=None): + batch, in_dim = get_const_tuple(data.shape) + out_dim, _ = get_const_tuple(weight.shape) + # create tuning space + cfg.define_split("tile_x", out_dim, num_outputs=2) + cfg.define_split("tile_y", batch, num_outputs=2) + cfg.define_split("tile_k", in_dim, num_outputs=2) + if cfg.is_fallback: + _default_dense_nopack_config(cfg, batch, out_dim, in_dim) + + vec = cfg["tile_k"].size[-1] + k = tvm.reduce_axis((0, in_dim // vec), "k") + CC = tvm.compute((batch, out_dim, vec), + lambda z, y, x: tvm.sum( + data[z, k * vec + x] * weight[y, k * vec + x], axis=k)) + + kk = tvm.reduce_axis((0, vec), "kk") + C = tvm.compute((batch, out_dim), + lambda y, x: tvm.sum(CC[y, x, kk], axis=kk), + tag="dense_nopack") + if bias is not None: + C = tvm.compute((batch, out_dim), lambda i, j: C[i, j] + bias[j], + tag=tag.BROADCAST) + + return C + + +@autotvm.register_topi_schedule(generic.schedule_dense, "cpu", "direct") +def _schedule_dense(cfg, outs): + s = tvm.create_schedule([x.op for x in outs]) + + def _callback(op): + if "dense_pack" in op.tag: + _schedule_dense_pack_template(cfg, s, op.output(0)) + elif 'dense_nopack' in op.tag: + _schedule_dense_nopack_template(cfg, s, op.output(0)) + traverse_inline(s, outs[0].op, _callback) + return s + + +@autotvm.register_topi_schedule(generic.schedule_dense, "cpu", "direct_pack") +def _schedule_dense_pack(cfg, outs): + s = tvm.create_schedule([x.op for x in outs]) + + def _callback(op): + if "dense_pack" in op.tag: + _schedule_dense_pack_template(cfg, s, op.output(0)) + traverse_inline(s, outs[0].op, _callback) + return s + + +@autotvm.register_topi_schedule(generic.schedule_dense, "cpu", "direct_nopack") +def _schedule_dense_nopack(cfg, outs): + s = 
tvm.create_schedule([x.op for x in outs]) + + def _callback(op): + if 'dense_nopack' in op.tag: + _schedule_dense_nopack_template(cfg, s, op.output(0)) + traverse_inline(s, outs[0].op, _callback) + return s + + +def _schedule_dense_pack_template(cfg, s, C): + A, packedB = s[C].op.input_tensors + + CC = s.cache_write(C, "global") + y, x = s[C].op.axis + k, = s[CC].op.reduce_axis + + yt, yo, yi = cfg["tile_y"].apply(s, C, y) + xt, xo, xi = cfg["tile_x"].apply(s, C, x) + s[C].reorder(yt, xt, yo, xo, yi, xi) + xyt = s[C].fuse(yt, xt) + s[C].parallel(xyt) + xyo = s[C].fuse(yo, xo) + s[C].unroll(yi) + s[C].vectorize(xi) + + s[CC].compute_at(s[C], xyo) + y, x = s[CC].op.axis + ko, ki = cfg["tile_k"].apply(s, CC, k) + s[CC].reorder(ko, ki, y, x) + s[CC].vectorize(x) + s[CC].unroll(y) + s[CC].unroll(ki) + + z, y, x = s[packedB].op.axis + s[packedB].reorder(z, x, y) + s[packedB].parallel(z) + s[packedB].vectorize(y) + return s + + +def _schedule_dense_nopack_template(cfg, s, C): + y, x = s[C].op.axis + kk, = s[C].op.reduce_axis + yo, yi = cfg["tile_y"].apply(s, C, y) + xo, xi = cfg["tile_x"].apply(s, C, x) + s[C].reorder(yo, xo, yi, xi) + xyo = s[C].fuse(yo, xo) + s[C].parallel(xyo) + s[C].unroll(kk) + + CC, = s[C].op.input_tensors + s[CC].compute_at(s[C], xyo) + z, y, x = s[CC].op.axis + k, = s[CC].op.reduce_axis + yz = s[CC].fuse(z, y) + s[CC].reorder(k, yz, x) + s[CC].unroll(yz) + s[CC].vectorize(x) + return s + + +def _default_dense_pack_config(cfg, M, N, K): + vec_width = get_fp32_len() + + tilex_ii = 1 + for bn in range(vec_width*2, 0, -1): + if N % bn == 0: + tilex_ii = bn + break + NN = N // tilex_ii + tilex_oi = 1 + while NN // tilex_oi > 4: + if (NN // tilex_oi) % 2 == 1: + break + tilex_oi *= 2 + + tiley_ii = 8 + while M % tiley_ii != 0: + tiley_ii //= 2 + MM = M // tiley_ii + tiley_oi = 1 + while MM // tiley_oi > 4: + if (MM // tiley_oi) % 2 == 1: + break + tiley_oi *= 2 + + cfg["tile_y"] = SplitEntity([MM // tiley_oi, tiley_oi, tiley_ii]) + cfg["tile_x"] = SplitEntity([NN // tilex_oi, tilex_oi, tilex_ii]) + cfg["tile_k"] = SplitEntity([K, 1]) + + +def _default_dense_nopack_config(cfg, M, N, K): + vec_width = get_fp32_len() + tilek_bn = 1 + for bn in range(vec_width*2, 0, -1): + if K % bn == 0: + tilek_bn = bn + break + cfg["tile_k"] = SplitEntity([K // tilek_bn, tilek_bn]) + cfg["tile_x"] = SplitEntity([N, 1]) + cfg["tile_y"] = SplitEntity([1, M]) diff --git a/topi/python/topi/x86/nn.py b/topi/python/topi/x86/nn.py index ab6dda40cc9d..73463242e96d 100644 --- a/topi/python/topi/x86/nn.py +++ b/topi/python/topi/x86/nn.py @@ -2,12 +2,7 @@ """x86 nn operators""" from __future__ import absolute_import as _abs import tvm -from tvm import autotvm -from tvm.autotvm.task.space import SplitEntity - -from .util import get_fp32_len -from .. import generic, tag, nn -from ..util import traverse_inline, get_const_tuple +from .. 
import generic @generic.schedule_softmax.register(["cpu"]) def schedule_softmax(outs): @@ -37,205 +32,3 @@ def schedule_softmax(outs): else: s[x].parallel(s[x].op.axis[0]) return s - - -@autotvm.register_topi_compute(nn.dense, "cpu", "direct") -def _declaration_dense(cfg, data, weight, bias=None): - batch, _ = get_const_tuple(data.shape) - - # For small batch sizes, don't pack weight into cache-friendly layout - # because of overhead in packing and limited reuse from batch dimension - # TODO(icemelon9): use a more systematic way to determine which schedule to use - if batch <= 16: - return _declaration_dense_nopack(cfg, data, weight, bias) - return _declaration_dense_pack(cfg, data, weight, bias) - - -# Declare dense compute with packing weight into cache-friendly layout -@autotvm.register_topi_compute(nn.dense, "cpu", "direct_pack") -def _declaration_dense_pack(cfg, data, weight, bias=None): - batch, in_dim = get_const_tuple(data.shape) - out_dim, _ = get_const_tuple(weight.shape) - # create tuning space - cfg.define_split("tile_y", batch, num_outputs=3) - cfg.define_split("tile_x", out_dim, num_outputs=3) - cfg.define_split("tile_k", in_dim, num_outputs=2) - if cfg.is_fallback: - _default_dense_pack_config(cfg, batch, out_dim, in_dim) - - packw_bn = cfg["tile_x"].size[-1] - packw_shape = (out_dim // packw_bn, in_dim, packw_bn) - packw = tvm.compute(packw_shape, - lambda z, y, x: weight[z * packw_bn + x, y], name="packed_weight") - - k = tvm.reduce_axis((0, in_dim), name="k") - C = tvm.compute((batch, out_dim), - lambda y, x: tvm.sum( - data[y, k] * packw[x // packw_bn, k, x % packw_bn], - axis=k), - tag="dense_pack") - if bias is not None: - C = tvm.compute((batch, out_dim), lambda i, j: C[i, j] + bias[j], - tag=tag.BROADCAST) - return C - - -# Declare dense compute without packing weight -@autotvm.register_topi_compute(nn.dense, "cpu", "direct_nopack") -def _declaration_dense_nopack(cfg, data, weight, bias=None): - batch, in_dim = get_const_tuple(data.shape) - out_dim, _ = get_const_tuple(weight.shape) - # create tuning space - cfg.define_split("tile_x", out_dim, num_outputs=2) - cfg.define_split("tile_y", batch, num_outputs=2) - cfg.define_split("tile_k", in_dim, num_outputs=2) - if cfg.is_fallback: - _default_dense_nopack_config(cfg, batch, out_dim, in_dim) - - vec = cfg["tile_k"].size[-1] - k = tvm.reduce_axis((0, in_dim // vec), "k") - CC = tvm.compute((batch, out_dim, vec), - lambda z, y, x: tvm.sum( - data[z, k * vec + x] * weight[y, k * vec + x], axis=k)) - - kk = tvm.reduce_axis((0, vec), "kk") - C = tvm.compute((batch, out_dim), - lambda y, x: tvm.sum(CC[y, x, kk], axis=kk), - tag="dense_nopack") - if bias is not None: - C = tvm.compute((batch, out_dim), lambda i, j: C[i, j] + bias[j], - tag=tag.BROADCAST) - - return C - - -@autotvm.register_topi_schedule(generic.schedule_dense, "cpu", "direct") -def _schedule_dense(cfg, outs): - s = tvm.create_schedule([x.op for x in outs]) - scheduled_ops = [] - - def _callback(op): - if "dense_pack" in op.tag: - _schedule_dense_pack_template(cfg, s, op.output(0)) - elif 'dense_nopack' in op.tag: - _schedule_dense_nopack_template(cfg, s, op.output(0)) - traverse_inline(s, outs[0].op, _callback) - return s - - -@autotvm.register_topi_schedule(generic.schedule_dense, "cpu", "direct_pack") -def _schedule_dense_pack(cfg, outs): - s = tvm.create_schedule([x.op for x in outs]) - scheduled_ops = [] - - def _callback(op): - if "dense_pack" in op.tag: - _schedule_dense_pack_template(cfg, s, op.output(0)) - traverse_inline(s, outs[0].op, _callback) - 
return s - - -@autotvm.register_topi_schedule(generic.schedule_dense, "cpu", "direct_nopack") -def _schedule_dense_nopack(cfg, outs): - s = tvm.create_schedule([x.op for x in outs]) - scheduled_ops = [] - - def _callback(op): - if 'dense_nopack' in op.tag: - _schedule_dense_nopack_template(cfg, s, op.output(0)) - traverse_inline(s, outs[0].op, _callback) - return s - - -def _schedule_dense_pack_template(cfg, s, C): - A, packedB = s[C].op.input_tensors - - CC = s.cache_write(C, "global") - y, x = s[C].op.axis - k, = s[CC].op.reduce_axis - - yt, yo, yi = cfg["tile_y"].apply(s, C, y) - xt, xo, xi = cfg["tile_x"].apply(s, C, x) - s[C].reorder(yt, xt, yo, xo, yi, xi) - xyt = s[C].fuse(yt, xt) - s[C].parallel(xyt) - xyo = s[C].fuse(yo, xo) - s[C].unroll(yi) - s[C].vectorize(xi) - - s[CC].compute_at(s[C], xyo) - y, x = s[CC].op.axis - ko, ki = cfg["tile_k"].apply(s, CC, k) - s[CC].reorder(ko, ki, y, x) - s[CC].vectorize(x) - s[CC].unroll(y) - s[CC].unroll(ki) - - z, y, x = s[packedB].op.axis - s[packedB].reorder(z, x, y) - s[packedB].parallel(z) - s[packedB].vectorize(y) - return s - - -def _schedule_dense_nopack_template(cfg, s, C): - y, x = s[C].op.axis - kk, = s[C].op.reduce_axis - yo, yi = cfg["tile_y"].apply(s, C, y) - xo, xi = cfg["tile_x"].apply(s, C, x) - s[C].reorder(yo, xo, yi, xi) - xyo = s[C].fuse(yo, xo) - s[C].parallel(xyo) - s[C].unroll(kk) - - CC, = s[C].op.input_tensors - s[CC].compute_at(s[C], xyo) - z, y, x = s[CC].op.axis - k, = s[CC].op.reduce_axis - yz = s[CC].fuse(z, y) - s[CC].reorder(k, yz, x) - s[CC].unroll(yz) - s[CC].vectorize(x) - return s - - -def _default_dense_pack_config(cfg, M, N, K): - vec_width = get_fp32_len() - - tilex_ii = 1 - for bn in range(vec_width*2, 0, -1): - if N % bn == 0: - tilex_ii = bn - break - NN = N // tilex_ii - tilex_oi = 1 - while NN // tilex_oi > 4: - if (NN // tilex_oi) % 2 == 1: - break - tilex_oi *= 2 - - tiley_ii = 8 - while M % tiley_ii != 0: - tiley_ii //= 2 - MM = M // tiley_ii - tiley_oi = 1 - while MM // tiley_oi > 4: - if (MM // tiley_oi) % 2 == 1: - break - tiley_oi *= 2 - - cfg["tile_y"] = SplitEntity([MM // tiley_oi, tiley_oi, tiley_ii]) - cfg["tile_x"] = SplitEntity([NN // tilex_oi, tilex_oi, tilex_ii]) - cfg["tile_k"] = SplitEntity([K, 1]) - - -def _default_dense_nopack_config(cfg, M, N, K): - vec_width = get_fp32_len() - tilek_bn = 1 - for bn in range(vec_width*2, 0, -1): - if K % bn == 0: - tilek_bn = bn - break - cfg["tile_k"] = SplitEntity([K // tilek_bn, tilek_bn]) - cfg["tile_x"] = SplitEntity([N, 1]) - cfg["tile_y"] = SplitEntity([1, M]) diff --git a/topi/src/topi.cc b/topi/src/topi.cc index d56174fda5c5..14f92460fd25 100644 --- a/topi/src/topi.cc +++ b/topi/src/topi.cc @@ -27,10 +27,10 @@ #include #include #include +#include #include #include -#include #include #include #include @@ -41,7 +41,6 @@ #include #include #include -#include #include #include @@ -49,7 +48,6 @@ #include #include -#include #include namespace topi { @@ -114,6 +112,8 @@ TOPI_REGISTER_BCAST_OP("topi.maximum", topi::maximum); TOPI_REGISTER_BCAST_OP("topi.minimum", topi::minimum); TOPI_REGISTER_BCAST_OP("topi.power", topi::power); TOPI_REGISTER_BCAST_OP("topi.left_shift", topi::left_shift); +TOPI_REGISTER_BCAST_OP("topi.logical_and", topi::logical_and); +TOPI_REGISTER_BCAST_OP("topi.logical_or", topi::logical_or); TOPI_REGISTER_BCAST_OP("topi.right_shift", topi::right_shift); TOPI_REGISTER_BCAST_OP("topi.greater", topi::greater); TOPI_REGISTER_BCAST_OP("topi.less", topi::less); @@ -266,6 +266,11 @@ TVM_REGISTER_GLOBAL("topi.concatenate") *rv = 
concatenate(args[0], args[1]); }); +TVM_REGISTER_GLOBAL("topi.stack") +.set_body([](TVMArgs args, TVMRetValue *rv) { + *rv = stack(args[0], args[1]); +}); + TVM_REGISTER_GLOBAL("topi.split") .set_body([](TVMArgs args, TVMRetValue *rv) { if (args[1].type_code() == kDLInt || args[1].type_code() == kDLUInt) { @@ -275,6 +280,11 @@ TVM_REGISTER_GLOBAL("topi.split") } }); +TVM_REGISTER_GLOBAL("topi.layout_transform") +.set_body([](TVMArgs args, TVMRetValue *rv) { + *rv = layout_transform(args[0], args[1], args[2]); +}); + TVM_REGISTER_GLOBAL("topi.take") .set_body([](TVMArgs args, TVMRetValue *rv) { if (args.size() == 2) { @@ -290,6 +300,21 @@ TVM_REGISTER_GLOBAL("topi.where") *rv = where(args[0], args[1], args[2]); }); +TVM_REGISTER_GLOBAL("topi.arange") +.set_body([](TVMArgs args, TVMRetValue *rv) { + *rv = arange(args[0], args[1], args[2], args[3]); +}); + +TVM_REGISTER_GLOBAL("topi.repeat") +.set_body([](TVMArgs args, TVMRetValue *rv) { + *rv = repeat(args[0], args[1], args[2]); +}); + +TVM_REGISTER_GLOBAL("topi.tile") +.set_body([](TVMArgs args, TVMRetValue *rv) { + *rv = tile(args[0], args[1]); +}); + TVM_REGISTER_GLOBAL("topi.gather_nd") .set_body([](TVMArgs args, TVMRetValue *rv) { *rv = gather_nd(args[0], args[1]); @@ -344,6 +369,12 @@ TVM_REGISTER_GLOBAL("topi.nn.dense") *rv = nn::dense(args[0], args[1], args[2]); }); +/* Ops from nn/batch_matmul.h */ +TVM_REGISTER_GLOBAL("topi.nn.batch_matmul") +.set_body([](TVMArgs args, TVMRetValue *rv) { + *rv = nn::batch_matmul(args[0], args[1]); + }); + /* Ops from nn/dilate.h */ TVM_REGISTER_GLOBAL("topi.nn.dilate") .set_body([](TVMArgs args, TVMRetValue *rv) { @@ -411,11 +442,6 @@ TVM_REGISTER_GLOBAL("topi.vision.reorg") *rv = vision::reorg(args[0], args[1]); }); -TVM_REGISTER_GLOBAL("topi.vision.yolo.region") -.set_body([](TVMArgs args, TVMRetValue *rv) { - *rv = vision::yolo::region(args[0], args[1], args[2], args[3], args[4], args[5]); - }); - /* Ops from image/resize.h */ TVM_REGISTER_GLOBAL("topi.image.bilinear_sample_nchw") .set_body([](TVMArgs args, TVMRetValue *rv) { @@ -483,11 +509,6 @@ TVM_REGISTER_GLOBAL("topi.rocm.schedule_dense") *rv = topi::rocm::schedule_dense(args[0], args[1]); }); -TVM_REGISTER_GLOBAL("topi.rocm.schedule_region") -.set_body([](TVMArgs args, TVMRetValue *rv) { - *rv = topi::rocm::schedule_region(args[0], args[1]); - }); - TVM_REGISTER_GLOBAL("topi.rocm.schedule_lrn") .set_body([](TVMArgs args, TVMRetValue *rv) { *rv = topi::rocm::schedule_lrn(args[0], args[1]); @@ -539,11 +560,6 @@ TVM_REGISTER_GLOBAL("topi.cuda.schedule_softmax") *rv = topi::cuda::schedule_softmax(args[0], args[1]); }); -TVM_REGISTER_GLOBAL("topi.cuda.schedule_region") -.set_body([](TVMArgs args, TVMRetValue *rv) { - *rv = topi::cuda::schedule_region(args[0], args[1]); - }); - TVM_REGISTER_GLOBAL("topi.cuda.schedule_lrn") .set_body([](TVMArgs args, TVMRetValue *rv) { *rv = topi::cuda::schedule_lrn(args[0], args[1]); @@ -597,6 +613,9 @@ TVM_REGISTER_GENERIC_FUNC(schedule_dense) .register_func({ "cuda", "gpu" }, WrapSchedule(topi::cuda::schedule_dense)) .register_func({ "rocm" }, WrapSchedule(topi::rocm::schedule_dense)); +TVM_REGISTER_GENERIC_FUNC(schedule_batch_matmul) +.set_default(WrapSchedule(topi::generic::default_schedule)); + TVM_REGISTER_GENERIC_FUNC(schedule_pool) .set_default(WrapSchedule(topi::generic::default_schedule)) .register_func({ "cpu" }, WrapSchedule(topi::x86::default_schedule)) diff --git a/topi/tests/python/test_topi_batch_matmul.py b/topi/tests/python/test_topi_batch_matmul.py new file mode 100644 index 
000000000000..f699d6aa8dcb --- /dev/null +++ b/topi/tests/python/test_topi_batch_matmul.py @@ -0,0 +1,53 @@ +"""Test code for batch_matmul operator""" +import numpy as np +import tvm +import topi +import topi.testing +from topi.util import get_const_tuple +from tvm.contrib.pickle_memoize import memoize + +from common import get_all_backend + +def verify_batch_matmul(batch, M, N, K): + x = tvm.placeholder((batch, M, K), name='x') + y = tvm.placeholder((batch, N, K), name='y') + dtype = x.dtype + + # use memoize to pickle the test data for future reuse + @memoize("topi.tests.test_topi_batch_matmul") + def get_ref_data(): + a_np = np.random.uniform(size=(batch, M, K)).astype(dtype) + b_np = np.random.uniform(size=(batch, N, K)).astype(dtype) + c_np = topi.testing.batch_matmul(a_np, b_np) + return (a_np, b_np, c_np) + # get the test data + a_np, b_np, c_np = get_ref_data() + + def check_device(device): + ctx = tvm.context(device, 0) + if not ctx.exist: + print("Skip because %s is not enabled" % device) + return + print("Running on target: %s" % device) + with tvm.target.create(device): + out = topi.nn.batch_matmul(x, y) + s = topi.generic.schedule_batch_matmul([out]) + a = tvm.nd.array(a_np, ctx) + b = tvm.nd.array(b_np, ctx) + c = tvm.nd.array(np.zeros(get_const_tuple(out.shape), dtype=dtype), ctx) + f = tvm.build(s, [x, y, out], device, name="batch_matmul") + f(a, b, c) + tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5) + + for device in get_all_backend(): + check_device(device) + +def test_batch_matmul(): + verify_batch_matmul(1, 16, 16, 32) + verify_batch_matmul(5, 16, 16, 32) + verify_batch_matmul(5, 16, 20, 32) + verify_batch_matmul(30, 16, 20, 32) + + +if __name__ == "__main__": + test_batch_matmul() diff --git a/topi/tests/python/test_topi_conv2d_NCHWc.py b/topi/tests/python/test_topi_conv2d_NCHWc.py index a3af43c8d810..73c1fdae2d66 100644 --- a/topi/tests/python/test_topi_conv2d_NCHWc.py +++ b/topi/tests/python/test_topi_conv2d_NCHWc.py @@ -105,7 +105,7 @@ def check_device(device): name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation)) func(a, w, c) - tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5) + tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-3) # test llvm only for now since the conv2d_NCHWc implementation is missing in other backends.
for device in ["llvm"]: @@ -202,4 +202,4 @@ def test_conv2d_NCHWc(): verify_conv2d_NCHWc(1, 256, 3, 126, 3, 1, 1) if __name__ == "__main__": - test_conv2d_NCHWc() \ No newline at end of file + test_conv2d_NCHWc() diff --git a/topi/tests/python/test_topi_region.py b/topi/tests/python/test_topi_region.py deleted file mode 100644 index 3357382b232e..000000000000 --- a/topi/tests/python/test_topi_region.py +++ /dev/null @@ -1,49 +0,0 @@ -"""Example code to do region.""" -import numpy as np -import topi -from topi.util import get_const_tuple -import tvm -import topi.testing - -def verify_region(batch, in_size, in_channel, n, classes, coords, background, l_softmax): - '''Verify region operator by comparing outputs from tvm and numpy implementation''' - in_height = in_width = in_size - - A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A') - B = topi.vision.yolo.region(A, n, classes, coords, background, l_softmax) - - a_shape = get_const_tuple(A.shape) - dtype = A.dtype - - def get_ref_data_region(): - a_np = np.random.uniform(size=a_shape).astype(dtype) - b_np = topi.testing.region_python(a_np, n, classes, coords, background, l_softmax) - return a_np, b_np - - a_np, b_np = get_ref_data_region() - def check_device(device): - '''Cheching devices is enabled or not''' - ctx = tvm.context(device, 0) - if not ctx.exist: - print("Skip because %s is not enabled" % device) - return - print("Running on target: %s" % device) - with tvm.target.create(device): - if device == 'llvm': - s = topi.generic.vision.schedule_region([B]) - else: - s = topi.cuda.vision.schedule_region([B]) - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) - func = tvm.build(s, [A, B], device) - func(a, b) - tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) - - for device in ['llvm', 'cuda']: - check_device(device) - -def test_region(): - verify_region(1, 19, 425, 5, 80, 4, 0, 1) - -if __name__ == "__main__": - test_region() diff --git a/topi/tests/python/test_topi_shortcut.py b/topi/tests/python/test_topi_shortcut.py deleted file mode 100644 index f89aa46a1e66..000000000000 --- a/topi/tests/python/test_topi_shortcut.py +++ /dev/null @@ -1,48 +0,0 @@ -"""Example code to do shortcut.""" -import numpy as np -import topi -from topi.util import get_const_tuple -import tvm - -def verify_shortcut(batch, in_size, in_channel): - '''Verify shortcut operator by comparing outputs from tvm and numpy implementation''' - in_height = in_width = in_size - - A1 = tvm.placeholder((batch, in_channel, in_height, in_width), name='A1') - A2 = tvm.placeholder((batch, in_channel, in_height, in_width), name='A2') - B = topi.vision.shortcut(A1, A2) - - a_shape = get_const_tuple(A1.shape) - dtype = A1.dtype - def get_ref_data_shortcut(): - a_np1 = np.random.uniform(size=a_shape).astype(dtype) - a_np2 = np.random.uniform(size=a_shape).astype(dtype) - b_np = topi.testing.shortcut_python(a_np1, a_np2) - return a_np1, a_np2, b_np - - a_np1, a_np2, b_np = get_ref_data_shortcut() - def check_device(device): - '''Cheching devices is enabled or not''' - ctx = tvm.context(device, 0) - if not ctx.exist: - print("Skip because %s is not enabled" % device) - return - print("Running on target: %s" % device) - with tvm.target.create(device): - s = topi.generic.schedule_injective([B]) - - a1 = tvm.nd.array(a_np1, ctx) - a2 = tvm.nd.array(a_np2, ctx) - b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) - func = tvm.build(s, [A1, A2, B], device) - func(a1, a2, b) - 
tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) - - for device in ['llvm', 'cuda']: - check_device(device) - -def test_shortcut(): - verify_shortcut(1, 144, 32) - -if __name__ == "__main__": - test_shortcut() diff --git a/topi/tests/python/test_topi_transform.py b/topi/tests/python/test_topi_transform.py index 84d4aa6dc952..785da6fddbcf 100644 --- a/topi/tests/python/test_topi_transform.py +++ b/topi/tests/python/test_topi_transform.py @@ -124,6 +124,31 @@ def check_device(device): for device in get_all_backend(): check_device(device) +def verify_stack(shapes, axis): + tensor_l = [] + for i, shape in enumerate(shapes): + tensor_l.append(tvm.placeholder(shape, name="A" + str(i))) + out_tensor = topi.stack(tensor_l, axis) + def check_device(device): + ctx = tvm.context(device, 0) + if not ctx.exist: + print("Skip because %s is not enabled" % device) + return + print("Running on target: %s" % device) + with tvm.target.create(device): + s = topi.generic.schedule_broadcast(out_tensor) + + foo = tvm.build(s, tensor_l + [out_tensor], device, name="stack") + data_npys = [np.random.normal(size=shape).astype(tensor_l[0].dtype) for shape in shapes] + out_npy = np.stack(data_npys, axis=axis) + data_nds = [tvm.nd.array(data_npy, ctx) for data_npy in data_npys] + out_nd = tvm.nd.empty(out_npy.shape, ctx=ctx, dtype=out_tensor.dtype) + foo(*(data_nds + [out_nd])) + tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy) + + for device in get_all_backend(): + check_device(device) + def verify_split(src_shape, indices_or_sections, axis): A = tvm.placeholder(shape=src_shape, name="A") @@ -304,6 +329,80 @@ def check_device(device): for device in get_all_backend(): check_device(device) +def verify_arange(start, stop, step): + if start is None and step is None: + A = topi.arange(stop) + a_np = np.arange(stop) + elif start is None: + A = topi.arange(stop, step=step) + a_np = np.arange(stop, step=step) + elif step is None: + A = topi.arange(start, stop) + a_np = np.arange(start, stop) + else: + A = topi.arange(start, stop, step) + a_np = np.arange(start, stop, step) + + def check_device(device): + ctx = tvm.context(device, 0) + if not ctx.exist: + print("Skip because %s is not enabled" % device) + return + print("Running on target: %s" % device) + with tvm.target.create(device): + s = topi.generic.schedule_injective(A) + f = tvm.build(s, [A], device, name="arange") + a_nd = tvm.nd.empty(a_np.shape, dtype='float32', ctx=ctx) + f(a_nd) + tvm.testing.assert_allclose(a_nd.asnumpy(), a_np) + + for device in get_all_backend(): + check_device(device) + +def verify_repeat(in_shape, repeats, axis): + A = tvm.placeholder(shape=in_shape, name="A") + B = topi.repeat(A, repeats, axis) + def check_device(device): + ctx = tvm.context(device, 0) + if not ctx.exist: + print("Skip because %s is not enabled" % device) + return + print("Running on target: %s" % device) + with tvm.target.create(device): + s = topi.generic.schedule_broadcast(B) + foo = tvm.build(s, [A, B], device, name="repeat") + data_npy = np.random.uniform(size=in_shape).astype(A.dtype) + out_npy = np.repeat(data_npy, repeats, axis) + data_nd = tvm.nd.array(data_npy, ctx) + out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(B.dtype), ctx) + foo(data_nd, out_nd) + tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy) + + for device in get_all_backend(): + check_device(device) + +def verify_tile(in_shape, reps): + A = tvm.placeholder(shape=in_shape, name="A") + B = topi.tile(A, reps) + def check_device(device): + ctx = tvm.context(device, 0) + if not 
ctx.exist: + print("Skip because %s is not enabled" % device) + return + print("Running on target: %s" % device) + with tvm.target.create(device): + s = topi.generic.schedule_broadcast(B) + foo = tvm.build(s, [A, B], device, name="tile") + data_npy = np.random.uniform(size=in_shape).astype(A.dtype) + out_npy = np.tile(data_npy, reps) + data_nd = tvm.nd.array(data_npy, ctx) + out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(B.dtype), ctx) + foo(data_nd, out_nd) + tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy) + + for device in get_all_backend(): + check_device(device) + def test_strided_slice(): verify_strided_slice((3, 4, 3), [0, 0, 0], [4, -5, 4], [1, -1, 2]) verify_strided_slice((3, 4, 3), [1, 1, 0], [4, 4, 3], [2, 1, 1]) @@ -353,7 +452,7 @@ def test_squeeze(): def test_concatenate(): - verify_concatenate([(2,), (2,), (2,)], 0) + verify_concatenate([(2,), (2,), (2,)], -1) verify_concatenate([(2, 3, 4), (2, 2, 4), (2, 5, 4)], 1) verify_concatenate([(1, 2, 4), (1, 2, 3), (1, 2, 7), (1, 2, 8), (1, 2, 1)], -1) verify_concatenate([(5, 6, 7, 3), @@ -363,6 +462,14 @@ def test_concatenate(): (2, 6, 7, 3)], 0) +def test_stack(): + verify_stack([(2,), (2,), (2,)], -1) + verify_stack([(2,), (2,), (2,)], 1) + verify_stack([(2,), (2,), (2,)], 0) + verify_stack([(2, 2, 4), (2, 2, 4), (2, 2, 4)], 1) + verify_stack([(2, 2, 3, 4), (2, 2, 3, 4), (2, 2, 3, 4), (2, 2, 3, 4)], -1) + + def test_split(): verify_split((2, 12, 3), 3, 1) verify_split((2, 12, 3), [2, 4], 1) @@ -407,9 +514,60 @@ def test_gather_nd(): verify_gather_nd((2, 3, 4, 5), [[1, 0], [2, 1], [3, 2], [4, 2]], indices_dtype) +def test_arange(): + verify_arange(None, 20, None) + verify_arange(None, 20, 2) + verify_arange(1, 20, None) + verify_arange(1, 20, 2) + verify_arange(1, 20, 1.5) + verify_arange(1, 20.5, None) + verify_arange(1, 20, 3) + verify_arange(20, 1, -1) + verify_arange(20, 1, -1.5) + +def test_repeat(): + verify_repeat((2,), 1, 0) + verify_repeat((3, 2), 2, 0) + verify_repeat((3, 2, 4), 3, 1) + verify_repeat((1, 3, 2, 4), 4, -1) + +def test_tile(): + verify_tile((3, 2), (2, 3)) + verify_tile((3, 2, 5), (2,)) + verify_tile((3, ), (2, 3, 3)) + +def test_layout_transform(): + in_shape = (1, 32, 8, 8) + A = tvm.placeholder(shape=in_shape, dtype="float32", name="A") + B = topi.layout_transform(A, "NCHW", "NCHW16c") + + input = np.random.uniform(size=in_shape).astype(A.dtype) + output = np.transpose(input, axes=(0, 2, 3, 1)) + output = np.reshape(output, newshape=(1, 8, 8, 2, 16)) + output = np.transpose(output, axes=(0, 3, 1, 2, 4)) + + def check_device(device): + ctx = tvm.context(device, 0) + if not ctx.exist: + print("Skip because %s is not enabled" % device) + return + tvm_input = tvm.nd.array(input, ctx) + tvm_output = tvm.nd.empty(output.shape, ctx=ctx, dtype=B.dtype) + print("Running on target: %s" % device) + with tvm.target.create(device): + s = topi.generic.schedule_injective(B) + f = tvm.build(s, [A, B], device, name="layout_transform") + f(tvm_input, tvm_output) + tvm.testing.assert_allclose(tvm_output.asnumpy(), output) + + for backend in get_all_backend(): + check_device(backend) + + if __name__ == "__main__": test_strided_slice() test_concatenate() + test_stack() test_tranpose() test_expand_dims() test_reshape() @@ -419,3 +577,7 @@ def test_gather_nd(): test_expand_like() test_take() test_gather_nd() + test_arange() + test_layout_transform() + test_repeat() + test_tile() diff --git a/topi/tests/python/test_topi_vision.py b/topi/tests/python/test_topi_vision.py index 135b3857df31..02e04212b63e 100644 --- 
a/topi/tests/python/test_topi_vision.py +++ b/topi/tests/python/test_topi_vision.py @@ -8,11 +8,62 @@ from tvm.contrib.pickle_memoize import memoize from topi.util import get_const_tuple -from topi.vision import ssd, nms +from topi.vision import ssd, non_max_suppression, get_valid_counts + + +def verify_get_valid_counts(dshape, score_threshold): + dtype = "float32" + batch_size, num_anchor, elem_length = dshape + np_data = np.random.uniform(size=dshape).astype(dtype) + np_out1 = np.zeros(shape=(batch_size,)) + np_out2 = np.zeros(shape=dshape).astype(dtype) + for i in range(batch_size): + np_out1[i] = 0 + inter_idx = 0 + for j in range(num_anchor): + score = np_data[i, j, 1] + if score > score_threshold: + for k in range(elem_length): + np_out2[i, inter_idx, k] = np_data[i, j, k] + np_out1[i] += 1 + inter_idx += 1 + if j >= np_out1[i]: + for k in range(elem_length): + np_out2[i, j, k] = -1.0 + def check_device(device): + ctx = tvm.context(device, 0) + if not ctx.exist: + print("Skip because %s is not enabled" % device) + return + print("Running on target: %s" % device) + with tvm.target.create(device): + data = tvm.placeholder(dshape, name="data", dtype=dtype) + outs = get_valid_counts(data, score_threshold) + s = topi.generic.schedule_multibox_prior(outs) + + tvm_input_data = tvm.nd.array(np_data, ctx) + tvm_out1 = tvm.nd.array(np.zeros(np_out1.shape, dtype="int32"), ctx) + tvm_out2 = tvm.nd.array(np.zeros(np_out2.shape, dtype=dtype), ctx) + f = tvm.build(s, [data, outs[0], outs[1]], device) + f(tvm_input_data, tvm_out1, tvm_out2) + tvm.testing.assert_allclose(tvm_out1.asnumpy(), np_out1, rtol=1e-3) + tvm.testing.assert_allclose(tvm_out2.asnumpy(), np_out2, rtol=1e-3) -def test_nms(): + for device in ['llvm']: + check_device(device) + + +def test_get_valid_counts(): + verify_get_valid_counts((1, 2500, 6), 0) + verify_get_valid_counts((1, 2500, 6), -1) + verify_get_valid_counts((3, 1000, 6), 0.55) + verify_get_valid_counts((16, 500, 6), 0.95) + + +def test_non_max_suppression(): dshape = (1, 5, 6) + indices_dshape = (1, 5) data = tvm.placeholder(dshape, name="data") valid_count = tvm.placeholder((dshape[0],), dtype="int32", name="valid_count") nms_threshold = 0.7 @@ -24,8 +75,9 @@ def test_nms(): [1, 0.5, 100, 60, 70, 110]]]).astype(data.dtype) np_valid_count = np.array([4]).astype(valid_count.dtype) np_result = np.array([[[2, 0.9, 35, 61, 52, 79], [0, 0.8, 1, 20, 25, 45], - [0, 0.4, 4, 21, 19, 40], [-1, 0.9, 35, 61, 52, 79], + [-1, -1, -1, -1, -1, -1], [-1, -1, -1, -1, -1, -1], [-1, -1, -1, -1, -1, -1]]]) + np_indices_result = np.array([[3, 0, -1, -1, -1]]) def check_device(device): ctx = tvm.context(device, 0) @@ -35,18 +87,27 @@ def check_device(device): print("Running on target: %s" % device) with tvm.target.create(device): if device == 'llvm': - out = nms(data, valid_count, nms_threshold, force_suppress, nms_topk) + out = non_max_suppression(data, valid_count, -1, nms_threshold, force_suppress, nms_topk, return_indices=False) + indices_out = non_max_suppression(data, valid_count, -1, nms_threshold, force_suppress, nms_topk) else: - out = topi.cuda.nms(data, valid_count, nms_threshold, force_suppress, nms_topk) + out = topi.cuda.non_max_suppression(data, valid_count, -1, nms_threshold, force_suppress, nms_topk, return_indices=False) + indices_out = topi.cuda.non_max_suppression(data, valid_count, -1, nms_threshold, force_suppress, nms_topk) s = topi.generic.schedule_nms(out) + indices_s = topi.generic.schedule_nms(indices_out) tvm_data = tvm.nd.array(np_data, ctx) tvm_valid_count = 
tvm.nd.array(np_valid_count, ctx) + tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data.dtype), ctx) f = tvm.build(s, [data, valid_count, out], device) f(tvm_data, tvm_valid_count, tvm_out) tvm.testing.assert_allclose(tvm_out.asnumpy(), np_result, rtol=1e-4) + tvm_indices_out = tvm.nd.array(np.zeros(indices_dshape, dtype="int32"), ctx) + f = tvm.build(indices_s, [data, valid_count, indices_out], device) + f(tvm_data, tvm_valid_count, tvm_indices_out) + tvm.testing.assert_allclose(tvm_indices_out.asnumpy(), np_indices_result, rtol=1e-4) + for device in ['llvm']: check_device(device) @@ -210,7 +271,7 @@ def test_roi_align(): def verify_proposal(np_cls_prob, np_bbox_pred, np_im_info, np_out, attrs): cls_prob = tvm.placeholder(np_cls_prob.shape) bbox_pred = tvm.placeholder(np_bbox_pred.shape) - im_info = tvm.placeholder(np_im_info.shape, dtype='int32') + im_info = tvm.placeholder(np_im_info.shape) def check_device(device): ctx = tvm.context(device, 0) @@ -252,7 +313,7 @@ def test_proposal(): [[1.0, 0.5, 0.7], [1.5, 0.9, 1.6], [1.4, 1.5, 0.8]], [[1.0, 0.5, 0.6], [1.5, 0.9, 2.0], [1.8, 1.0, 0.9]], ]], dtype='float32') - np_im_info = np.array([[48, 48, 1]], dtype='int32') + np_im_info = np.array([[48., 48., 1.]], dtype='float32') np_out = np.array([ [0., 0., 2.8451548,28.38012, 18.154846], [0., 0., 15.354933, 41.96971, 41.245064], @@ -274,7 +335,8 @@ def test_proposal(): if __name__ == "__main__": - test_nms() + test_get_valid_counts() + test_non_max_suppression() test_multibox_prior() test_multibox_detection() test_roi_align() diff --git a/tutorials/frontend/deploy_ssd_gluoncv.py b/tutorials/frontend/deploy_ssd_gluoncv.py new file mode 100644 index 000000000000..6a5d63b9f8cf --- /dev/null +++ b/tutorials/frontend/deploy_ssd_gluoncv.py @@ -0,0 +1,104 @@ +""" +Deploy Single Shot Multibox Detector (SSD) model +================================================ +**Author**: `Yao Wang `_ + +This article is an introductory tutorial to deploy SSD models with TVM. +We will use a GluonCV pre-trained SSD model and convert it to Relay IR. +""" +import tvm + +from matplotlib import pyplot as plt +from nnvm import compiler +from nnvm.frontend import from_mxnet +from nnvm.testing.config import ctx_list +from tvm import relay +from tvm.contrib import graph_runtime +from gluoncv import model_zoo, data, utils + + +###################################################################### +# Preliminaries and parameter setup +# --------------------------------- +# We need to build TVM with sort support. In the TVM root directory: +# +# .. code-block:: bash +# +# echo "set(USE_SORT ON)" > config.mk +# make -j8 +# +# .. note:: +# +# Currently we support compiling SSD on CPU only. +# GPU support is in progress. +# +# To get the best inference performance on CPU, change the +# target argument according to your device and +# follow :ref:`tune_relay_x86` to tune x86 CPUs and +# :ref:`tune_relay_arm` for ARM CPUs. +# +# SSD with VGG as the body network is not supported yet, since the +# x86 conv2d schedule doesn't support dilation.
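+# .. note::
+#
+#    Under the hood, the SSD head lowers to the vision operators introduced
+#    in this change. A rough standalone sketch of that pipeline, mirroring
+#    the calls inside ``multibox_detection`` (the shapes are illustrative):
+#
+#    .. code-block:: python
+#
+#        import tvm
+#        from topi.vision import non_max_suppression, ssd
+#
+#        cls_prob = tvm.placeholder((1, 21, 100), name="cls_prob")
+#        loc_pred = tvm.placeholder((1, 400), name="loc_pred")
+#        anchor = tvm.placeholder((1, 100, 4), name="anchor")
+#        loc, count = ssd.multibox_transform_loc(cls_prob, loc_pred, anchor)
+#        det = non_max_suppression(loc, count, -1, 0.5, False, -1,
+#                                  return_indices=False)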
+ +supported_model = [ + 'ssd_512_resnet18_v1_voc', + 'ssd_512_resnet18_v1_coco', + 'ssd_512_resnet50_v1_voc', + 'ssd_512_resnet50_v1_coco', + 'ssd_512_resnet101_v2_voc', + 'ssd_512_mobilenet1_0_voc', + 'ssd_512_mobilenet1_0_coco', +] + +model_name = "ssd_512_resnet50_v1_voc" +dshape = (1, 3, 512, 512) +dtype = "float32" +target_list = ctx_list() + +###################################################################### +# Download and pre-process demo image + +im_fname = utils.download('https://github.com/dmlc/web-data/blob/master/' + + 'gluoncv/detection/street_small.jpg?raw=true', + path='street_small.jpg') +x, img = data.transforms.presets.ssd.load_test(im_fname, short=512) + +###################################################################### +# Convert and compile model for CPU. + +block = model_zoo.get_model(model_name, pretrained=True) + +def compile(target): + net, params = relay.frontend.from_mxnet(block, {"data": dshape}) + with relay.build_config(opt_level=3): + graph, lib, params = relay.build(net, target, params=params) + return graph, lib, params + +###################################################################### +# Create TVM runtime and do inference + +def run(graph, lib, params, ctx): + # Build TVM runtime + m = graph_runtime.create(graph, lib, ctx) + tvm_input = tvm.nd.array(x.asnumpy(), ctx=ctx) + m.set_input('data', tvm_input) + m.set_input(**params) + # execute + m.run() + # get outputs + class_IDs, scores, bounding_boxs = m.get_output(0), m.get_output(1), m.get_output(2) + return class_IDs, scores, bounding_boxs + +for target, ctx in target_list: + if target == "cuda": + print("GPU not supported yet, skip.") + continue + graph, lib, params = compile(target) + class_IDs, scores, bounding_boxs = run(graph, lib, params, ctx) + +###################################################################### +# Display the result + +ax = utils.viz.plot_bbox(img, bounding_boxs.asnumpy()[0], scores.asnumpy()[0], + class_IDs.asnumpy()[0], class_names=block.classes) +plt.show() diff --git a/tutorials/frontend/from_coreml.py b/tutorials/frontend/from_coreml.py new file mode 100644 index 000000000000..a79e21921068 --- /dev/null +++ b/tutorials/frontend/from_coreml.py @@ -0,0 +1,101 @@ +""" +Compile CoreML Models +===================== +**Author**: `Joshua Z. Zhang `_, \ + `Kazutaka Morita `_ + +This article is an introductory tutorial to deploy CoreML models with Relay. + +To begin, the coremltools module must be installed. + +A quick solution is to install it via pip + +..
code-block:: bash + + pip install -U coremltools --user + +or refer to the official site: +https://github.com/apple/coremltools +""" +import tvm +import tvm.relay as relay +import coremltools as cm +import numpy as np +from PIL import Image + +def download(url, path, overwrite=False): + import os + if os.path.isfile(path) and not overwrite: + print('File {} exists, skip.'.format(path)) + return + print('Downloading from url {} to {}'.format(url, path)) + try: + import urllib.request + urllib.request.urlretrieve(url, path) + except: + import urllib + urllib.urlretrieve(url, path) + +###################################################################### +# Load pretrained CoreML model +# ---------------------------- +# We will download and load a pretrained mobilenet classification network +# provided by Apple in this example. +model_url = 'https://docs-assets.developer.apple.com/coreml/models/MobileNet.mlmodel' +model_file = 'mobilenet.mlmodel' +download(model_url, model_file) +# Now you have mobilenet.mlmodel on disk +mlmodel = cm.models.MLModel(model_file) + +###################################################################### +# Load a test image +# ------------------ +# A single cat dominates the examples! +img_url = 'https://github.com/dmlc/mxnet.js/blob/master/data/cat.png?raw=true' +download(img_url, 'cat.png') +img = Image.open('cat.png').resize((224, 224)) +x = np.transpose(img, (2, 0, 1))[np.newaxis, :] + +###################################################################### +# Compile the model with Relay +# --------------------------- +# We should be familiar with the process by now. +target = 'cuda' +shape_dict = {'image': x.shape} + +# Parse CoreML model and convert into Relay computation graph +func, params = relay.frontend.from_coreml(mlmodel, shape_dict) + +with relay.build_config(opt_level=3): + graph, lib, params = relay.build(func, target, params=params) + +###################################################################### +# Execute on TVM +# ------------------- +# The process is no different from the other examples. +from tvm.contrib import graph_runtime +ctx = tvm.gpu(0) +dtype = 'float32' +m = graph_runtime.create(graph, lib, ctx) +# set inputs +m.set_input('image', tvm.nd.array(x.astype(dtype))) +m.set_input(**params) +# execute +m.run() +# get outputs +tvm_output = m.get_output(0) +top1 = np.argmax(tvm_output.asnumpy()[0]) + +##################################################################### +# Look up synset name +# ------------------- +# Look up the top-1 prediction index in the 1000-class synset. +synset_url = ''.join(['https://gist.githubusercontent.com/zhreshold/', + '4d0b62f3d01426887599d4f7ede23ee5/raw/', + '596b27d23537e5a1b5751d2b0481ef172f58b539/', + 'imagenet1000_clsid_to_human.txt']) +synset_name = 'synset.txt' +download(synset_url, synset_name) +with open(synset_name) as f: + synset = eval(f.read()) +print('Top-1 id', top1, 'class name', synset[top1]) diff --git a/tutorials/frontend/from_mxnet.py b/tutorials/frontend/from_mxnet.py new file mode 100644 index 000000000000..a465350a0df8 --- /dev/null +++ b/tutorials/frontend/from_mxnet.py @@ -0,0 +1,120 @@ +""" +.. _tutorial-from-mxnet: + +Compile MXNet Models +==================== +**Author**: `Joshua Z. Zhang `_, \ + `Kazutaka Morita `_ + +This article is an introductory tutorial to deploy MXNet models with Relay. + +To begin, the mxnet module must be installed. + +A quick solution is + +..
code-block:: bash + + pip install mxnet --user + +or refer to the official installation guide: +https://mxnet.incubator.apache.org/versions/master/install/index.html +""" +# some standard imports +import mxnet as mx +import tvm +import tvm.relay as relay +import numpy as np + +###################################################################### +# Download ResNet18 model from Gluon Model Zoo +# --------------------------------------------- +# In this section, we download a pretrained ImageNet model and classify an image. +from mxnet.gluon.model_zoo.vision import get_model +from mxnet.gluon.utils import download +from PIL import Image +from matplotlib import pyplot as plt +block = get_model('resnet18_v1', pretrained=True) +img_name = 'cat.png' +synset_url = ''.join(['https://gist.githubusercontent.com/zhreshold/', + '4d0b62f3d01426887599d4f7ede23ee5/raw/', + '596b27d23537e5a1b5751d2b0481ef172f58b539/', + 'imagenet1000_clsid_to_human.txt']) +synset_name = 'synset.txt' +download('https://github.com/dmlc/mxnet.js/blob/master/data/cat.png?raw=true', img_name) +download(synset_url, synset_name) +with open(synset_name) as f: + synset = eval(f.read()) +image = Image.open(img_name).resize((224, 224)) +plt.imshow(image) +plt.show() + +def transform_image(image): + image = np.array(image) - np.array([123., 117., 104.]) + image /= np.array([58.395, 57.12, 57.375]) + image = image.transpose((2, 0, 1)) + image = image[np.newaxis, :] + return image + +x = transform_image(image) +print('x', x.shape) + +###################################################################### +# Compile the Graph +# ----------------- +# Now we would like to port the Gluon model to a portable computational graph. +# It takes only a few lines. +# We support MXNet static graph (symbol) and HybridBlock in mxnet.gluon +shape_dict = {'data': x.shape} +func, params = relay.frontend.from_mxnet(block, shape_dict) +# we want a probability, so we add a softmax operator +func = relay.Function(func.params, relay.nn.softmax(func.body), None, func.type_params, func.attrs) + +###################################################################### +# Now compile the graph +target = 'cuda' +with relay.build_config(opt_level=3): + graph, lib, params = relay.build(func, target, params=params) + +###################################################################### +# Execute the portable graph on TVM +# --------------------------------- +# Now, we would like to reproduce the same forward computation using TVM.
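+# The build step produced three artifacts: ``graph`` (the serialized
+# execution graph), ``lib`` (the compiled kernels) and ``params`` (the
+# constant weights); ``graph_runtime.create`` binds them to a device context.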
+from tvm.contrib import graph_runtime
+ctx = tvm.gpu(0)
+dtype = 'float32'
+m = graph_runtime.create(graph, lib, ctx)
+# set inputs
+m.set_input('data', tvm.nd.array(x.astype(dtype)))
+m.set_input(**params)
+# execute
+m.run()
+# get outputs
+tvm_output = m.get_output(0)
+top1 = np.argmax(tvm_output.asnumpy()[0])
+print('TVM prediction top-1:', top1, synset[top1])
+
+######################################################################
+# Use MXNet symbol with pretrained weights
+# ----------------------------------------
+# MXNet often uses `arg_params` and `aux_params` to store network parameters
+# separately; here we show how to use these weights with the existing API.
+def block2symbol(block):
+    data = mx.sym.Variable('data')
+    sym = block(data)
+    args = {}
+    auxs = {}
+    for k, v in block.collect_params().items():
+        args[k] = mx.nd.array(v.data().asnumpy())
+    return sym, args, auxs
+mx_sym, args, auxs = block2symbol(block)
+# usually we would save/load it as a checkpoint
+mx.model.save_checkpoint('resnet18_v1', 0, mx_sym, args, auxs)
+# there are 'resnet18_v1-0000.params' and 'resnet18_v1-symbol.json' on disk
+
+######################################################################
+# For a normal MXNet model, we start from here
+mx_sym, args, auxs = mx.model.load_checkpoint('resnet18_v1', 0)
+# now we use the same API to get the Relay computation graph
+relay_func, relay_params = relay.frontend.from_mxnet(mx_sym, shape_dict,
+                                                     arg_params=args, aux_params=auxs)
+# repeat the same steps to run this model using TVM, as sketched below
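+
+######################################################################
+# A minimal sketch of those remaining steps, assuming the same CUDA
+# setup as above and reusing `x` and `synset` from earlier in this
+# tutorial (this mirrors the compile-and-run flow shown previously):
+with relay.build_config(opt_level=3):
+    graph, lib, params = relay.build(relay_func, 'cuda', params=relay_params)
+m = graph_runtime.create(graph, lib, tvm.gpu(0))
+m.set_input('data', tvm.nd.array(x.astype('float32')))
+m.set_input(**params)
+m.run()
+# the checkpoint model should predict the same class as the Gluon block
+top1 = np.argmax(m.get_output(0).asnumpy()[0])
+print('TVM prediction top-1:', top1, synset[top1])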
diff --git a/tutorials/nnvm/deploy_ssd.py b/tutorials/nnvm/deploy_ssd_mxnet.py
similarity index 98%
rename from tutorials/nnvm/deploy_ssd.py
rename to tutorials/nnvm/deploy_ssd_mxnet.py
index eadb8fd28e0c..1a71c96eaa0c 100644
--- a/tutorials/nnvm/deploy_ssd.py
+++ b/tutorials/nnvm/deploy_ssd_mxnet.py
@@ -61,7 +61,7 @@
 image_url = "https://cloud.githubusercontent.com/assets/3307514/20012567/" \
             "cbb60336-a27d-11e6-93ff-cbc3f09f5c9e.jpg"
 inference_symbol_folder = \
-"c1904e900848df4548ce5dfb18c719c7-a28c4856c827fe766aa3da0e35bad41d44f0fb26"
+    "c1904e900848df4548ce5dfb18c719c7-a28c4856c827fe766aa3da0e35bad41d44f0fb26"
 inference_symbol_url = "https://gist.github.com/kevinthesun/c1904e900848df4548ce5dfb18c719c7/" \
                        "archive/a28c4856c827fe766aa3da0e35bad41d44f0fb26.zip"
diff --git a/tutorials/relay/deploy_model_on_rasp.py b/tutorials/relay/deploy_model_on_rasp.py
new file mode 100644
index 000000000000..b90127b3858e
--- /dev/null
+++ b/tutorials/relay/deploy_model_on_rasp.py
@@ -0,0 +1,207 @@
+"""
+.. _tutorial-deploy-model-on-rasp:
+
+Deploy the Pretrained Model on Raspberry Pi
+===========================================
+**Author**: `Ziheng Jiang `_, \
+            `Hiroyuki Makino `_
+
+This is an example of using Relay to compile a ResNet model and deploy
+it on a Raspberry Pi.
+"""
+
+import tvm
+import tvm.relay as relay
+from tvm import rpc
+from tvm.contrib import util, graph_runtime as runtime
+
+######################################################################
+# .. _build-tvm-runtime-on-device:
+#
+# Build TVM Runtime on Device
+# ---------------------------
+#
+# The first step is to build the TVM runtime on the remote device.
+#
+# .. note::
+#
+#   All instructions in both this section and the next section should be
+#   executed on the target device, e.g. the Raspberry Pi, and we assume
+#   the device runs Linux.
+#
+# Since we do compilation on the local machine, the remote device is only used
+# for running the generated code. We only need to build the TVM runtime on
+# the remote device.
+#
+# .. code-block:: bash
+#
+#   git clone --recursive https://github.com/dmlc/tvm
+#   cd tvm
+#   mkdir build
+#   cp cmake/config.cmake build
+#   cd build
+#   cmake ..
+#   make runtime -j4
+#
+# After building the runtime successfully, we need to set environment variables
+# in the :code:`~/.bashrc` file. We can edit :code:`~/.bashrc`
+# using :code:`vi ~/.bashrc` and add the line below (assuming your TVM
+# directory is in :code:`~/tvm`):
+#
+# .. code-block:: bash
+#
+#   export PYTHONPATH=$PYTHONPATH:~/tvm/python
+#
+# To update the environment variables, execute :code:`source ~/.bashrc`.

+######################################################################
+# Set Up RPC Server on Device
+# ---------------------------
+# To start an RPC server, run the following command on your remote device
+# (which is the Raspberry Pi in our example).
+#
+# .. code-block:: bash
+#
+#   python -m tvm.exec.rpc_server --host 0.0.0.0 --port=9090
+#
+# If you see the line below, it means the RPC server started
+# successfully on your device.
+#
+# .. code-block:: bash
+#
+#   INFO:root:RPCServer: bind to 0.0.0.0:9090
+#

+######################################################################
+# Prepare the Pre-trained Model
+# -----------------------------
+# Back on the host machine, which should have a full TVM installed (with LLVM).
+#
+# We will use a pre-trained model from the
+# `MXNet Gluon model zoo `_.
+# You can find more details about this part in the tutorial :ref:`tutorial-from-mxnet`.
+
+from mxnet.gluon.model_zoo.vision import get_model
+from mxnet.gluon.utils import download
+from PIL import Image
+import numpy as np
+
+# one line to get the model
+block = get_model('resnet18_v1', pretrained=True)
+
+######################################################################
+# In order to test our model, here we download an image of a cat and
+# transform its format.
+img_name = 'cat.png'
+download('https://github.com/dmlc/mxnet.js/blob/master/data/cat.png?raw=true', img_name)
+image = Image.open(img_name).resize((224, 224))
+
+def transform_image(image):
+    image = np.array(image) - np.array([123., 117., 104.])
+    image /= np.array([58.395, 57.12, 57.375])
+    image = image.transpose((2, 0, 1))
+    image = image[np.newaxis, :]
+    return image
+
+x = transform_image(image)
+
+######################################################################
+# The synset is used to transform the label from the ImageNet class number
+# to a word humans can understand.
+synset_url = ''.join(['https://gist.githubusercontent.com/zhreshold/',
+                      '4d0b62f3d01426887599d4f7ede23ee5/raw/',
+                      '596b27d23537e5a1b5751d2b0481ef172f58b539/',
+                      'imagenet1000_clsid_to_human.txt'])
+synset_name = 'synset.txt'
+download(synset_url, synset_name)
+with open(synset_name) as f:
+    synset = eval(f.read())
+
+######################################################################
+# Now we would like to port the Gluon model to a portable computational graph.
+# It's as easy as several lines.
+
+# We support MXNet static graph (symbol) and HybridBlock in mxnet.gluon
+shape_dict = {'data': x.shape}
+func, params = relay.frontend.from_mxnet(block, shape_dict)
+# we want a probability, so add a softmax operator
+func = relay.Function(func.params, relay.nn.softmax(func.body), None, func.type_params, func.attrs)
+
+######################################################################
+# Here are some basic data workload configurations.
+batch_size = 1
+num_classes = 1000
+image_shape = (3, 224, 224)
+data_shape = (batch_size,) + image_shape
+
+######################################################################
+# Compile the Graph
+# -----------------
+# To compile the graph, we call the :any:`relay.build` function
+# with the graph configuration and parameters. However, you cannot
+# deploy an x86 program on a device with an ARM instruction set. This means
+# Relay also needs to know the compilation options of the target device,
+# apart from the arguments :code:`net` and :code:`params` that specify the
+# deep learning workload. The option matters: different options
+# lead to very different performance.

+######################################################################
+# If we run the example on our x86 server for demonstration, we can simply
+# set it to :code:`llvm`. If running it on the Raspberry Pi, we need to
+# specify its instruction set. Set :code:`local_demo` to False if you want
+# to run this tutorial with a real device.
+
+local_demo = True
+
+if local_demo:
+    target = tvm.target.create('llvm')
+else:
+    target = tvm.target.arm_cpu('rasp3b')
+    # The above line is a simple form of
+    # target = tvm.target.create('llvm -device=arm_cpu -model=bcm2837 -target=armv7l-linux-gnueabihf -mattr=+neon')
+
+with relay.build_config(opt_level=3):
+    graph, lib, params = relay.build(func, target, params=params)
+
+# After `relay.build`, you will get three return values: the graph,
+# the library and the new parameters, since we do some optimization that
+# changes the parameters but keeps the result of the model the same.
+
+# Save the library to a local temporary directory.
+tmp = util.tempdir()
+lib_fname = tmp.relpath('net.tar')
+lib.export_library(lib_fname)
+
+######################################################################
+# Deploy the Model Remotely by RPC
+# --------------------------------
+# With RPC, you can deploy the model remotely from your host machine
+# to the remote device.
+
+# obtain an RPC session from the remote device.
+if local_demo:
+    remote = rpc.LocalSession()
+else:
+    # The following is my environment; change this to the IP address of your target device
+    host = '10.77.1.162'
+    port = 9090
+    remote = rpc.connect(host, port)
+
+# upload the library to the remote device and load it
+remote.upload(lib_fname)
+rlib = remote.load_module('net.tar')
+
+# create the remote runtime module
+ctx = remote.cpu(0)
+module = runtime.create(graph, rlib, ctx)
+# set parameters (upload params to the remote device; this may take a while)
+module.set_input(**params)
+# set input data
+module.set_input('data', tvm.nd.array(x.astype('float32')))
+# run
+module.run()
+# get output
+out = module.get_output(0)
+# get top1 result
+top1 = np.argmax(out.asnumpy())
+print('TVM prediction top-1: {}'.format(synset[top1]))
diff --git a/tutorials/relay/using_external_lib.py b/tutorials/relay/using_external_lib.py
new file mode 100644
index 000000000000..fb4b52ea5cf1
--- /dev/null
+++ b/tutorials/relay/using_external_lib.py
@@ -0,0 +1,544 @@
+"""
+Using External Libraries in Relay
+=================================
+**Author**: `Masahiro Masuda `_, `Truman Tian `_
+
+This is a short tutorial on how to use external libraries such as cuDNN or cuBLAS with Relay.
+
+Relay uses TVM internally to generate target-specific code. For example, with the CUDA backend TVM generates CUDA kernels for all layers in the user-provided network.
+But sometimes it is also helpful to incorporate external libraries developed by various vendors into Relay.
+Luckily, TVM has a mechanism to transparently call into these libraries.
+For Relay users, all we need to do is set the target string appropriately.
+
+Before we can use external libraries from Relay, your TVM needs to be built with the libraries you want to use.
+For example, to use cuDNN, the USE_CUDNN option in `cmake/config.cmake` needs to be enabled, and cuDNN include and library directories need to be specified if necessary.
+
+To begin with, we import Relay and TVM.
+"""
+import tvm
+import numpy as np
+from tvm.contrib import graph_runtime as runtime
+from tvm import relay
+from tvm.relay import testing
+
+######################################################################
+# Create a simple network
+# -----------------------
+# Let's create a very simple network for demonstration.
+# It consists of convolution, batch normalization, and ReLU activation.
+
+out_channels = 16
+batch_size = 1
+
+data = relay.var("data", relay.TensorType((batch_size, 3, 224, 224), "float32"))
+weight = relay.var("weight")
+bn_gamma = relay.var("bn_gamma")
+bn_beta = relay.var("bn_beta")
+bn_mmean = relay.var("bn_mean")
+bn_mvar = relay.var("bn_var")
+
+simple_net = relay.nn.conv2d(data=data, weight=weight, kernel_size=(3, 3), channels=out_channels, padding=(1, 1))
+simple_net = relay.nn.batch_norm(simple_net, bn_gamma, bn_beta, bn_mmean, bn_mvar)[0]
+simple_net = relay.nn.relu(simple_net)
+simple_net = relay.Function(relay.ir_pass.free_vars(simple_net), simple_net)
+
+data_shape = (batch_size, 3, 224, 224)
+net, params = testing.create_workload(simple_net)
+
+######################################################################
+# Build and run with cuda backend
+# -------------------------------
+# We build and run this network with the CUDA backend, as usual.
+# By setting the logging level to DEBUG, the result of Relay graph compilation is dumped as pseudo code.
+import logging
+logging.basicConfig(level=logging.DEBUG)  # to dump TVM IR after fusion
+
+target = "cuda"
+graph, lib, params = relay.build_module.build(
+    net, target, params=params)
+
+ctx = tvm.context(target, 0)
+data = np.random.uniform(-1, 1, size=data_shape).astype("float32")
+module = runtime.create(graph, lib, ctx)
+module.set_input(**params)
+module.set_input("data", data)
+module.run()
+out_shape = (batch_size, out_channels, 224, 224)
+out = module.get_output(0, tvm.nd.empty(out_shape))
+out_cuda = out.asnumpy()
+
+######################################################################
+# The generated pseudo code should look something like the following.
+# Note how bias add, batch normalization, and ReLU activation are fused into the convolution kernel.
+# TVM generates a single, fused kernel from this representation.
+#
+# .. 
code-block:: text +# +# produce tensor { +# // attr [iter_var(blockIdx.z, , blockIdx.z)] thread_extent = 1 +# // attr [compute] storage_scope = "local" +# allocate compute[float32 * 32] +# // attr [pad_temp.shared] storage_scope = "shared" +# allocate pad_temp.shared[float32 * 180] +# // attr [placeholder.shared] storage_scope = "shared" +# allocate placeholder.shared[float32 * 144] +# // attr [iter_var(blockIdx.y, , blockIdx.y)] thread_extent = 28 +# // attr [iter_var(blockIdx.x, , blockIdx.x)] thread_extent = 14 +# // attr [iter_var(threadIdx.z, , threadIdx.z)] thread_extent = 4 +# // attr [iter_var(threadIdx.y, , threadIdx.y)] thread_extent = 1 +# // attr [iter_var(threadIdx.x, , threadIdx.x)] thread_extent = 16 +# produce compute { +# compute[0] = 0.000000f +# compute[1] = 0.000000f +# compute[2] = 0.000000f +# compute[3] = 0.000000f +# compute[4] = 0.000000f +# compute[5] = 0.000000f +# compute[6] = 0.000000f +# compute[7] = 0.000000f +# compute[8] = 0.000000f +# compute[9] = 0.000000f +# compute[10] = 0.000000f +# compute[11] = 0.000000f +# compute[12] = 0.000000f +# compute[13] = 0.000000f +# compute[14] = 0.000000f +# compute[15] = 0.000000f +# compute[16] = 0.000000f +# compute[17] = 0.000000f +# compute[18] = 0.000000f +# compute[19] = 0.000000f +# compute[20] = 0.000000f +# compute[21] = 0.000000f +# compute[22] = 0.000000f +# compute[23] = 0.000000f +# compute[24] = 0.000000f +# compute[25] = 0.000000f +# compute[26] = 0.000000f +# compute[27] = 0.000000f +# compute[28] = 0.000000f +# compute[29] = 0.000000f +# compute[30] = 0.000000f +# compute[31] = 0.000000f +# for (rc.outer, 0, 3) { +# produce pad_temp.shared { +# // attr [iter_var(threadIdx.z, , threadIdx.z)] thread_extent = 4 +# // attr [iter_var(threadIdx.y, , threadIdx.y)] thread_extent = 1 +# // attr [iter_var(threadIdx.x, , threadIdx.x)] thread_extent = 16 +# if (likely(((threadIdx.z*15) < (60 - threadIdx.x)))) { +# if (likely((threadIdx.x < 15))) { +# pad_temp.shared[(((((threadIdx.z*15) + threadIdx.x)/60)*180) + ((((((threadIdx.z*15) + threadIdx.x)/6) % 10)*18) + ((((threadIdx.z*3) + threadIdx.x)*3) % 18)))] = tvm_if_then_else((((((1 - ((((threadIdx.z*15) + threadIdx.x)/6) % 10)) <= (blockIdx.y*8)) && ((blockIdx.y*8) < (225 - ((((threadIdx.z*15) + threadIdx.x)/6) % 10)))) && ((1 - ((((threadIdx.z*3) + threadIdx.x)*3) % 18)) <= (blockIdx.x*16))) && ((blockIdx.x*16) < (225 - ((((threadIdx.z*3) + threadIdx.x)*3) % 18)))), placeholder[((((((((blockIdx.y*112) + blockIdx.x) + (rc.outer*3136)) + ((((threadIdx.z*15) + threadIdx.x)/60)*9408))*16) + ((((threadIdx.z*3) + threadIdx.x)*3) % 18)) + (((((threadIdx.z*15) + threadIdx.x)/6) % 10)*224)) + -225)], 0.000000f) +# pad_temp.shared[(((((((threadIdx.z*15) + threadIdx.x)*3) + 1)/180)*180) + ((((((((threadIdx.z*15) + threadIdx.x)*3) + 1)/18) % 10)*18) + (((((threadIdx.z*3) + threadIdx.x)*3) + 1) % 18)))] = tvm_if_then_else((((((1 - ((((((threadIdx.z*15) + threadIdx.x)*3) + 1)/18) % 10)) <= (blockIdx.y*8)) && ((blockIdx.y*8) < (225 - ((((((threadIdx.z*15) + threadIdx.x)*3) + 1)/18) % 10)))) && ((1 - (((((threadIdx.z*3) + threadIdx.x)*3) + 1) % 18)) <= (blockIdx.x*16))) && ((blockIdx.x*16) < (225 - (((((threadIdx.z*3) + threadIdx.x)*3) + 1) % 18)))), placeholder[((((((((blockIdx.y*112) + blockIdx.x) + (rc.outer*3136)) + ((((((threadIdx.z*15) + threadIdx.x)*3) + 1)/180)*9408))*16) + (((((threadIdx.z*3) + threadIdx.x)*3) + 1) % 18)) + (((((((threadIdx.z*15) + threadIdx.x)*3) + 1)/18) % 10)*224)) + -225)], 0.000000f) +# pad_temp.shared[(((((((threadIdx.z*15) + threadIdx.x)*3) + 
2)/180)*180) + ((((((((threadIdx.z*15) + threadIdx.x)*3) + 2)/18) % 10)*18) + (((((threadIdx.z*3) + threadIdx.x)*3) + 2) % 18)))] = tvm_if_then_else((((((1 - ((((((threadIdx.z*15) + threadIdx.x)*3) + 2)/18) % 10)) <= (blockIdx.y*8)) && ((blockIdx.y*8) < (225 - ((((((threadIdx.z*15) + threadIdx.x)*3) + 2)/18) % 10)))) && ((1 - (((((threadIdx.z*3) + threadIdx.x)*3) + 2) % 18)) <= (blockIdx.x*16))) && ((blockIdx.x*16) < (225 - (((((threadIdx.z*3) + threadIdx.x)*3) + 2) % 18)))), placeholder[((((((((blockIdx.y*112) + blockIdx.x) + (rc.outer*3136)) + ((((((threadIdx.z*15) + threadIdx.x)*3) + 2)/180)*9408))*16) + (((((threadIdx.z*3) + threadIdx.x)*3) + 2) % 18)) + (((((((threadIdx.z*15) + threadIdx.x)*3) + 2)/18) % 10)*224)) + -225)], 0.000000f) +# } +# } +# } +# produce placeholder.shared { +# // attr [iter_var(threadIdx.z, , threadIdx.z)] thread_extent = 4 +# // attr [iter_var(threadIdx.y, , threadIdx.y)] thread_extent = 1 +# // attr [iter_var(threadIdx.x, , threadIdx.x)] thread_extent = 16 +# if (likely(((threadIdx.z*4) < (16 - (threadIdx.x/3))))) { +# if (likely(((threadIdx.z*12) < (48 - threadIdx.x)))) { +# if (likely((threadIdx.x < 12))) { +# placeholder.shared[(((((threadIdx.z*4) + (threadIdx.x/3))*3) + (threadIdx.x % 3))*3)] = placeholder[(((((rc.outer + (threadIdx.z*12)) + ((threadIdx.x/3)*3))*3) + (threadIdx.x % 3))*3)] +# placeholder.shared[((((((threadIdx.z*4) + (threadIdx.x/3))*3) + (threadIdx.x % 3))*3) + 1)] = placeholder[((((((rc.outer + (threadIdx.z*12)) + ((threadIdx.x/3)*3))*3) + (threadIdx.x % 3))*3) + 1)] +# placeholder.shared[((((((threadIdx.z*4) + (threadIdx.x/3))*3) + (threadIdx.x % 3))*3) + 2)] = placeholder[((((((rc.outer + (threadIdx.z*12)) + ((threadIdx.x/3)*3))*3) + (threadIdx.x % 3))*3) + 2)] +# } +# } +# } +# } +# compute[0] = (compute[0] + (pad_temp.shared[threadIdx.x]*placeholder.shared[(threadIdx.z*36)])) +# compute[1] = (compute[1] + (pad_temp.shared[(threadIdx.x + 18)]*placeholder.shared[(threadIdx.z*36)])) +# compute[2] = (compute[2] + (pad_temp.shared[(threadIdx.x + 36)]*placeholder.shared[(threadIdx.z*36)])) +# compute[3] = (compute[3] + (pad_temp.shared[(threadIdx.x + 54)]*placeholder.shared[(threadIdx.z*36)])) +# compute[4] = (compute[4] + (pad_temp.shared[(threadIdx.x + 72)]*placeholder.shared[(threadIdx.z*36)])) +# compute[5] = (compute[5] + (pad_temp.shared[(threadIdx.x + 90)]*placeholder.shared[(threadIdx.z*36)])) +# compute[6] = (compute[6] + (pad_temp.shared[(threadIdx.x + 108)]*placeholder.shared[(threadIdx.z*36)])) +# compute[7] = (compute[7] + (pad_temp.shared[(threadIdx.x + 126)]*placeholder.shared[(threadIdx.z*36)])) +# compute[8] = (compute[8] + (pad_temp.shared[threadIdx.x]*placeholder.shared[((threadIdx.z*36) + 9)])) +# compute[9] = (compute[9] + (pad_temp.shared[(threadIdx.x + 18)]*placeholder.shared[((threadIdx.z*36) + 9)])) +# compute[10] = (compute[10] + (pad_temp.shared[(threadIdx.x + 36)]*placeholder.shared[((threadIdx.z*36) + 9)])) +# compute[11] = (compute[11] + (pad_temp.shared[(threadIdx.x + 54)]*placeholder.shared[((threadIdx.z*36) + 9)])) +# compute[12] = (compute[12] + (pad_temp.shared[(threadIdx.x + 72)]*placeholder.shared[((threadIdx.z*36) + 9)])) +# compute[13] = (compute[13] + (pad_temp.shared[(threadIdx.x + 90)]*placeholder.shared[((threadIdx.z*36) + 9)])) +# compute[14] = (compute[14] + (pad_temp.shared[(threadIdx.x + 108)]*placeholder.shared[((threadIdx.z*36) + 9)])) +# compute[15] = (compute[15] + (pad_temp.shared[(threadIdx.x + 126)]*placeholder.shared[((threadIdx.z*36) + 9)])) +# compute[16] = (compute[16] + 
(pad_temp.shared[threadIdx.x]*placeholder.shared[((threadIdx.z*36) + 18)])) +# compute[17] = (compute[17] + (pad_temp.shared[(threadIdx.x + 18)]*placeholder.shared[((threadIdx.z*36) + 18)])) +# compute[18] = (compute[18] + (pad_temp.shared[(threadIdx.x + 36)]*placeholder.shared[((threadIdx.z*36) + 18)])) +# compute[19] = (compute[19] + (pad_temp.shared[(threadIdx.x + 54)]*placeholder.shared[((threadIdx.z*36) + 18)])) +# compute[20] = (compute[20] + (pad_temp.shared[(threadIdx.x + 72)]*placeholder.shared[((threadIdx.z*36) + 18)])) +# compute[21] = (compute[21] + (pad_temp.shared[(threadIdx.x + 90)]*placeholder.shared[((threadIdx.z*36) + 18)])) +# compute[22] = (compute[22] + (pad_temp.shared[(threadIdx.x + 108)]*placeholder.shared[((threadIdx.z*36) + 18)])) +# compute[23] = (compute[23] + (pad_temp.shared[(threadIdx.x + 126)]*placeholder.shared[((threadIdx.z*36) + 18)])) +# compute[24] = (compute[24] + (pad_temp.shared[threadIdx.x]*placeholder.shared[((threadIdx.z*36) + 27)])) +# compute[25] = (compute[25] + (pad_temp.shared[(threadIdx.x + 18)]*placeholder.shared[((threadIdx.z*36) + 27)])) +# compute[26] = (compute[26] + (pad_temp.shared[(threadIdx.x + 36)]*placeholder.shared[((threadIdx.z*36) + 27)])) +# compute[27] = (compute[27] + (pad_temp.shared[(threadIdx.x + 54)]*placeholder.shared[((threadIdx.z*36) + 27)])) +# compute[28] = (compute[28] + (pad_temp.shared[(threadIdx.x + 72)]*placeholder.shared[((threadIdx.z*36) + 27)])) +# compute[29] = (compute[29] + (pad_temp.shared[(threadIdx.x + 90)]*placeholder.shared[((threadIdx.z*36) + 27)])) +# compute[30] = (compute[30] + (pad_temp.shared[(threadIdx.x + 108)]*placeholder.shared[((threadIdx.z*36) + 27)])) +# compute[31] = (compute[31] + (pad_temp.shared[(threadIdx.x + 126)]*placeholder.shared[((threadIdx.z*36) + 27)])) +# compute[0] = (compute[0] + (pad_temp.shared[(threadIdx.x + 1)]*placeholder.shared[((threadIdx.z*36) + 1)])) +# compute[1] = (compute[1] + (pad_temp.shared[(threadIdx.x + 19)]*placeholder.shared[((threadIdx.z*36) + 1)])) +# compute[2] = (compute[2] + (pad_temp.shared[(threadIdx.x + 37)]*placeholder.shared[((threadIdx.z*36) + 1)])) +# compute[3] = (compute[3] + (pad_temp.shared[(threadIdx.x + 55)]*placeholder.shared[((threadIdx.z*36) + 1)])) +# compute[4] = (compute[4] + (pad_temp.shared[(threadIdx.x + 73)]*placeholder.shared[((threadIdx.z*36) + 1)])) +# compute[5] = (compute[5] + (pad_temp.shared[(threadIdx.x + 91)]*placeholder.shared[((threadIdx.z*36) + 1)])) +# compute[6] = (compute[6] + (pad_temp.shared[(threadIdx.x + 109)]*placeholder.shared[((threadIdx.z*36) + 1)])) +# compute[7] = (compute[7] + (pad_temp.shared[(threadIdx.x + 127)]*placeholder.shared[((threadIdx.z*36) + 1)])) +# compute[8] = (compute[8] + (pad_temp.shared[(threadIdx.x + 1)]*placeholder.shared[((threadIdx.z*36) + 10)])) +# compute[9] = (compute[9] + (pad_temp.shared[(threadIdx.x + 19)]*placeholder.shared[((threadIdx.z*36) + 10)])) +# compute[10] = (compute[10] + (pad_temp.shared[(threadIdx.x + 37)]*placeholder.shared[((threadIdx.z*36) + 10)])) +# compute[11] = (compute[11] + (pad_temp.shared[(threadIdx.x + 55)]*placeholder.shared[((threadIdx.z*36) + 10)])) +# compute[12] = (compute[12] + (pad_temp.shared[(threadIdx.x + 73)]*placeholder.shared[((threadIdx.z*36) + 10)])) +# compute[13] = (compute[13] + (pad_temp.shared[(threadIdx.x + 91)]*placeholder.shared[((threadIdx.z*36) + 10)])) +# compute[14] = (compute[14] + (pad_temp.shared[(threadIdx.x + 109)]*placeholder.shared[((threadIdx.z*36) + 10)])) +# compute[15] = (compute[15] + 
(pad_temp.shared[(threadIdx.x + 127)]*placeholder.shared[((threadIdx.z*36) + 10)])) +# compute[16] = (compute[16] + (pad_temp.shared[(threadIdx.x + 1)]*placeholder.shared[((threadIdx.z*36) + 19)])) +# compute[17] = (compute[17] + (pad_temp.shared[(threadIdx.x + 19)]*placeholder.shared[((threadIdx.z*36) + 19)])) +# compute[18] = (compute[18] + (pad_temp.shared[(threadIdx.x + 37)]*placeholder.shared[((threadIdx.z*36) + 19)])) +# compute[19] = (compute[19] + (pad_temp.shared[(threadIdx.x + 55)]*placeholder.shared[((threadIdx.z*36) + 19)])) +# compute[20] = (compute[20] + (pad_temp.shared[(threadIdx.x + 73)]*placeholder.shared[((threadIdx.z*36) + 19)])) +# compute[21] = (compute[21] + (pad_temp.shared[(threadIdx.x + 91)]*placeholder.shared[((threadIdx.z*36) + 19)])) +# compute[22] = (compute[22] + (pad_temp.shared[(threadIdx.x + 109)]*placeholder.shared[((threadIdx.z*36) + 19)])) +# compute[23] = (compute[23] + (pad_temp.shared[(threadIdx.x + 127)]*placeholder.shared[((threadIdx.z*36) + 19)])) +# compute[24] = (compute[24] + (pad_temp.shared[(threadIdx.x + 1)]*placeholder.shared[((threadIdx.z*36) + 28)])) +# compute[25] = (compute[25] + (pad_temp.shared[(threadIdx.x + 19)]*placeholder.shared[((threadIdx.z*36) + 28)])) +# compute[26] = (compute[26] + (pad_temp.shared[(threadIdx.x + 37)]*placeholder.shared[((threadIdx.z*36) + 28)])) +# compute[27] = (compute[27] + (pad_temp.shared[(threadIdx.x + 55)]*placeholder.shared[((threadIdx.z*36) + 28)])) +# compute[28] = (compute[28] + (pad_temp.shared[(threadIdx.x + 73)]*placeholder.shared[((threadIdx.z*36) + 28)])) +# compute[29] = (compute[29] + (pad_temp.shared[(threadIdx.x + 91)]*placeholder.shared[((threadIdx.z*36) + 28)])) +# compute[30] = (compute[30] + (pad_temp.shared[(threadIdx.x + 109)]*placeholder.shared[((threadIdx.z*36) + 28)])) +# compute[31] = (compute[31] + (pad_temp.shared[(threadIdx.x + 127)]*placeholder.shared[((threadIdx.z*36) + 28)])) +# compute[0] = (compute[0] + (pad_temp.shared[(threadIdx.x + 2)]*placeholder.shared[((threadIdx.z*36) + 2)])) +# compute[1] = (compute[1] + (pad_temp.shared[(threadIdx.x + 20)]*placeholder.shared[((threadIdx.z*36) + 2)])) +# compute[2] = (compute[2] + (pad_temp.shared[(threadIdx.x + 38)]*placeholder.shared[((threadIdx.z*36) + 2)])) +# compute[3] = (compute[3] + (pad_temp.shared[(threadIdx.x + 56)]*placeholder.shared[((threadIdx.z*36) + 2)])) +# compute[4] = (compute[4] + (pad_temp.shared[(threadIdx.x + 74)]*placeholder.shared[((threadIdx.z*36) + 2)])) +# compute[5] = (compute[5] + (pad_temp.shared[(threadIdx.x + 92)]*placeholder.shared[((threadIdx.z*36) + 2)])) +# compute[6] = (compute[6] + (pad_temp.shared[(threadIdx.x + 110)]*placeholder.shared[((threadIdx.z*36) + 2)])) +# compute[7] = (compute[7] + (pad_temp.shared[(threadIdx.x + 128)]*placeholder.shared[((threadIdx.z*36) + 2)])) +# compute[8] = (compute[8] + (pad_temp.shared[(threadIdx.x + 2)]*placeholder.shared[((threadIdx.z*36) + 11)])) +# compute[9] = (compute[9] + (pad_temp.shared[(threadIdx.x + 20)]*placeholder.shared[((threadIdx.z*36) + 11)])) +# compute[10] = (compute[10] + (pad_temp.shared[(threadIdx.x + 38)]*placeholder.shared[((threadIdx.z*36) + 11)])) +# compute[11] = (compute[11] + (pad_temp.shared[(threadIdx.x + 56)]*placeholder.shared[((threadIdx.z*36) + 11)])) +# compute[12] = (compute[12] + (pad_temp.shared[(threadIdx.x + 74)]*placeholder.shared[((threadIdx.z*36) + 11)])) +# compute[13] = (compute[13] + (pad_temp.shared[(threadIdx.x + 92)]*placeholder.shared[((threadIdx.z*36) + 11)])) +# compute[14] = (compute[14] + 
(pad_temp.shared[(threadIdx.x + 110)]*placeholder.shared[((threadIdx.z*36) + 11)])) +# compute[15] = (compute[15] + (pad_temp.shared[(threadIdx.x + 128)]*placeholder.shared[((threadIdx.z*36) + 11)])) +# compute[16] = (compute[16] + (pad_temp.shared[(threadIdx.x + 2)]*placeholder.shared[((threadIdx.z*36) + 20)])) +# compute[17] = (compute[17] + (pad_temp.shared[(threadIdx.x + 20)]*placeholder.shared[((threadIdx.z*36) + 20)])) +# compute[18] = (compute[18] + (pad_temp.shared[(threadIdx.x + 38)]*placeholder.shared[((threadIdx.z*36) + 20)])) +# compute[19] = (compute[19] + (pad_temp.shared[(threadIdx.x + 56)]*placeholder.shared[((threadIdx.z*36) + 20)])) +# compute[20] = (compute[20] + (pad_temp.shared[(threadIdx.x + 74)]*placeholder.shared[((threadIdx.z*36) + 20)])) +# compute[21] = (compute[21] + (pad_temp.shared[(threadIdx.x + 92)]*placeholder.shared[((threadIdx.z*36) + 20)])) +# compute[22] = (compute[22] + (pad_temp.shared[(threadIdx.x + 110)]*placeholder.shared[((threadIdx.z*36) + 20)])) +# compute[23] = (compute[23] + (pad_temp.shared[(threadIdx.x + 128)]*placeholder.shared[((threadIdx.z*36) + 20)])) +# compute[24] = (compute[24] + (pad_temp.shared[(threadIdx.x + 2)]*placeholder.shared[((threadIdx.z*36) + 29)])) +# compute[25] = (compute[25] + (pad_temp.shared[(threadIdx.x + 20)]*placeholder.shared[((threadIdx.z*36) + 29)])) +# compute[26] = (compute[26] + (pad_temp.shared[(threadIdx.x + 38)]*placeholder.shared[((threadIdx.z*36) + 29)])) +# compute[27] = (compute[27] + (pad_temp.shared[(threadIdx.x + 56)]*placeholder.shared[((threadIdx.z*36) + 29)])) +# compute[28] = (compute[28] + (pad_temp.shared[(threadIdx.x + 74)]*placeholder.shared[((threadIdx.z*36) + 29)])) +# compute[29] = (compute[29] + (pad_temp.shared[(threadIdx.x + 92)]*placeholder.shared[((threadIdx.z*36) + 29)])) +# compute[30] = (compute[30] + (pad_temp.shared[(threadIdx.x + 110)]*placeholder.shared[((threadIdx.z*36) + 29)])) +# compute[31] = (compute[31] + (pad_temp.shared[(threadIdx.x + 128)]*placeholder.shared[((threadIdx.z*36) + 29)])) +# compute[0] = (compute[0] + (pad_temp.shared[(threadIdx.x + 18)]*placeholder.shared[((threadIdx.z*36) + 3)])) +# compute[1] = (compute[1] + (pad_temp.shared[(threadIdx.x + 36)]*placeholder.shared[((threadIdx.z*36) + 3)])) +# compute[2] = (compute[2] + (pad_temp.shared[(threadIdx.x + 54)]*placeholder.shared[((threadIdx.z*36) + 3)])) +# compute[3] = (compute[3] + (pad_temp.shared[(threadIdx.x + 72)]*placeholder.shared[((threadIdx.z*36) + 3)])) +# compute[4] = (compute[4] + (pad_temp.shared[(threadIdx.x + 90)]*placeholder.shared[((threadIdx.z*36) + 3)])) +# compute[5] = (compute[5] + (pad_temp.shared[(threadIdx.x + 108)]*placeholder.shared[((threadIdx.z*36) + 3)])) +# compute[6] = (compute[6] + (pad_temp.shared[(threadIdx.x + 126)]*placeholder.shared[((threadIdx.z*36) + 3)])) +# compute[7] = (compute[7] + (pad_temp.shared[(threadIdx.x + 144)]*placeholder.shared[((threadIdx.z*36) + 3)])) +# compute[8] = (compute[8] + (pad_temp.shared[(threadIdx.x + 18)]*placeholder.shared[((threadIdx.z*36) + 12)])) +# compute[9] = (compute[9] + (pad_temp.shared[(threadIdx.x + 36)]*placeholder.shared[((threadIdx.z*36) + 12)])) +# compute[10] = (compute[10] + (pad_temp.shared[(threadIdx.x + 54)]*placeholder.shared[((threadIdx.z*36) + 12)])) +# compute[11] = (compute[11] + (pad_temp.shared[(threadIdx.x + 72)]*placeholder.shared[((threadIdx.z*36) + 12)])) +# compute[12] = (compute[12] + (pad_temp.shared[(threadIdx.x + 90)]*placeholder.shared[((threadIdx.z*36) + 12)])) +# compute[13] = (compute[13] + 
(pad_temp.shared[(threadIdx.x + 108)]*placeholder.shared[((threadIdx.z*36) + 12)])) +# compute[14] = (compute[14] + (pad_temp.shared[(threadIdx.x + 126)]*placeholder.shared[((threadIdx.z*36) + 12)])) +# compute[15] = (compute[15] + (pad_temp.shared[(threadIdx.x + 144)]*placeholder.shared[((threadIdx.z*36) + 12)])) +# compute[16] = (compute[16] + (pad_temp.shared[(threadIdx.x + 18)]*placeholder.shared[((threadIdx.z*36) + 21)])) +# compute[17] = (compute[17] + (pad_temp.shared[(threadIdx.x + 36)]*placeholder.shared[((threadIdx.z*36) + 21)])) +# compute[18] = (compute[18] + (pad_temp.shared[(threadIdx.x + 54)]*placeholder.shared[((threadIdx.z*36) + 21)])) +# compute[19] = (compute[19] + (pad_temp.shared[(threadIdx.x + 72)]*placeholder.shared[((threadIdx.z*36) + 21)])) +# compute[20] = (compute[20] + (pad_temp.shared[(threadIdx.x + 90)]*placeholder.shared[((threadIdx.z*36) + 21)])) +# compute[21] = (compute[21] + (pad_temp.shared[(threadIdx.x + 108)]*placeholder.shared[((threadIdx.z*36) + 21)])) +# compute[22] = (compute[22] + (pad_temp.shared[(threadIdx.x + 126)]*placeholder.shared[((threadIdx.z*36) + 21)])) +# compute[23] = (compute[23] + (pad_temp.shared[(threadIdx.x + 144)]*placeholder.shared[((threadIdx.z*36) + 21)])) +# compute[24] = (compute[24] + (pad_temp.shared[(threadIdx.x + 18)]*placeholder.shared[((threadIdx.z*36) + 30)])) +# compute[25] = (compute[25] + (pad_temp.shared[(threadIdx.x + 36)]*placeholder.shared[((threadIdx.z*36) + 30)])) +# compute[26] = (compute[26] + (pad_temp.shared[(threadIdx.x + 54)]*placeholder.shared[((threadIdx.z*36) + 30)])) +# compute[27] = (compute[27] + (pad_temp.shared[(threadIdx.x + 72)]*placeholder.shared[((threadIdx.z*36) + 30)])) +# compute[28] = (compute[28] + (pad_temp.shared[(threadIdx.x + 90)]*placeholder.shared[((threadIdx.z*36) + 30)])) +# compute[29] = (compute[29] + (pad_temp.shared[(threadIdx.x + 108)]*placeholder.shared[((threadIdx.z*36) + 30)])) +# compute[30] = (compute[30] + (pad_temp.shared[(threadIdx.x + 126)]*placeholder.shared[((threadIdx.z*36) + 30)])) +# compute[31] = (compute[31] + (pad_temp.shared[(threadIdx.x + 144)]*placeholder.shared[((threadIdx.z*36) + 30)])) +# compute[0] = (compute[0] + (pad_temp.shared[(threadIdx.x + 19)]*placeholder.shared[((threadIdx.z*36) + 4)])) +# compute[1] = (compute[1] + (pad_temp.shared[(threadIdx.x + 37)]*placeholder.shared[((threadIdx.z*36) + 4)])) +# compute[2] = (compute[2] + (pad_temp.shared[(threadIdx.x + 55)]*placeholder.shared[((threadIdx.z*36) + 4)])) +# compute[3] = (compute[3] + (pad_temp.shared[(threadIdx.x + 73)]*placeholder.shared[((threadIdx.z*36) + 4)])) +# compute[4] = (compute[4] + (pad_temp.shared[(threadIdx.x + 91)]*placeholder.shared[((threadIdx.z*36) + 4)])) +# compute[5] = (compute[5] + (pad_temp.shared[(threadIdx.x + 109)]*placeholder.shared[((threadIdx.z*36) + 4)])) +# compute[6] = (compute[6] + (pad_temp.shared[(threadIdx.x + 127)]*placeholder.shared[((threadIdx.z*36) + 4)])) +# compute[7] = (compute[7] + (pad_temp.shared[(threadIdx.x + 145)]*placeholder.shared[((threadIdx.z*36) + 4)])) +# compute[8] = (compute[8] + (pad_temp.shared[(threadIdx.x + 19)]*placeholder.shared[((threadIdx.z*36) + 13)])) +# compute[9] = (compute[9] + (pad_temp.shared[(threadIdx.x + 37)]*placeholder.shared[((threadIdx.z*36) + 13)])) +# compute[10] = (compute[10] + (pad_temp.shared[(threadIdx.x + 55)]*placeholder.shared[((threadIdx.z*36) + 13)])) +# compute[11] = (compute[11] + (pad_temp.shared[(threadIdx.x + 73)]*placeholder.shared[((threadIdx.z*36) + 13)])) +# compute[12] = (compute[12] + 
(pad_temp.shared[(threadIdx.x + 91)]*placeholder.shared[((threadIdx.z*36) + 13)])) +# compute[13] = (compute[13] + (pad_temp.shared[(threadIdx.x + 109)]*placeholder.shared[((threadIdx.z*36) + 13)])) +# compute[14] = (compute[14] + (pad_temp.shared[(threadIdx.x + 127)]*placeholder.shared[((threadIdx.z*36) + 13)])) +# compute[15] = (compute[15] + (pad_temp.shared[(threadIdx.x + 145)]*placeholder.shared[((threadIdx.z*36) + 13)])) +# compute[16] = (compute[16] + (pad_temp.shared[(threadIdx.x + 19)]*placeholder.shared[((threadIdx.z*36) + 22)])) +# compute[17] = (compute[17] + (pad_temp.shared[(threadIdx.x + 37)]*placeholder.shared[((threadIdx.z*36) + 22)])) +# compute[18] = (compute[18] + (pad_temp.shared[(threadIdx.x + 55)]*placeholder.shared[((threadIdx.z*36) + 22)])) +# compute[19] = (compute[19] + (pad_temp.shared[(threadIdx.x + 73)]*placeholder.shared[((threadIdx.z*36) + 22)])) +# compute[20] = (compute[20] + (pad_temp.shared[(threadIdx.x + 91)]*placeholder.shared[((threadIdx.z*36) + 22)])) +# compute[21] = (compute[21] + (pad_temp.shared[(threadIdx.x + 109)]*placeholder.shared[((threadIdx.z*36) + 22)])) +# compute[22] = (compute[22] + (pad_temp.shared[(threadIdx.x + 127)]*placeholder.shared[((threadIdx.z*36) + 22)])) +# compute[23] = (compute[23] + (pad_temp.shared[(threadIdx.x + 145)]*placeholder.shared[((threadIdx.z*36) + 22)])) +# compute[24] = (compute[24] + (pad_temp.shared[(threadIdx.x + 19)]*placeholder.shared[((threadIdx.z*36) + 31)])) +# compute[25] = (compute[25] + (pad_temp.shared[(threadIdx.x + 37)]*placeholder.shared[((threadIdx.z*36) + 31)])) +# compute[26] = (compute[26] + (pad_temp.shared[(threadIdx.x + 55)]*placeholder.shared[((threadIdx.z*36) + 31)])) +# compute[27] = (compute[27] + (pad_temp.shared[(threadIdx.x + 73)]*placeholder.shared[((threadIdx.z*36) + 31)])) +# compute[28] = (compute[28] + (pad_temp.shared[(threadIdx.x + 91)]*placeholder.shared[((threadIdx.z*36) + 31)])) +# compute[29] = (compute[29] + (pad_temp.shared[(threadIdx.x + 109)]*placeholder.shared[((threadIdx.z*36) + 31)])) +# compute[30] = (compute[30] + (pad_temp.shared[(threadIdx.x + 127)]*placeholder.shared[((threadIdx.z*36) + 31)])) +# compute[31] = (compute[31] + (pad_temp.shared[(threadIdx.x + 145)]*placeholder.shared[((threadIdx.z*36) + 31)])) +# compute[0] = (compute[0] + (pad_temp.shared[(threadIdx.x + 20)]*placeholder.shared[((threadIdx.z*36) + 5)])) +# compute[1] = (compute[1] + (pad_temp.shared[(threadIdx.x + 38)]*placeholder.shared[((threadIdx.z*36) + 5)])) +# compute[2] = (compute[2] + (pad_temp.shared[(threadIdx.x + 56)]*placeholder.shared[((threadIdx.z*36) + 5)])) +# compute[3] = (compute[3] + (pad_temp.shared[(threadIdx.x + 74)]*placeholder.shared[((threadIdx.z*36) + 5)])) +# compute[4] = (compute[4] + (pad_temp.shared[(threadIdx.x + 92)]*placeholder.shared[((threadIdx.z*36) + 5)])) +# compute[5] = (compute[5] + (pad_temp.shared[(threadIdx.x + 110)]*placeholder.shared[((threadIdx.z*36) + 5)])) +# compute[6] = (compute[6] + (pad_temp.shared[(threadIdx.x + 128)]*placeholder.shared[((threadIdx.z*36) + 5)])) +# compute[7] = (compute[7] + (pad_temp.shared[(threadIdx.x + 146)]*placeholder.shared[((threadIdx.z*36) + 5)])) +# compute[8] = (compute[8] + (pad_temp.shared[(threadIdx.x + 20)]*placeholder.shared[((threadIdx.z*36) + 14)])) +# compute[9] = (compute[9] + (pad_temp.shared[(threadIdx.x + 38)]*placeholder.shared[((threadIdx.z*36) + 14)])) +# compute[10] = (compute[10] + (pad_temp.shared[(threadIdx.x + 56)]*placeholder.shared[((threadIdx.z*36) + 14)])) +# compute[11] = (compute[11] + 
(pad_temp.shared[(threadIdx.x + 74)]*placeholder.shared[((threadIdx.z*36) + 14)])) +# compute[12] = (compute[12] + (pad_temp.shared[(threadIdx.x + 92)]*placeholder.shared[((threadIdx.z*36) + 14)])) +# compute[13] = (compute[13] + (pad_temp.shared[(threadIdx.x + 110)]*placeholder.shared[((threadIdx.z*36) + 14)])) +# compute[14] = (compute[14] + (pad_temp.shared[(threadIdx.x + 128)]*placeholder.shared[((threadIdx.z*36) + 14)])) +# compute[15] = (compute[15] + (pad_temp.shared[(threadIdx.x + 146)]*placeholder.shared[((threadIdx.z*36) + 14)])) +# compute[16] = (compute[16] + (pad_temp.shared[(threadIdx.x + 20)]*placeholder.shared[((threadIdx.z*36) + 23)])) +# compute[17] = (compute[17] + (pad_temp.shared[(threadIdx.x + 38)]*placeholder.shared[((threadIdx.z*36) + 23)])) +# compute[18] = (compute[18] + (pad_temp.shared[(threadIdx.x + 56)]*placeholder.shared[((threadIdx.z*36) + 23)])) +# compute[19] = (compute[19] + (pad_temp.shared[(threadIdx.x + 74)]*placeholder.shared[((threadIdx.z*36) + 23)])) +# compute[20] = (compute[20] + (pad_temp.shared[(threadIdx.x + 92)]*placeholder.shared[((threadIdx.z*36) + 23)])) +# compute[21] = (compute[21] + (pad_temp.shared[(threadIdx.x + 110)]*placeholder.shared[((threadIdx.z*36) + 23)])) +# compute[22] = (compute[22] + (pad_temp.shared[(threadIdx.x + 128)]*placeholder.shared[((threadIdx.z*36) + 23)])) +# compute[23] = (compute[23] + (pad_temp.shared[(threadIdx.x + 146)]*placeholder.shared[((threadIdx.z*36) + 23)])) +# compute[24] = (compute[24] + (pad_temp.shared[(threadIdx.x + 20)]*placeholder.shared[((threadIdx.z*36) + 32)])) +# compute[25] = (compute[25] + (pad_temp.shared[(threadIdx.x + 38)]*placeholder.shared[((threadIdx.z*36) + 32)])) +# compute[26] = (compute[26] + (pad_temp.shared[(threadIdx.x + 56)]*placeholder.shared[((threadIdx.z*36) + 32)])) +# compute[27] = (compute[27] + (pad_temp.shared[(threadIdx.x + 74)]*placeholder.shared[((threadIdx.z*36) + 32)])) +# compute[28] = (compute[28] + (pad_temp.shared[(threadIdx.x + 92)]*placeholder.shared[((threadIdx.z*36) + 32)])) +# compute[29] = (compute[29] + (pad_temp.shared[(threadIdx.x + 110)]*placeholder.shared[((threadIdx.z*36) + 32)])) +# compute[30] = (compute[30] + (pad_temp.shared[(threadIdx.x + 128)]*placeholder.shared[((threadIdx.z*36) + 32)])) +# compute[31] = (compute[31] + (pad_temp.shared[(threadIdx.x + 146)]*placeholder.shared[((threadIdx.z*36) + 32)])) +# compute[0] = (compute[0] + (pad_temp.shared[(threadIdx.x + 36)]*placeholder.shared[((threadIdx.z*36) + 6)])) +# compute[1] = (compute[1] + (pad_temp.shared[(threadIdx.x + 54)]*placeholder.shared[((threadIdx.z*36) + 6)])) +# compute[2] = (compute[2] + (pad_temp.shared[(threadIdx.x + 72)]*placeholder.shared[((threadIdx.z*36) + 6)])) +# compute[3] = (compute[3] + (pad_temp.shared[(threadIdx.x + 90)]*placeholder.shared[((threadIdx.z*36) + 6)])) +# compute[4] = (compute[4] + (pad_temp.shared[(threadIdx.x + 108)]*placeholder.shared[((threadIdx.z*36) + 6)])) +# compute[5] = (compute[5] + (pad_temp.shared[(threadIdx.x + 126)]*placeholder.shared[((threadIdx.z*36) + 6)])) +# compute[6] = (compute[6] + (pad_temp.shared[(threadIdx.x + 144)]*placeholder.shared[((threadIdx.z*36) + 6)])) +# compute[7] = (compute[7] + (pad_temp.shared[(threadIdx.x + 162)]*placeholder.shared[((threadIdx.z*36) + 6)])) +# compute[8] = (compute[8] + (pad_temp.shared[(threadIdx.x + 36)]*placeholder.shared[((threadIdx.z*36) + 15)])) +# compute[9] = (compute[9] + (pad_temp.shared[(threadIdx.x + 54)]*placeholder.shared[((threadIdx.z*36) + 15)])) +# compute[10] = (compute[10] + 
(pad_temp.shared[(threadIdx.x + 72)]*placeholder.shared[((threadIdx.z*36) + 15)])) +# compute[11] = (compute[11] + (pad_temp.shared[(threadIdx.x + 90)]*placeholder.shared[((threadIdx.z*36) + 15)])) +# compute[12] = (compute[12] + (pad_temp.shared[(threadIdx.x + 108)]*placeholder.shared[((threadIdx.z*36) + 15)])) +# compute[13] = (compute[13] + (pad_temp.shared[(threadIdx.x + 126)]*placeholder.shared[((threadIdx.z*36) + 15)])) +# compute[14] = (compute[14] + (pad_temp.shared[(threadIdx.x + 144)]*placeholder.shared[((threadIdx.z*36) + 15)])) +# compute[15] = (compute[15] + (pad_temp.shared[(threadIdx.x + 162)]*placeholder.shared[((threadIdx.z*36) + 15)])) +# compute[16] = (compute[16] + (pad_temp.shared[(threadIdx.x + 36)]*placeholder.shared[((threadIdx.z*36) + 24)])) +# compute[17] = (compute[17] + (pad_temp.shared[(threadIdx.x + 54)]*placeholder.shared[((threadIdx.z*36) + 24)])) +# compute[18] = (compute[18] + (pad_temp.shared[(threadIdx.x + 72)]*placeholder.shared[((threadIdx.z*36) + 24)])) +# compute[19] = (compute[19] + (pad_temp.shared[(threadIdx.x + 90)]*placeholder.shared[((threadIdx.z*36) + 24)])) +# compute[20] = (compute[20] + (pad_temp.shared[(threadIdx.x + 108)]*placeholder.shared[((threadIdx.z*36) + 24)])) +# compute[21] = (compute[21] + (pad_temp.shared[(threadIdx.x + 126)]*placeholder.shared[((threadIdx.z*36) + 24)])) +# compute[22] = (compute[22] + (pad_temp.shared[(threadIdx.x + 144)]*placeholder.shared[((threadIdx.z*36) + 24)])) +# compute[23] = (compute[23] + (pad_temp.shared[(threadIdx.x + 162)]*placeholder.shared[((threadIdx.z*36) + 24)])) +# compute[24] = (compute[24] + (pad_temp.shared[(threadIdx.x + 36)]*placeholder.shared[((threadIdx.z*36) + 33)])) +# compute[25] = (compute[25] + (pad_temp.shared[(threadIdx.x + 54)]*placeholder.shared[((threadIdx.z*36) + 33)])) +# compute[26] = (compute[26] + (pad_temp.shared[(threadIdx.x + 72)]*placeholder.shared[((threadIdx.z*36) + 33)])) +# compute[27] = (compute[27] + (pad_temp.shared[(threadIdx.x + 90)]*placeholder.shared[((threadIdx.z*36) + 33)])) +# compute[28] = (compute[28] + (pad_temp.shared[(threadIdx.x + 108)]*placeholder.shared[((threadIdx.z*36) + 33)])) +# compute[29] = (compute[29] + (pad_temp.shared[(threadIdx.x + 126)]*placeholder.shared[((threadIdx.z*36) + 33)])) +# compute[30] = (compute[30] + (pad_temp.shared[(threadIdx.x + 144)]*placeholder.shared[((threadIdx.z*36) + 33)])) +# compute[31] = (compute[31] + (pad_temp.shared[(threadIdx.x + 162)]*placeholder.shared[((threadIdx.z*36) + 33)])) +# compute[0] = (compute[0] + (pad_temp.shared[(threadIdx.x + 37)]*placeholder.shared[((threadIdx.z*36) + 7)])) +# compute[1] = (compute[1] + (pad_temp.shared[(threadIdx.x + 55)]*placeholder.shared[((threadIdx.z*36) + 7)])) +# compute[2] = (compute[2] + (pad_temp.shared[(threadIdx.x + 73)]*placeholder.shared[((threadIdx.z*36) + 7)])) +# compute[3] = (compute[3] + (pad_temp.shared[(threadIdx.x + 91)]*placeholder.shared[((threadIdx.z*36) + 7)])) +# compute[4] = (compute[4] + (pad_temp.shared[(threadIdx.x + 109)]*placeholder.shared[((threadIdx.z*36) + 7)])) +# compute[5] = (compute[5] + (pad_temp.shared[(threadIdx.x + 127)]*placeholder.shared[((threadIdx.z*36) + 7)])) +# compute[6] = (compute[6] + (pad_temp.shared[(threadIdx.x + 145)]*placeholder.shared[((threadIdx.z*36) + 7)])) +# compute[7] = (compute[7] + (pad_temp.shared[(threadIdx.x + 163)]*placeholder.shared[((threadIdx.z*36) + 7)])) +# compute[8] = (compute[8] + (pad_temp.shared[(threadIdx.x + 37)]*placeholder.shared[((threadIdx.z*36) + 16)])) +# compute[9] = (compute[9] + 
(pad_temp.shared[(threadIdx.x + 55)]*placeholder.shared[((threadIdx.z*36) + 16)])) +# compute[10] = (compute[10] + (pad_temp.shared[(threadIdx.x + 73)]*placeholder.shared[((threadIdx.z*36) + 16)])) +# compute[11] = (compute[11] + (pad_temp.shared[(threadIdx.x + 91)]*placeholder.shared[((threadIdx.z*36) + 16)])) +# compute[12] = (compute[12] + (pad_temp.shared[(threadIdx.x + 109)]*placeholder.shared[((threadIdx.z*36) + 16)])) +# compute[13] = (compute[13] + (pad_temp.shared[(threadIdx.x + 127)]*placeholder.shared[((threadIdx.z*36) + 16)])) +# compute[14] = (compute[14] + (pad_temp.shared[(threadIdx.x + 145)]*placeholder.shared[((threadIdx.z*36) + 16)])) +# compute[15] = (compute[15] + (pad_temp.shared[(threadIdx.x + 163)]*placeholder.shared[((threadIdx.z*36) + 16)])) +# compute[16] = (compute[16] + (pad_temp.shared[(threadIdx.x + 37)]*placeholder.shared[((threadIdx.z*36) + 25)])) +# compute[17] = (compute[17] + (pad_temp.shared[(threadIdx.x + 55)]*placeholder.shared[((threadIdx.z*36) + 25)])) +# compute[18] = (compute[18] + (pad_temp.shared[(threadIdx.x + 73)]*placeholder.shared[((threadIdx.z*36) + 25)])) +# compute[19] = (compute[19] + (pad_temp.shared[(threadIdx.x + 91)]*placeholder.shared[((threadIdx.z*36) + 25)])) +# compute[20] = (compute[20] + (pad_temp.shared[(threadIdx.x + 109)]*placeholder.shared[((threadIdx.z*36) + 25)])) +# compute[21] = (compute[21] + (pad_temp.shared[(threadIdx.x + 127)]*placeholder.shared[((threadIdx.z*36) + 25)])) +# compute[22] = (compute[22] + (pad_temp.shared[(threadIdx.x + 145)]*placeholder.shared[((threadIdx.z*36) + 25)])) +# compute[23] = (compute[23] + (pad_temp.shared[(threadIdx.x + 163)]*placeholder.shared[((threadIdx.z*36) + 25)])) +# compute[24] = (compute[24] + (pad_temp.shared[(threadIdx.x + 37)]*placeholder.shared[((threadIdx.z*36) + 34)])) +# compute[25] = (compute[25] + (pad_temp.shared[(threadIdx.x + 55)]*placeholder.shared[((threadIdx.z*36) + 34)])) +# compute[26] = (compute[26] + (pad_temp.shared[(threadIdx.x + 73)]*placeholder.shared[((threadIdx.z*36) + 34)])) +# compute[27] = (compute[27] + (pad_temp.shared[(threadIdx.x + 91)]*placeholder.shared[((threadIdx.z*36) + 34)])) +# compute[28] = (compute[28] + (pad_temp.shared[(threadIdx.x + 109)]*placeholder.shared[((threadIdx.z*36) + 34)])) +# compute[29] = (compute[29] + (pad_temp.shared[(threadIdx.x + 127)]*placeholder.shared[((threadIdx.z*36) + 34)])) +# compute[30] = (compute[30] + (pad_temp.shared[(threadIdx.x + 145)]*placeholder.shared[((threadIdx.z*36) + 34)])) +# compute[31] = (compute[31] + (pad_temp.shared[(threadIdx.x + 163)]*placeholder.shared[((threadIdx.z*36) + 34)])) +# compute[0] = (compute[0] + (pad_temp.shared[(threadIdx.x + 38)]*placeholder.shared[((threadIdx.z*36) + 8)])) +# compute[1] = (compute[1] + (pad_temp.shared[(threadIdx.x + 56)]*placeholder.shared[((threadIdx.z*36) + 8)])) +# compute[2] = (compute[2] + (pad_temp.shared[(threadIdx.x + 74)]*placeholder.shared[((threadIdx.z*36) + 8)])) +# compute[3] = (compute[3] + (pad_temp.shared[(threadIdx.x + 92)]*placeholder.shared[((threadIdx.z*36) + 8)])) +# compute[4] = (compute[4] + (pad_temp.shared[(threadIdx.x + 110)]*placeholder.shared[((threadIdx.z*36) + 8)])) +# compute[5] = (compute[5] + (pad_temp.shared[(threadIdx.x + 128)]*placeholder.shared[((threadIdx.z*36) + 8)])) +# compute[6] = (compute[6] + (pad_temp.shared[(threadIdx.x + 146)]*placeholder.shared[((threadIdx.z*36) + 8)])) +# compute[7] = (compute[7] + (pad_temp.shared[(threadIdx.x + 164)]*placeholder.shared[((threadIdx.z*36) + 8)])) +# compute[8] = (compute[8] + 
(pad_temp.shared[(threadIdx.x + 38)]*placeholder.shared[((threadIdx.z*36) + 17)])) +# compute[9] = (compute[9] + (pad_temp.shared[(threadIdx.x + 56)]*placeholder.shared[((threadIdx.z*36) + 17)])) +# compute[10] = (compute[10] + (pad_temp.shared[(threadIdx.x + 74)]*placeholder.shared[((threadIdx.z*36) + 17)])) +# compute[11] = (compute[11] + (pad_temp.shared[(threadIdx.x + 92)]*placeholder.shared[((threadIdx.z*36) + 17)])) +# compute[12] = (compute[12] + (pad_temp.shared[(threadIdx.x + 110)]*placeholder.shared[((threadIdx.z*36) + 17)])) +# compute[13] = (compute[13] + (pad_temp.shared[(threadIdx.x + 128)]*placeholder.shared[((threadIdx.z*36) + 17)])) +# compute[14] = (compute[14] + (pad_temp.shared[(threadIdx.x + 146)]*placeholder.shared[((threadIdx.z*36) + 17)])) +# compute[15] = (compute[15] + (pad_temp.shared[(threadIdx.x + 164)]*placeholder.shared[((threadIdx.z*36) + 17)])) +# compute[16] = (compute[16] + (pad_temp.shared[(threadIdx.x + 38)]*placeholder.shared[((threadIdx.z*36) + 26)])) +# compute[17] = (compute[17] + (pad_temp.shared[(threadIdx.x + 56)]*placeholder.shared[((threadIdx.z*36) + 26)])) +# compute[18] = (compute[18] + (pad_temp.shared[(threadIdx.x + 74)]*placeholder.shared[((threadIdx.z*36) + 26)])) +# compute[19] = (compute[19] + (pad_temp.shared[(threadIdx.x + 92)]*placeholder.shared[((threadIdx.z*36) + 26)])) +# compute[20] = (compute[20] + (pad_temp.shared[(threadIdx.x + 110)]*placeholder.shared[((threadIdx.z*36) + 26)])) +# compute[21] = (compute[21] + (pad_temp.shared[(threadIdx.x + 128)]*placeholder.shared[((threadIdx.z*36) + 26)])) +# compute[22] = (compute[22] + (pad_temp.shared[(threadIdx.x + 146)]*placeholder.shared[((threadIdx.z*36) + 26)])) +# compute[23] = (compute[23] + (pad_temp.shared[(threadIdx.x + 164)]*placeholder.shared[((threadIdx.z*36) + 26)])) +# compute[24] = (compute[24] + (pad_temp.shared[(threadIdx.x + 38)]*placeholder.shared[((threadIdx.z*36) + 35)])) +# compute[25] = (compute[25] + (pad_temp.shared[(threadIdx.x + 56)]*placeholder.shared[((threadIdx.z*36) + 35)])) +# compute[26] = (compute[26] + (pad_temp.shared[(threadIdx.x + 74)]*placeholder.shared[((threadIdx.z*36) + 35)])) +# compute[27] = (compute[27] + (pad_temp.shared[(threadIdx.x + 92)]*placeholder.shared[((threadIdx.z*36) + 35)])) +# compute[28] = (compute[28] + (pad_temp.shared[(threadIdx.x + 110)]*placeholder.shared[((threadIdx.z*36) + 35)])) +# compute[29] = (compute[29] + (pad_temp.shared[(threadIdx.x + 128)]*placeholder.shared[((threadIdx.z*36) + 35)])) +# compute[30] = (compute[30] + (pad_temp.shared[(threadIdx.x + 146)]*placeholder.shared[((threadIdx.z*36) + 35)])) +# compute[31] = (compute[31] + (pad_temp.shared[(threadIdx.x + 164)]*placeholder.shared[((threadIdx.z*36) + 35)])) +# } +# } +# tensor[(((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x)] = max(((compute[0]*placeholder[(threadIdx.z*4)]) + placeholder[(threadIdx.z*4)]), 0.000000f) +# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 224)] = max(((compute[1]*placeholder[(threadIdx.z*4)]) + placeholder[(threadIdx.z*4)]), 0.000000f) +# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 448)] = max(((compute[2]*placeholder[(threadIdx.z*4)]) + placeholder[(threadIdx.z*4)]), 0.000000f) +# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 672)] = max(((compute[3]*placeholder[(threadIdx.z*4)]) + placeholder[(threadIdx.z*4)]), 0.000000f) +# tensor[((((((blockIdx.y*112) + blockIdx.x) + 
(threadIdx.z*12544))*16) + threadIdx.x) + 896)] = max(((compute[4]*placeholder[(threadIdx.z*4)]) + placeholder[(threadIdx.z*4)]), 0.000000f) +# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 1120)] = max(((compute[5]*placeholder[(threadIdx.z*4)]) + placeholder[(threadIdx.z*4)]), 0.000000f) +# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 1344)] = max(((compute[6]*placeholder[(threadIdx.z*4)]) + placeholder[(threadIdx.z*4)]), 0.000000f) +# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 1568)] = max(((compute[7]*placeholder[(threadIdx.z*4)]) + placeholder[(threadIdx.z*4)]), 0.000000f) +# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 50176)] = max(((compute[8]*placeholder[((threadIdx.z*4) + 1)]) + placeholder[((threadIdx.z*4) + 1)]), 0.000000f) +# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 50400)] = max(((compute[9]*placeholder[((threadIdx.z*4) + 1)]) + placeholder[((threadIdx.z*4) + 1)]), 0.000000f) +# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 50624)] = max(((compute[10]*placeholder[((threadIdx.z*4) + 1)]) + placeholder[((threadIdx.z*4) + 1)]), 0.000000f) +# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 50848)] = max(((compute[11]*placeholder[((threadIdx.z*4) + 1)]) + placeholder[((threadIdx.z*4) + 1)]), 0.000000f) +# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 51072)] = max(((compute[12]*placeholder[((threadIdx.z*4) + 1)]) + placeholder[((threadIdx.z*4) + 1)]), 0.000000f) +# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 51296)] = max(((compute[13]*placeholder[((threadIdx.z*4) + 1)]) + placeholder[((threadIdx.z*4) + 1)]), 0.000000f) +# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 51520)] = max(((compute[14]*placeholder[((threadIdx.z*4) + 1)]) + placeholder[((threadIdx.z*4) + 1)]), 0.000000f) +# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 51744)] = max(((compute[15]*placeholder[((threadIdx.z*4) + 1)]) + placeholder[((threadIdx.z*4) + 1)]), 0.000000f) +# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 100352)] = max(((compute[16]*placeholder[((threadIdx.z*4) + 2)]) + placeholder[((threadIdx.z*4) + 2)]), 0.000000f) +# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 100576)] = max(((compute[17]*placeholder[((threadIdx.z*4) + 2)]) + placeholder[((threadIdx.z*4) + 2)]), 0.000000f) +# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 100800)] = max(((compute[18]*placeholder[((threadIdx.z*4) + 2)]) + placeholder[((threadIdx.z*4) + 2)]), 0.000000f) +# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 101024)] = max(((compute[19]*placeholder[((threadIdx.z*4) + 2)]) + placeholder[((threadIdx.z*4) + 2)]), 0.000000f) +# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 101248)] = max(((compute[20]*placeholder[((threadIdx.z*4) + 2)]) + placeholder[((threadIdx.z*4) + 2)]), 0.000000f) +# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 101472)] = max(((compute[21]*placeholder[((threadIdx.z*4) + 2)]) + placeholder[((threadIdx.z*4) + 
2)]), 0.000000f)
+# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 101696)] = max(((compute[22]*placeholder[((threadIdx.z*4) + 2)]) + placeholder[((threadIdx.z*4) + 2)]), 0.000000f)
+# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 101920)] = max(((compute[23]*placeholder[((threadIdx.z*4) + 2)]) + placeholder[((threadIdx.z*4) + 2)]), 0.000000f)
+# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 150528)] = max(((compute[24]*placeholder[((threadIdx.z*4) + 3)]) + placeholder[((threadIdx.z*4) + 3)]), 0.000000f)
+# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 150752)] = max(((compute[25]*placeholder[((threadIdx.z*4) + 3)]) + placeholder[((threadIdx.z*4) + 3)]), 0.000000f)
+# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 150976)] = max(((compute[26]*placeholder[((threadIdx.z*4) + 3)]) + placeholder[((threadIdx.z*4) + 3)]), 0.000000f)
+# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 151200)] = max(((compute[27]*placeholder[((threadIdx.z*4) + 3)]) + placeholder[((threadIdx.z*4) + 3)]), 0.000000f)
+# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 151424)] = max(((compute[28]*placeholder[((threadIdx.z*4) + 3)]) + placeholder[((threadIdx.z*4) + 3)]), 0.000000f)
+# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 151648)] = max(((compute[29]*placeholder[((threadIdx.z*4) + 3)]) + placeholder[((threadIdx.z*4) + 3)]), 0.000000f)
+# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 151872)] = max(((compute[30]*placeholder[((threadIdx.z*4) + 3)]) + placeholder[((threadIdx.z*4) + 3)]), 0.000000f)
+# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 152096)] = max(((compute[31]*placeholder[((threadIdx.z*4) + 3)]) + placeholder[((threadIdx.z*4) + 3)]), 0.000000f)
+# }

+######################################################################
+# Use cuDNN for a convolutional layer
+# -----------------------------------
+# We can replace the convolution kernels generated by TVM with cuDNN ones.
+# To do that, all we need to do is append the option " -libs=cudnn" to the target string.
+net, params = testing.create_workload(simple_net)
+target = "cuda -libs=cudnn"  # use cuDNN for convolution
+graph, lib, params = relay.build_module.build(
+    net, target, params=params)
+
+ctx = tvm.context(target, 0)
+data = np.random.uniform(-1, 1, size=data_shape).astype("float32")
+module = runtime.create(graph, lib, ctx)
+module.set_input(**params)
+module.set_input("data", data)
+module.run()
+out_shape = (batch_size, out_channels, 224, 224)
+out = module.get_output(0, tvm.nd.empty(out_shape))
+out_cudnn = out.asnumpy()
+
+######################################################################
+# Note that if you use cuDNN, Relay cannot fuse convolution with the layers following it.
+# This is because layer fusion happens at the level of the TVM internal representation (IR).
+# Relay treats external libraries as black boxes, so there is no way to fuse them with the TVM IR.
+#
+# The pseudo code below shows that cuDNN convolution + bias add + batch norm + ReLU are turned into two stages of computation: one for the cuDNN call and the other for the rest of the operations.
+#
+# .. code-block:: text
+# .. code-block:: text
+#
+#   // attr [y] storage_scope = "global"
+#   allocate y[float32 * 802816]
+#   produce y {
+#     // attr [0] extern_scope = 0
+#     tvm_call_packed("tvm.contrib.cudnn.conv2d.forward", 1, 0, 1, 1, 1, 1, 1, 1, 1, tvm_stack_make_array(placeholder, tvm_stack_make_shape(1, 3, 224, 224), 0, 4, 0.000000f, 0), tvm_stack_make_array(placeholder, tvm_stack_make_shape(16, 3, 3, 3), 0, 4, 0.000000f, 0), tvm_stack_make_array(y, tvm_stack_make_shape(1, 16, 224, 224), 0, 4, 0.000000f, 0))
+#   }
+#   produce tensor {
+#     // attr [iter_var(blockIdx.x, , blockIdx.x)] thread_extent = 256
+#     // attr [iter_var(threadIdx.x, , threadIdx.x)] thread_extent = 512
+#     for (ax0.ax1.fused.ax2.fused.ax3.fused.outer, 0, 7) {
+#       if (likely(((blockIdx.x*512) < ((802816 - (ax0.ax1.fused.ax2.fused.ax3.fused.outer*131072)) - threadIdx.x)))) {
+#         tensor[(((((((blockIdx.x*512) + threadIdx.x) + (ax0.ax1.fused.ax2.fused.ax3.fused.outer*131072))/802816)*802816) + (((((((blockIdx.x*512) + threadIdx.x) + (ax0.ax1.fused.ax2.fused.ax3.fused.outer*131072))/224) % 224)*224) + ((((blockIdx.x*64) + threadIdx.x) + (ax0.ax1.fused.ax2.fused.ax3.fused.outer*32)) % 224))) + ((((((blockIdx.x*512) + threadIdx.x) + (ax0.ax1.fused.ax2.fused.ax3.fused.outer*131072))/50176) % 16)*50176))] = max(((y[(((((((blockIdx.x*512) + threadIdx.x) + (ax0.ax1.fused.ax2.fused.ax3.fused.outer*131072))/802816)*802816) + (((((((blockIdx.x*512) + threadIdx.x) + (ax0.ax1.fused.ax2.fused.ax3.fused.outer*131072))/224) % 224)*224) + ((((blockIdx.x*64) + threadIdx.x) + (ax0.ax1.fused.ax2.fused.ax3.fused.outer*32)) % 224))) + ((((((blockIdx.x*512) + threadIdx.x) + (ax0.ax1.fused.ax2.fused.ax3.fused.outer*131072))/50176) % 16)*50176))]*placeholder[(((((blockIdx.x*512) + threadIdx.x) + (ax0.ax1.fused.ax2.fused.ax3.fused.outer*131072))/50176) % 16)]) + placeholder[(((((blockIdx.x*512) + threadIdx.x) + (ax0.ax1.fused.ax2.fused.ax3.fused.outer*131072))/50176) % 16)]), 0.000000f)
+#       }
+#     }
+#   }
+
+
+######################################################################
+# Verify the result
+# -----------------
+# We can check that the results of the two runs match.
+
+tvm.testing.assert_allclose(out_cuda, out_cudnn, rtol=1e-5)
+
+######################################################################
+# Conclusion
+# ----------
+# This tutorial covered the use of cuDNN with Relay.
+# We also have support for cuBLAS: if cuBLAS is enabled, it will be used for
+# fully connected layers (relay.dense). To use cuBLAS, set the target string
+# to "cuda -libs=cublas"; you can use both cuDNN and cuBLAS with
+# "cuda -libs=cudnn,cublas".
+#
+# For the ROCm backend, we have support for MIOpen and rocBLAS.
+# They can be enabled with the target "rocm -libs=miopen,rocblas".
+#
+# Being able to use external libraries is great, but we need to keep some
+# caveats in mind.
+#
+# First, the use of external libraries may restrict your usage of TVM and
+# Relay. For example, MIOpen only supports the NCHW layout and the fp32 data
+# type at the moment, so you cannot use other layouts or data types in TVM.
+#
+# Second, and more importantly, external libraries restrict the possibility
+# of operator fusion during graph compilation, as shown above. TVM and Relay
+# aim to achieve the best performance on a variety of hardware platforms,
+# with joint operator-level and graph-level optimization. To achieve this
+# goal, we should continue developing better optimizations for TVM and Relay,
+# while using external libraries as a nice way to fall back to existing
+# implementations when necessary.
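The conclusion above points at the cuBLAS fallback without demonstrating it. The following is a minimal sketch, supplied by the editor and not part of this patch, of offloading `relay.nn.dense` to cuBLAS; it assumes the same Relay APIs used in the tutorials in this diff, and the layer shapes and variable names are illustrative only.

```python
import numpy as np
import tvm
from tvm import relay
from tvm.relay import testing
from tvm.contrib import graph_runtime

# Hypothetical single dense layer: 512 inputs, 1024 outputs (shapes are
# arbitrary and chosen only for illustration).
data = relay.var("data", shape=(1, 512))
weight = relay.var("weight", shape=(1024, 512))
dense_net = relay.Function([data, weight], relay.nn.dense(data, weight))
net, params = testing.create_workload(dense_net)

# "-libs=cublas" routes relay.dense to cuBLAS, just as "-libs=cudnn"
# routed the convolution above.
target = "cuda -libs=cublas"
graph, lib, params = relay.build_module.build(net, target, params=params)

ctx = tvm.context(target, 0)
module = graph_runtime.create(graph, lib, ctx)
module.set_input(**params)
module.set_input("data", np.random.uniform(size=(1, 512)).astype("float32"))
module.run()
out = module.get_output(0, tvm.nd.empty((1, 1024))).asnumpy()
```

As with cuDNN, the dense call is handed to the external library as a black box, so it cannot be fused with neighboring operators.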
diff --git a/tutorials/relay_quick_start.py b/tutorials/relay_quick_start.py
new file mode 100644
index 000000000000..286114fe997f
--- /dev/null
+++ b/tutorials/relay_quick_start.py
@@ -0,0 +1,145 @@
+"""
+.. _tutorial-relay-quick-start:
+
+Quick Start Tutorial for Compiling Deep Learning Models
+=======================================================
+**Author**: `Yao Wang `_, `Truman Tian `_
+
+This example shows how to build a neural network with the Relay python
+frontend and generate a runtime library for an Nvidia GPU with TVM.
+Notice that you need to build TVM with CUDA and LLVM enabled.
+"""
+
+######################################################################
+# Overview for Supported Hardware Backends of TVM
+# -----------------------------------------------
+# The image below shows the hardware backends currently supported by TVM:
+#
+# .. image:: https://github.com/dmlc/web-data/raw/master/tvm/tutorial/tvm_support_list.png
+#    :align: center
+#    :scale: 100%
+#
+# In this tutorial, we'll choose cuda and llvm as target backends.
+# To begin with, let's import Relay and TVM.

+import numpy as np
+
+from tvm import relay
+from tvm.relay import testing
+import tvm
+from tvm.contrib import graph_runtime
+
+######################################################################
+# Define a Neural Network in Relay
+# --------------------------------
+# First, let's define a neural network with the Relay python frontend.
+# For simplicity, we'll use the pre-defined resnet-18 network in Relay.
+# Parameters are initialized with the Xavier initializer.
+# Relay also supports other model formats such as MXNet, CoreML, ONNX and
+# TensorFlow.
+#
+# In this tutorial, we assume we will do inference on our device and
+# the batch size is set to 1. Input images are RGB color images of
+# size 224 * 224. We can call :any:`tvm.relay.expr.astext()` to show
+# the network structure.
+
+batch_size = 1
+num_class = 1000
+image_shape = (3, 224, 224)
+data_shape = (batch_size,) + image_shape
+out_shape = (batch_size, num_class)
+
+net, params = relay.testing.resnet.get_workload(
+    num_layers=18, batch_size=batch_size, image_shape=image_shape)
+
+# set show_meta_data=True if you want to show meta data
+print(net.astext(show_meta_data=False))
+
+######################################################################
+# Compilation
+# -----------
+# The next step is to compile the model using the Relay/TVM pipeline.
+# Users can specify the optimization level of the compilation.
+# Currently this value can be 0 to 3. The optimization passes include
+# operator fusion, pre-computation, layout transformation and so on.
+#
+# :any:`relay.build_module.build` returns three components: the execution
+# graph in json format, the TVM module library of compiled functions
+# specifically for this graph on the target hardware, and the parameter
+# blobs of the model. During the compilation, Relay does the graph-level
+# optimization while TVM does the tensor-level optimization, resulting
+# in an optimized runtime module for model serving.
+#
+# We'll first compile for an Nvidia GPU. Behind the scenes,
+# `relay.build_module.build` first does a number of graph-level
+# optimizations, e.g. pruning, fusing, etc., then registers the operators
+# (i.e. the nodes of the optimized graphs) to TVM implementations to
+# generate a `tvm.module`. To generate the module library, TVM will first
+# lower the high-level IR into the intrinsic IR of the specified target
+# backend, which is CUDA in this example.
+# Then the machine code will be generated as the module library.
+
+opt_level = 3
+target = tvm.target.cuda()
+with relay.build_config(opt_level=opt_level):
+    graph, lib, params = relay.build_module.build(
+        net, target, params=params)
+
+#####################################################################
+# Run the generated library
+# -------------------------
+# Now we can create a graph runtime and run the module on the Nvidia GPU.
+
+# create random input
+ctx = tvm.gpu()
+data = np.random.uniform(-1, 1, size=data_shape).astype("float32")
+# create module
+module = graph_runtime.create(graph, lib, ctx)
+# set input and parameters
+module.set_input("data", data)
+module.set_input(**params)
+# run
+module.run()
+# get output
+out = module.get_output(0, tvm.nd.empty(out_shape)).asnumpy()
+
+# Print first 10 elements of output
+print(out.flatten()[0:10])
+
+######################################################################
+# Save and Load Compiled Module
+# -----------------------------
+# We can also save the graph, lib and parameters into files and load them
+# back in a deployment environment.
+
+####################################################
+
+# save the graph, lib and params into separate files
+from tvm.contrib import util
+
+temp = util.tempdir()
+path_lib = temp.relpath("deploy_lib.tar")
+lib.export_library(path_lib)
+with open(temp.relpath("deploy_graph.json"), "w") as fo:
+    fo.write(graph)
+with open(temp.relpath("deploy_param.params"), "wb") as fo:
+    fo.write(relay.save_param_dict(params))
+print(temp.listdir())
+
+####################################################
+
+# load the module back; reuse the same input so the deployed output
+# can be compared with the original one
+loaded_json = open(temp.relpath("deploy_graph.json")).read()
+loaded_lib = tvm.module.load(path_lib)
+loaded_params = bytearray(open(temp.relpath("deploy_param.params"), "rb").read())
+input_data = tvm.nd.array(data)
+
+module = graph_runtime.create(loaded_json, loaded_lib, ctx)
+module.load_params(loaded_params)
+module.run(data=input_data)
+out_deploy = module.get_output(0).asnumpy()
+
+# Print first 10 elements of output
+print(out_deploy.flatten()[0:10])
+
+# check whether the output from the deployed module is consistent with
+# the original one
+tvm.testing.assert_allclose(out_deploy, out, atol=1e-3)
diff --git a/version.py b/version.py
index acdc3f435798..b0c0b2af109e 100644
--- a/version.py
+++ b/version.py
@@ -16,7 +16,7 @@
 # current version
 # We use the version of the incoming release for code
 # that is under development
-__version__ = "0.5"
+__version__ = "0.6.dev"
 
 # Implementations
 def update(file_name, pattern, repl):
diff --git a/vta/python/vta/environment.py b/vta/python/vta/environment.py
index a77e29ac3a52..3a145be590d5 100644
--- a/vta/python/vta/environment.py
+++ b/vta/python/vta/environment.py
@@ -223,10 +223,9 @@ def target_host(self):
         """The target host"""
         if self.TARGET == "pynq":
             return "llvm -target=armv7-none-linux-gnueabihf"
-        elif self.TARGET == "sim":
+        if self.TARGET == "sim":
             return "llvm"
-        else:
-            raise ValueError("Unknown target %s" % self.TARGET)
+        raise ValueError("Unknown target %s" % self.TARGET)
 
 
 def get_env():
diff --git a/vta/python/vta/graph.py b/vta/python/vta/graph.py
index 7f2a26fdc4bf..0b746e0458af 100644
--- a/vta/python/vta/graph.py
+++ b/vta/python/vta/graph.py
@@ -169,7 +169,7 @@ def _clean_cast(node, target_type):
     op_name = node.attr("op_name")
     if op_name == "cast":
         return _clean_cast(node.get_children(), target_type)
-    elif op_name == "relu":
+    if op_name == "relu":
         data, has_clip = _clean_cast(
             node.get_children(), target_type)
         data = nnvm.sym.relu(data)
diff --git a/vta/python/vta/intrin.py b/vta/python/vta/intrin.py
index b366287568e7..8255b8b7df2e 100644
--- a/vta/python/vta/intrin.py
+++ b/vta/python/vta/intrin.py
@@ -64,7 +64,7 @@ def instr(index):
                 dev.get_task_qid(dev.QID_COMPUTE))
             irb.scope_attr(dev.vta_axis, "coproc_uop_scope",
                            dev.vta_push_uop)
-            if index == 0 or index == 2:
+            if index in (0, 2):
                 irb.emit(tvm.call_extern(
                     "int32", "VTAUopPush",
                     0, 0,
diff --git a/vta/python/vta/ir_pass.py b/vta/python/vta/ir_pass.py
index c21ca6ed5bf4..9800cc6472b3 100644
--- a/vta/python/vta/ir_pass.py
+++ b/vta/python/vta/ir_pass.py
@@ -3,7 +3,7 @@
 from __future__ import absolute_import as _abs
 
 import tvm
-from topi import util as util
+from topi import util
 
 from .environment import get_env
@@ -77,10 +77,9 @@ def _post_order(op):
             args.append(m[1])
             args += op.args[base_args+3:]
             return tvm.call_extern("int32", "VTAUopPush", *args)
-        else:
-            if op.name not in ("VTATLSCommandHandle", "tvm_thread_context"):
-                raise RuntimeError("unexpected op %s" % op)
-            return op
+        if op.name not in ("VTATLSCommandHandle", "tvm_thread_context"):
+            raise RuntimeError("unexpected op %s" % op)
+        return op
     ret = tvm.ir_pass.IRTransform(
         stmt.body, None, _post_order, ["Call"])
@@ -165,22 +164,21 @@ def _post_order(op):
                 op.condition, let_stmt)
             del rw_info[buffer_var]
             return alloc
-        elif isinstance(op, tvm.expr.Load):
+        if isinstance(op, tvm.expr.Load):
             buffer_var = op.buffer_var
             if not buffer_var in rw_info:
                 rw_info[buffer_var] = tvm.var(
                     buffer_var.name + "_ptr", "handle")
             new_var = rw_info[buffer_var]
             return tvm.make.Load(op.dtype, new_var, op.index)
-        elif isinstance(op, tvm.stmt.Store):
+        if isinstance(op, tvm.stmt.Store):
             buffer_var = op.buffer_var
             if not buffer_var in rw_info:
                 rw_info[buffer_var] = tvm.var(
                     buffer_var.name + "_ptr", "handle")
             new_var = rw_info[buffer_var]
             return tvm.make.Store(new_var, op.value, op.index)
-        else:
-            raise RuntimeError("not reached")
+        raise RuntimeError("not reached")
     stmt = tvm.ir_pass.IRTransform(
         stmt_in, None, _post_order, ["Allocate", "Load", "Store"])
     for buffer_var, new_var in rw_info.items():
@@ -233,23 +231,20 @@ def _pre_order(op):
         if op.attr_key == "virtual_thread":
             lift_stmt.append([])
-        return None
-
     def _post_order(op):
         if isinstance(op, tvm.stmt.Allocate):
             lift_stmt[-1].append(op)
             return op.body
-        elif isinstance(op, tvm.stmt.AttrStmt):
+        if isinstance(op, tvm.stmt.AttrStmt):
             if op.attr_key == "storage_scope":
                 lift_stmt[-1].append(op)
                 return op.body
-            elif op.attr_key == "virtual_thread":
+            if op.attr_key == "virtual_thread":
                 return _merge_block(lift_stmt.pop() + [op], op.body)
             return op
-        elif isinstance(op, tvm.stmt.For):
+        if isinstance(op, tvm.stmt.For):
             return _merge_block(lift_stmt.pop() + [op], op.body)
-        else:
-            raise RuntimeError("not reached")
+        raise RuntimeError("not reached")
     stmt = tvm.ir_pass.IRTransform(
         stmt_in, _pre_order, _post_order, ["Allocate", "AttrStmt", "For"])
     assert len(lift_stmt) == 1
@@ -297,7 +292,7 @@ def _do_fold(stmt):
             sync = tvm.make.Call(
                 "int32", "vta.coproc_sync", [], tvm.expr.Call.Intrinsic, None, 0)
             return tvm.make.Block(stmt.body, tvm.make.Evaluate(sync))
-        elif _match_pragma(stmt, "trim_loop"):
+        if _match_pragma(stmt, "trim_loop"):
             op = stmt.body
             assert isinstance(op, tvm.stmt.For)
             return tvm.make.For(
@@ -584,7 +579,7 @@ def _do_fold(stmt):
                 tvm.make.StringImm("VTAPushALUOp"))
             irb.emit(stmt)
             return irb.get()
-        elif _match_pragma(stmt, "skip_alu"):
+        if _match_pragma(stmt, "skip_alu"):
             return tvm.make.Evaluate(0)
         return stmt
diff --git a/vta/python/vta/top/vta_conv2d.py b/vta/python/vta/top/vta_conv2d.py
index ab06cadf8247..2fd11a887da0 100644
--- a/vta/python/vta/top/vta_conv2d.py
+++ b/vta/python/vta/top/vta_conv2d.py
@@ -293,10 +293,9 @@ def schedule_conv2d(attrs, outs, target):
         target = tvm.target.create(target)
         if target.device_name == "vta":
             return schedule_packed_conv2d(outs)
-        elif str(target).startswith("llvm"):
+        if str(target).startswith("llvm"):
            return tvm.create_schedule([x.op for x in outs])
-        else:
-            raise RuntimeError("not support target %s" % target)
+        raise RuntimeError("not support target %s" % target)
     return _nn.schedule_conv2d(attrs, outs, target)
diff --git a/web/tvm_runtime.js b/web/tvm_runtime.js
index 2eab15093b72..fe303d57b0c6 100644
--- a/web/tvm_runtime.js
+++ b/web/tvm_runtime.js
@@ -2,7 +2,7 @@
  * TVM Javascript web runtime library.
  *
  * @projectname tvm
- * @version 0.5.dev
+ * @version 0.6.dev
  */
 /* eslint no-unused-vars: "off" */
 /* eslint no-unexpected-multiline: "off" */
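Editor's note: the `vta/python` hunks above all apply one refactor: branches that follow a `return` or `raise` lose their `elif`/`else` (pylint's no-else-return style). A minimal before-and-after sketch of the pattern, with hypothetical function names, mirroring the `schedule_conv2d` hunk:

```python
# Before: every branch after a `return` is an elif/else, adding nesting.
def dispatch_before(target):
    if target == "vta":
        return "packed"
    elif target.startswith("llvm"):
        return "generic"
    else:
        raise RuntimeError("not support target %s" % target)

# After: since each branch returns, the elif/else is redundant; flat `if`
# statements behave identically and satisfy pylint's no-else-return check.
def dispatch_after(target):
    if target == "vta":
        return "packed"
    if target.startswith("llvm"):
        return "generic"
    raise RuntimeError("not support target %s" % target)
```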