diff --git a/.gitmodules b/.gitmodules index 9aeb1c754983..836d824a6f5a 100644 --- a/.gitmodules +++ b/.gitmodules @@ -26,3 +26,6 @@ [submodule "3rdparty/tvm"] path = 3rdparty/tvm url = https://github.com/dmlc/tvm +[submodule "3rdparty/onnx-tensorrt"] + path = 3rdparty/onnx-tensorrt + url = https://github.com/onnx/onnx-tensorrt.git diff --git a/3rdparty/dmlc-core b/3rdparty/dmlc-core index 649be18a8c55..958c22b32c11 160000 --- a/3rdparty/dmlc-core +++ b/3rdparty/dmlc-core @@ -1 +1 @@ -Subproject commit 649be18a8c55c48517861d67158a45dec54992ee +Subproject commit 958c22b32c116ec967a9247d09eddb9c21ea6d4f diff --git a/3rdparty/onnx-tensorrt b/3rdparty/onnx-tensorrt new file mode 160000 index 000000000000..e7be19cff377 --- /dev/null +++ b/3rdparty/onnx-tensorrt @@ -0,0 +1 @@ +Subproject commit e7be19cff377a95817503e8525e20de34cdc574a diff --git a/CMakeLists.txt b/CMakeLists.txt index 000bbbf17ea5..8c3e635682a7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -37,6 +37,7 @@ mxnet_option(ENABLE_CUDA_RTC "Build with CUDA runtime compilation support" mxnet_option(BUILD_CPP_EXAMPLES "Build cpp examples" ON) mxnet_option(INSTALL_EXAMPLES "Install the example source files." OFF) mxnet_option(USE_SIGNAL_HANDLER "Print stack traces on segfaults." OFF) +mxnet_option(USE_TENSORRT "Enable inference optimization with TensorRT." OFF) message(STATUS "CMAKE_SYSTEM_NAME ${CMAKE_SYSTEM_NAME}") if(USE_CUDA AND NOT USE_OLDCMAKECUDA) @@ -185,6 +186,36 @@ if(USE_VTUNE) list(APPEND mxnet_LINKER_LIBS dl) endif() +if(USE_TENSORRT) + message(STATUS "Using TensorRT") + set(ONNX_PATH 3rdparty/onnx-tensorrt/third_party/onnx/build/) + set(ONNX_TRT_PATH 3rdparty/onnx-tensorrt/build/) + + include_directories(${ONNX_PATH}) + include_directories(3rdparty/onnx-tensorrt/) + include_directories(3rdparty/) + add_definitions(-DMXNET_USE_TENSORRT=1) + add_definitions(-DONNX_NAMESPACE=onnx) + + find_package(Protobuf REQUIRED) + + find_library(ONNX_LIBRARY NAMES libonnx.so REQUIRED + PATHS ${ONNX_PATH} + DOC "Path to onnx library.") + find_library(ONNX_PROTO_LIBRARY NAMES libonnx_proto.so REQUIRED + PATHS ${ONNX_PATH} + DOC "Path to onnx_proto library.") + find_library(ONNX_TRT_RUNTIME_LIBRARY NAMES libnvonnxparser_runtime.so REQUIRED + PATHS ${ONNX_TRT_PATH} + DOC "Path to onnx-tensorrt runtime library.") + find_library(ONNX_TRT_PARSER_LIBRARY NAMES libnvonnxparser.so REQUIRED + PATHS ${ONNX_TRT_PATH} + DOC "Path to onnx-tensorrt parser library.") + + list(APPEND mxnet_LINKER_LIBS libnvinfer.so ${ONNX_TRT_PARSER_LIBRARY} ${ONNX_TRT_RUNTIME_LIBRARY} + ${ONNX_PROTO_LIBRARY} ${ONNX_LIBRARY} ${PROTOBUF_LIBRARY}) +endif() + if(USE_MKLDNN) include(cmake/MklDnn.cmake) # CPU architecture (e.g., C5) can't run on another architecture (e.g., g3). diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index b04e4a3d85c3..6bc97bb71fc1 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -176,3 +176,4 @@ List of Contributors * [Kou Ding](https://github.com/chinakook) * [Istvan Fehervari](https://github.com/ifeherva) * [Aaron Markham](https://github.com/aaronmarkham) +* [Sam Skalicky](https://github.com/samskalicky) diff --git a/Jenkinsfile b/Jenkinsfile index 9d7792066e37..6eaee43df043 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -16,7 +16,7 @@ // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License.
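
The USE_TENSORRT block added to CMakeLists.txt above links libmxnet against several TensorRT/ONNX shared libraries that must already have been built under 3rdparty/onnx-tensorrt before CMake is configured. A minimal pre-flight sketch of that dependency check, assuming the libraries are on the dynamic loader's search path (the script itself and its search strategy are illustrative, not part of this patch):

    # Hypothetical pre-flight check: confirm the shared libraries that the
    # USE_TENSORRT CMake path links against can be dlopen()ed. Library names
    # come from the CMakeLists.txt hunk above; everything else is assumed.
    import ctypes

    TENSORRT_DEPS = [
        "libnvinfer.so",               # linked directly by name
        "libnvonnxparser.so",          # ONNX_TRT_PARSER_LIBRARY
        "libnvonnxparser_runtime.so",  # ONNX_TRT_RUNTIME_LIBRARY
        "libonnx.so",                  # ONNX_LIBRARY
        "libonnx_proto.so",            # ONNX_PROTO_LIBRARY
    ]

    def missing_tensorrt_deps(libs=TENSORRT_DEPS):
        """Return the subset of libs that the dynamic loader cannot resolve."""
        missing = []
        for name in libs:
            try:
                ctypes.CDLL(name)
            except OSError:
                missing.append(name)
        return missing

    if __name__ == "__main__":
        missing = missing_tensorrt_deps()
        if missing:
            raise SystemExit("Missing TensorRT/ONNX libraries: " + ", ".join(missing))
        print("All TensorRT/ONNX libraries resolved; USE_TENSORRT=1 should link.")
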
- +// // Jenkins pipeline // See documents at https://jenkins.io/doc/book/pipeline/jenkinsfile/ @@ -30,120 +30,29 @@ mx_cmake_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/li mx_cmake_lib_debug = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests' mx_cmake_mkldnn_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so, build/3rdparty/mkldnn/src/libmkldnn.so.0' mx_mkldnn_lib = 'lib/libmxnet.so, lib/libmxnet.a, lib/libiomp5.so, lib/libmkldnn.so.0, lib/libmklml_intel.so, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a' +mx_tensorrt_lib = 'lib/libmxnet.so, lib/libnvonnxparser_runtime.so.0, lib/libnvonnxparser.so.0, lib/libonnx_proto.so, lib/libonnx.so' // timeout in minutes max_time = 120 -// assign any caught errors here -err = null - -// initialize source codes -def init_git() { - deleteDir() - retry(5) { - try { - // Make sure wait long enough for api.github.com request quota. Important: Don't increase the amount of - // retries as this will increase the amount of requests and worsen the throttling - timeout(time: 15, unit: 'MINUTES') { - checkout scm - sh 'git submodule update --init --recursive' - sh 'git clean -d -f' - } - } catch (exc) { - deleteDir() - error "Failed to fetch source codes with ${exc}" - sleep 2 - } - } -} - -def init_git_win() { - deleteDir() - retry(5) { - try { - // Make sure wait long enough for api.github.com request quota. Important: Don't increase the amount of - // retries as this will increase the amount of requests and worsen the throttling - timeout(time: 15, unit: 'MINUTES') { - checkout scm - bat 'git submodule update --init --recursive' - bat 'git clean -d -f' - } - } catch (exc) { - deleteDir() - error "Failed to fetch source codes with ${exc}" - sleep 2 - } - } -} - -// pack libraries for later use -def pack_lib(name, libs=mx_lib) { - sh """ -echo "Packing ${libs} into ${name}" -echo ${libs} | sed -e 's/,/ /g' | xargs md5sum -""" - stash includes: libs, name: name -} - -// unpack libraries saved before -def unpack_lib(name, libs=mx_lib) { - unstash name - sh """ -echo "Unpacked ${libs} from ${name}" -echo ${libs} | sed -e 's/,/ /g' | xargs md5sum -""" -} - -def publish_test_coverage() { - // Fall back to our own copy of the bash helper if it failed to download the public version - sh '(curl --retry 10 -s https://codecov.io/bash | bash -s -) || (curl --retry 10 -s https://s3-us-west-2.amazonaws.com/mxnet-ci-prod-slave-data/codecov-bash.txt | bash -s -)' -} - -def collect_test_results_unix(original_file_name, new_file_name) { - if (fileExists(original_file_name)) { - // Rename file to make it distinguishable. Unfortunately, it's not possible to get STAGE_NAME in a parallel stage - // Thus, we have to pick a name manually and rename the files so that they can be stored separately. - sh 'cp ' + original_file_name + ' ' + new_file_name - archiveArtifacts artifacts: new_file_name - } -} - -def collect_test_results_windows(original_file_name, new_file_name) { - // Rename file to make it distinguishable. Unfortunately, it's not possible to get STAGE_NAME in a parallel stage - // Thus, we have to pick a name manually and rename the files so that they can be stored separately. 
- if (fileExists(original_file_name)) { - bat 'xcopy ' + original_file_name + ' ' + new_file_name + '*' - archiveArtifacts artifacts: new_file_name - } -} - -def docker_run(platform, function_name, use_nvidia, shared_mem = '500m') { - def command = "ci/build.py --docker-registry ${env.DOCKER_CACHE_REGISTRY} %USE_NVIDIA% --platform %PLATFORM% --docker-build-retries 3 --shm-size %SHARED_MEM% /work/runtime_functions.sh %FUNCTION_NAME%" - command = command.replaceAll('%USE_NVIDIA%', use_nvidia ? '--nvidiadocker' : '') - command = command.replaceAll('%PLATFORM%', platform) - command = command.replaceAll('%FUNCTION_NAME%', function_name) - command = command.replaceAll('%SHARED_MEM%', shared_mem) - - sh command -} // Python unittest for CPU // Python 2 def python2_ut(docker_container_name) { timeout(time: max_time, unit: 'MINUTES') { - docker_run(docker_container_name, 'unittest_ubuntu_python2_cpu', false) + utils.docker_run(docker_container_name, 'unittest_ubuntu_python2_cpu', false) } } // Python 3 def python3_ut(docker_container_name) { timeout(time: max_time, unit: 'MINUTES') { - docker_run(docker_container_name, 'unittest_ubuntu_python3_cpu', false) + utils.docker_run(docker_container_name, 'unittest_ubuntu_python3_cpu', false) } } def python3_ut_mkldnn(docker_container_name) { timeout(time: max_time, unit: 'MINUTES') { - docker_run(docker_container_name, 'unittest_ubuntu_python3_cpu_mkldnn', false) + utils.docker_run(docker_container_name, 'unittest_ubuntu_python3_cpu_mkldnn', false) } } @@ -152,39 +61,47 @@ def python3_ut_mkldnn(docker_container_name) { // Python 2 def python2_gpu_ut(docker_container_name) { timeout(time: max_time, unit: 'MINUTES') { - docker_run(docker_container_name, 'unittest_ubuntu_python2_gpu', true) + utils.docker_run(docker_container_name, 'unittest_ubuntu_python2_gpu', true) } } // Python 3 def python3_gpu_ut(docker_container_name) { timeout(time: max_time, unit: 'MINUTES') { - docker_run(docker_container_name, 'unittest_ubuntu_python3_gpu', true) + utils.docker_run(docker_container_name, 'unittest_ubuntu_python3_gpu', true) } } // Python 3 NOCUDNN def python3_gpu_ut_nocudnn(docker_container_name) { timeout(time: max_time, unit: 'MINUTES') { - docker_run(docker_container_name, 'unittest_ubuntu_python3_gpu_nocudnn', true) + utils.docker_run(docker_container_name, 'unittest_ubuntu_python3_gpu_nocudnn', true) } } -try { +node('mxnetlinux-cpu') { + // Loading the utilities requires a node context unfortunately + checkout scm + utils = load('ci/Jenkinsfile_utils.groovy') +} +utils.assign_node_labels(linux_cpu: 'mxnetlinux-cpu', linux_gpu: 'mxnetlinux-gpu', linux_gpu_p3: 'mxnetlinux-gpu-p3', windows_cpu: 'mxnetwindows-cpu', windows_gpu: 'mxnetwindows-gpu') + +utils.main_wrapper( +core_logic: { stage('Sanity Check') { parallel 'Lint': { - node('mxnetlinux-cpu') { + node(NODE_LINUX_CPU) { ws('workspace/sanity-lint') { - init_git() - docker_run('ubuntu_cpu', 'sanity_check', false) + utils.init_git() + utils.docker_run('ubuntu_cpu', 'sanity_check', false) } } }, 'RAT License': { - node('mxnetlinux-cpu') { + node(NODE_LINUX_CPU) { ws('workspace/sanity-rat') { - init_git() - docker_run('ubuntu_rat', 'nightly_test_rat_check', false) + utils.init_git() + utils.docker_run('ubuntu_rat', 'nightly_test_rat_check', false) } } } @@ -192,142 +109,142 @@ try { stage('Build') { parallel 'CPU: CentOS 7': { - node('mxnetlinux-cpu') { + node(NODE_LINUX_CPU) { ws('workspace/build-centos7-cpu') { timeout(time: max_time, unit: 'MINUTES') { - init_git() - docker_run('centos7_cpu', 
'build_centos7_cpu', false) - pack_lib('centos7_cpu') + utils.init_git() + utils.docker_run('centos7_cpu', 'build_centos7_cpu', false) + utils.pack_lib('centos7_cpu', mx_lib) } } } }, 'CPU: CentOS 7 MKLDNN': { - node('mxnetlinux-cpu') { + node(NODE_LINUX_CPU) { ws('workspace/build-centos7-mkldnn') { timeout(time: max_time, unit: 'MINUTES') { - init_git() - docker_run('centos7_cpu', 'build_centos7_mkldnn', false) - pack_lib('centos7_mkldnn') + utils.init_git() + utils.docker_run('centos7_cpu', 'build_centos7_mkldnn', false) + utils.pack_lib('centos7_mkldnn', mx_lib) } } } }, 'GPU: CentOS 7': { - node('mxnetlinux-cpu') { + node(NODE_LINUX_CPU) { ws('workspace/build-centos7-gpu') { timeout(time: max_time, unit: 'MINUTES') { - init_git() - docker_run('centos7_gpu', 'build_centos7_gpu', false) - pack_lib('centos7_gpu') + utils.init_git() + utils.docker_run('centos7_gpu', 'build_centos7_gpu', false) + utils.pack_lib('centos7_gpu', mx_lib) } } } }, 'CPU: Openblas': { - node('mxnetlinux-cpu') { + node(NODE_LINUX_CPU) { ws('workspace/build-cpu-openblas') { timeout(time: max_time, unit: 'MINUTES') { - init_git() - docker_run('ubuntu_cpu', 'build_ubuntu_cpu_openblas', false) - pack_lib('cpu', mx_dist_lib) + utils.init_git() + utils.docker_run('ubuntu_cpu', 'build_ubuntu_cpu_openblas', false) + utils.pack_lib('cpu', mx_dist_lib) } } } }, 'CPU: Openblas, debug': { - node('mxnetlinux-cpu') { + node(NODE_LINUX_CPU) { ws('workspace/build-cpu-openblas') { timeout(time: max_time, unit: 'MINUTES') { - init_git() - docker_run('ubuntu_cpu', 'build_ubuntu_cpu_cmake_debug', false) - pack_lib('cpu_debug', mx_cmake_lib_debug) + utils.init_git() + utils.docker_run('ubuntu_cpu', 'build_ubuntu_cpu_cmake_debug', false) + utils.pack_lib('cpu_debug', mx_cmake_lib_debug) } } } }, 'CPU: Clang 3.9': { - node('mxnetlinux-cpu') { + node(NODE_LINUX_CPU) { ws('workspace/build-cpu-clang39') { timeout(time: max_time, unit: 'MINUTES') { - init_git() - docker_run('ubuntu_cpu', 'build_ubuntu_cpu_clang39', false) + utils.init_git() + utils.docker_run('ubuntu_cpu', 'build_ubuntu_cpu_clang39', false) } } } }, 'CPU: Clang 5': { - node('mxnetlinux-cpu') { + node(NODE_LINUX_CPU) { ws('workspace/build-cpu-clang50') { timeout(time: max_time, unit: 'MINUTES') { - init_git() - docker_run('ubuntu_cpu', 'build_ubuntu_cpu_clang50', false) + utils.init_git() + utils.docker_run('ubuntu_cpu', 'build_ubuntu_cpu_clang50', false) } } } }, 'CPU: Clang 3.9 MKLDNN': { - node('mxnetlinux-cpu') { + node(NODE_LINUX_CPU) { ws('workspace/build-cpu-mkldnn-clang39') { timeout(time: max_time, unit: 'MINUTES') { - init_git() - docker_run('ubuntu_cpu', 'build_ubuntu_cpu_clang39_mkldnn', false) - pack_lib('mkldnn_cpu_clang3', mx_mkldnn_lib) + utils.init_git() + utils.docker_run('ubuntu_cpu', 'build_ubuntu_cpu_clang39_mkldnn', false) + utils.pack_lib('mkldnn_cpu_clang3', mx_mkldnn_lib) } } } }, 'CPU: Clang 5 MKLDNN': { - node('mxnetlinux-cpu') { + node(NODE_LINUX_CPU) { ws('workspace/build-cpu-mkldnn-clang50') { timeout(time: max_time, unit: 'MINUTES') { - init_git() - docker_run('ubuntu_cpu', 'build_ubuntu_cpu_clang50_mkldnn', false) - pack_lib('mkldnn_cpu_clang5', mx_mkldnn_lib) + utils.init_git() + utils.docker_run('ubuntu_cpu', 'build_ubuntu_cpu_clang50_mkldnn', false) + utils.pack_lib('mkldnn_cpu_clang5', mx_mkldnn_lib) } } } }, 'CPU: MKLDNN': { - node('mxnetlinux-cpu') { + node(NODE_LINUX_CPU) { ws('workspace/build-mkldnn-cpu') { timeout(time: max_time, unit: 'MINUTES') { - init_git() - docker_run('ubuntu_cpu', 'build_ubuntu_cpu_mkldnn', false) - 
pack_lib('mkldnn_cpu', mx_mkldnn_lib) + utils.init_git() + utils.docker_run('ubuntu_cpu', 'build_ubuntu_cpu_mkldnn', false) + utils.pack_lib('mkldnn_cpu', mx_mkldnn_lib) } } } }, 'GPU: MKLDNN': { - node('mxnetlinux-cpu') { + node(NODE_LINUX_CPU) { ws('workspace/build-mkldnn-gpu') { timeout(time: max_time, unit: 'MINUTES') { - init_git() - docker_run('ubuntu_build_cuda', 'build_ubuntu_gpu_mkldnn', false) - pack_lib('mkldnn_gpu', mx_mkldnn_lib) + utils.init_git() + utils.docker_run('ubuntu_build_cuda', 'build_ubuntu_gpu_mkldnn', false) + utils.pack_lib('mkldnn_gpu', mx_mkldnn_lib) } } } }, 'GPU: MKLDNN_CUDNNOFF': { - node('mxnetlinux-cpu') { + node(NODE_LINUX_CPU) { ws('workspace/build-mkldnn-gpu-nocudnn') { timeout(time: max_time, unit: 'MINUTES') { - init_git() - docker_run('ubuntu_build_cuda', 'build_ubuntu_gpu_mkldnn_nocudnn', false) - pack_lib('mkldnn_gpu_nocudnn', mx_mkldnn_lib) + utils.init_git() + utils.docker_run('ubuntu_build_cuda', 'build_ubuntu_gpu_mkldnn_nocudnn', false) + utils.pack_lib('mkldnn_gpu_nocudnn', mx_mkldnn_lib) } } } }, 'GPU: CUDA9.1+cuDNN7': { - node('mxnetlinux-cpu') { + node(NODE_LINUX_CPU) { ws('workspace/build-gpu') { timeout(time: max_time, unit: 'MINUTES') { - init_git() - docker_run('ubuntu_build_cuda', 'build_ubuntu_gpu_cuda91_cudnn7', false) - pack_lib('gpu', mx_dist_lib) + utils.init_git() + utils.docker_run('ubuntu_build_cuda', 'build_ubuntu_gpu_cuda91_cudnn7', false) + utils.pack_lib('gpu', mx_dist_lib) stash includes: 'build/cpp-package/example/lenet', name: 'cpp_lenet' stash includes: 'build/cpp-package/example/alexnet', name: 'cpp_alexnet' stash includes: 'build/cpp-package/example/googlenet', name: 'cpp_googlenet' @@ -343,54 +260,65 @@ try { } }, 'Amalgamation MIN': { - node('mxnetlinux-cpu') { + node(NODE_LINUX_CPU) { ws('workspace/amalgamationmin') { timeout(time: max_time, unit: 'MINUTES') { - init_git() - docker_run('ubuntu_cpu', 'build_ubuntu_amalgamation_min', false) + utils.init_git() + utils.docker_run('ubuntu_cpu', 'build_ubuntu_amalgamation_min', false) } } } }, 'Amalgamation': { - node('mxnetlinux-cpu') { + node(NODE_LINUX_CPU) { ws('workspace/amalgamation') { timeout(time: max_time, unit: 'MINUTES') { - init_git() - docker_run('ubuntu_cpu', 'build_ubuntu_amalgamation', false) + utils.init_git() + utils.docker_run('ubuntu_cpu', 'build_ubuntu_amalgamation', false) } } } }, 'GPU: CMake MKLDNN': { - node('mxnetlinux-cpu') { + node(NODE_LINUX_CPU) { ws('workspace/build-cmake-mkldnn-gpu') { timeout(time: max_time, unit: 'MINUTES') { - init_git() - docker_run('ubuntu_gpu', 'build_ubuntu_gpu_cmake_mkldnn', false) - pack_lib('cmake_mkldnn_gpu', mx_cmake_mkldnn_lib) + utils.init_git() + utils.docker_run('ubuntu_gpu', 'build_ubuntu_gpu_cmake_mkldnn', false) + utils.pack_lib('cmake_mkldnn_gpu', mx_cmake_mkldnn_lib) } } } }, 'GPU: CMake': { - node('mxnetlinux-cpu') { + node(NODE_LINUX_CPU) { ws('workspace/build-cmake-gpu') { timeout(time: max_time, unit: 'MINUTES') { - init_git() - docker_run('ubuntu_gpu', 'build_ubuntu_gpu_cmake', false) - pack_lib('cmake_gpu', mx_cmake_lib) + utils.init_git() + utils.docker_run('ubuntu_gpu', 'build_ubuntu_gpu_cmake', false) + utils.pack_lib('cmake_gpu', mx_cmake_lib) + } + } + } + }, + 'TensorRT': { + node(NODE_LINUX_CPU) { + ws('workspace/build-tensorrt') { + timeout(time: max_time, unit: 'MINUTES') { + utils.init_git() + utils.docker_run('ubuntu_gpu_tensorrt', 'build_ubuntu_gpu_tensorrt', false) + utils.pack_lib('tensorrt', mx_tensorrt_lib) } } } }, 'Build CPU windows':{ - node('mxnetwindows-cpu') { + 
node(NODE_WINDOWS_CPU) { timeout(time: max_time, unit: 'MINUTES') { ws('workspace/build-cpu') { withEnv(['OpenBLAS_HOME=C:\\mxnet\\openblas', 'OpenCV_DIR=C:\\mxnet\\opencv_vc14', 'CUDA_PATH=C:\\CUDA\\v8.0']) { - init_git_win() + utils.init_git_win() powershell 'python ci/build_windows.py -f WIN_CPU' stash includes: 'windows_package.7z', name: 'windows_package_cpu' } @@ -400,11 +328,11 @@ try { }, 'Build GPU windows':{ - node('mxnetwindows-cpu') { + node(NODE_WINDOWS_CPU) { timeout(time: max_time, unit: 'MINUTES') { ws('workspace/build-gpu') { withEnv(['OpenBLAS_HOME=C:\\mxnet\\openblas', 'OpenCV_DIR=C:\\mxnet\\opencv_vc14', 'CUDA_PATH=C:\\CUDA\\v8.0']) { - init_git_win() + utils.init_git_win() powershell 'python ci/build_windows.py -f WIN_GPU' stash includes: 'windows_package.7z', name: 'windows_package_gpu' } @@ -413,11 +341,11 @@ try { } }, 'Build GPU MKLDNN windows':{ - node('mxnetwindows-cpu') { + node(NODE_WINDOWS_CPU) { timeout(time: max_time, unit: 'MINUTES') { ws('workspace/build-gpu') { withEnv(['OpenBLAS_HOME=C:\\mxnet\\openblas', 'OpenCV_DIR=C:\\mxnet\\opencv_vc14', 'CUDA_PATH=C:\\CUDA\\v8.0','BUILD_NAME=vc14_gpu_mkldnn']) { - init_git_win() + utils.init_git_win() powershell 'python ci/build_windows.py -f WIN_GPU_MKLDNN' stash includes: 'windows_package.7z', name: 'windows_package_gpu_mkldnn' } @@ -426,61 +354,61 @@ try { } }, 'NVidia Jetson / ARMv8':{ - node('mxnetlinux-cpu') { + node(NODE_LINUX_CPU) { ws('workspace/build-jetson-armv8') { timeout(time: max_time, unit: 'MINUTES') { - init_git() - docker_run('jetson', 'build_jetson', false) + utils.init_git() + utils.docker_run('jetson', 'build_jetson', false) } } } }, 'ARMv7':{ - node('mxnetlinux-cpu') { + node(NODE_LINUX_CPU) { ws('workspace/build-ARMv7') { timeout(time: max_time, unit: 'MINUTES') { - init_git() - docker_run('armv7', 'build_armv7', false) + utils.init_git() + utils.docker_run('armv7', 'build_armv7', false) } } } }, 'ARMv6':{ - node('mxnetlinux-cpu') { + node(NODE_LINUX_CPU) { ws('workspace/build-ARMv6') { timeout(time: max_time, unit: 'MINUTES') { - init_git() - docker_run('armv6', 'build_armv6', false) + utils.init_git() + utils.docker_run('armv6', 'build_armv6', false) } } } }, 'ARMv8':{ - node('mxnetlinux-cpu') { + node(NODE_LINUX_CPU) { ws('workspace/build-ARMv8') { timeout(time: max_time, unit: 'MINUTES') { - init_git() - docker_run('armv8', 'build_armv8', false) + utils.init_git() + utils.docker_run('armv8', 'build_armv8', false) } } } }, 'Android / ARMv8':{ - node('mxnetlinux-cpu') { + node(NODE_LINUX_CPU) { ws('workspace/android64') { timeout(time: max_time, unit: 'MINUTES') { - init_git() - docker_run('android_armv8', 'build_android_armv8', false) + utils.init_git() + utils.docker_run('android_armv8', 'build_android_armv8', false) } } } }, 'Android / ARMv7':{ - node('mxnetlinux-cpu') { + node(NODE_LINUX_CPU) { ws('workspace/androidv7') { timeout(time: max_time, unit: 'MINUTES') { - init_git() - docker_run('android_armv7', 'build_android_armv7', false) + utils.init_git() + utils.docker_run('android_armv7', 'build_android_armv7', false) } } } @@ -490,434 +418,450 @@ try { stage('Tests') { parallel 'Python2: CPU': { - node('mxnetlinux-cpu') { + node(NODE_LINUX_CPU) { ws('workspace/ut-python2-cpu') { try { - init_git() - unpack_lib('cpu') + utils.init_git() + utils.unpack_lib('cpu', mx_lib) python2_ut('ubuntu_cpu') - publish_test_coverage() + utils.publish_test_coverage() } finally { - collect_test_results_unix('nosetests_unittest.xml', 'nosetests_python2_cpu_unittest.xml') - 
collect_test_results_unix('nosetests_train.xml', 'nosetests_python2_cpu_train.xml') - collect_test_results_unix('nosetests_quantization.xml', 'nosetests_python2_cpu_quantization.xml') + utils.collect_test_results_unix('nosetests_unittest.xml', 'nosetests_python2_cpu_unittest.xml') + utils.collect_test_results_unix('nosetests_train.xml', 'nosetests_python2_cpu_train.xml') + utils.collect_test_results_unix('nosetests_quantization.xml', 'nosetests_python2_cpu_quantization.xml') } } } }, 'Python3: CPU': { - node('mxnetlinux-cpu') { + node(NODE_LINUX_CPU) { ws('workspace/ut-python3-cpu') { try { - init_git() - unpack_lib('cpu') + utils.init_git() + utils.unpack_lib('cpu', mx_lib) python3_ut('ubuntu_cpu') - publish_test_coverage() + utils.publish_test_coverage() } finally { - collect_test_results_unix('nosetests_unittest.xml', 'nosetests_python3_cpu_unittest.xml') - collect_test_results_unix('nosetests_quantization.xml', 'nosetests_python3_cpu_quantization.xml') + utils.collect_test_results_unix('nosetests_unittest.xml', 'nosetests_python3_cpu_unittest.xml') + utils.collect_test_results_unix('nosetests_quantization.xml', 'nosetests_python3_cpu_quantization.xml') } } } }, 'Python3: CPU debug': { - node('mxnetlinux-cpu') { + node(NODE_LINUX_CPU) { ws('workspace/ut-python3-cpu-debug') { try { - init_git() - unpack_lib('cpu_debug', mx_cmake_lib_debug) + utils.init_git() + utils.unpack_lib('cpu_debug', mx_cmake_lib_debug) python3_ut('ubuntu_cpu') } finally { - collect_test_results_unix('nosetests_unittest.xml', 'nosetests_python3_cpu_debug_unittest.xml') - collect_test_results_unix('nosetests_quantization.xml', 'nosetests_python3_cpu_debug_quantization.xml') + utils.collect_test_results_unix('nosetests_unittest.xml', 'nosetests_python3_cpu_debug_unittest.xml') + utils.collect_test_results_unix('nosetests_quantization.xml', 'nosetests_python3_cpu_debug_quantization.xml') } } } }, 'Python2: GPU': { - node('mxnetlinux-gpu') { + node(NODE_LINUX_GPU) { ws('workspace/ut-python2-gpu') { try { - init_git() - unpack_lib('gpu', mx_lib) + utils.init_git() + utils.unpack_lib('gpu', mx_lib) python2_gpu_ut('ubuntu_gpu') - publish_test_coverage() + utils.publish_test_coverage() } finally { - collect_test_results_unix('nosetests_gpu.xml', 'nosetests_python2_gpu.xml') + utils.collect_test_results_unix('nosetests_gpu.xml', 'nosetests_python2_gpu.xml') } } } }, 'Python3: GPU': { - node('mxnetlinux-gpu') { + node(NODE_LINUX_GPU) { ws('workspace/ut-python3-gpu') { try { - init_git() - unpack_lib('gpu', mx_lib) + utils.init_git() + utils.unpack_lib('gpu', mx_lib) python3_gpu_ut('ubuntu_gpu') - publish_test_coverage() + utils.publish_test_coverage() } finally { - collect_test_results_unix('nosetests_gpu.xml', 'nosetests_python3_gpu.xml') + utils.collect_test_results_unix('nosetests_gpu.xml', 'nosetests_python3_gpu.xml') } } } }, 'Python2: Quantize GPU': { - node('mxnetlinux-gpu-p3') { + node(NODE_LINUX_GPU_P3) { ws('workspace/ut-python2-quantize-gpu') { timeout(time: max_time, unit: 'MINUTES') { try { - init_git() - unpack_lib('gpu', mx_lib) - docker_run('ubuntu_gpu', 'unittest_ubuntu_python2_quantization_gpu', true) - publish_test_coverage() + utils.init_git() + utils.unpack_lib('gpu', mx_lib) + utils.docker_run('ubuntu_gpu', 'unittest_ubuntu_python2_quantization_gpu', true) + utils.publish_test_coverage() } finally { - collect_test_results_unix('nosetests_quantization_gpu.xml', 'nosetests_python2_quantize_gpu.xml') + utils.collect_test_results_unix('nosetests_quantization_gpu.xml', 'nosetests_python2_quantize_gpu.xml') } 
} } } }, 'Python3: Quantize GPU': { - node('mxnetlinux-gpu-p3') { + node(NODE_LINUX_GPU_P3) { ws('workspace/ut-python3-quantize-gpu') { timeout(time: max_time, unit: 'MINUTES') { try { - init_git() - unpack_lib('gpu', mx_lib) - docker_run('ubuntu_gpu', 'unittest_ubuntu_python3_quantization_gpu', true) - publish_test_coverage() + utils.init_git() + utils.unpack_lib('gpu', mx_lib) + utils.docker_run('ubuntu_gpu', 'unittest_ubuntu_python3_quantization_gpu', true) + utils.publish_test_coverage() } finally { - collect_test_results_unix('nosetests_quantization_gpu.xml', 'nosetests_python3_quantize_gpu.xml') + utils.collect_test_results_unix('nosetests_quantization_gpu.xml', 'nosetests_python3_quantize_gpu.xml') } } } } }, 'Python2: MKLDNN-CPU': { - node('mxnetlinux-cpu') { + node(NODE_LINUX_CPU) { ws('workspace/ut-python2-mkldnn-cpu') { try { - init_git() - unpack_lib('mkldnn_cpu', mx_mkldnn_lib) + utils.init_git() + utils.unpack_lib('mkldnn_cpu', mx_mkldnn_lib) python2_ut('ubuntu_cpu') - publish_test_coverage() + utils.publish_test_coverage() } finally { - collect_test_results_unix('nosetests_unittest.xml', 'nosetests_python2_mkldnn_cpu_unittest.xml') - collect_test_results_unix('nosetests_train.xml', 'nosetests_python2_mkldnn_cpu_train.xml') - collect_test_results_unix('nosetests_quantization.xml', 'nosetests_python2_mkldnn_cpu_quantization.xml') + utils.collect_test_results_unix('nosetests_unittest.xml', 'nosetests_python2_mkldnn_cpu_unittest.xml') + utils.collect_test_results_unix('nosetests_train.xml', 'nosetests_python2_mkldnn_cpu_train.xml') + utils.collect_test_results_unix('nosetests_quantization.xml', 'nosetests_python2_mkldnn_cpu_quantization.xml') } } } }, 'Python2: MKLDNN-GPU': { - node('mxnetlinux-gpu') { + node(NODE_LINUX_GPU) { ws('workspace/ut-python2-mkldnn-gpu') { try { - init_git() - unpack_lib('mkldnn_gpu', mx_mkldnn_lib) + utils.init_git() + utils.unpack_lib('mkldnn_gpu', mx_mkldnn_lib) python2_gpu_ut('ubuntu_gpu') - publish_test_coverage() + utils.publish_test_coverage() } finally { - collect_test_results_unix('nosetests_gpu.xml', 'nosetests_python2_mkldnn_gpu.xml') + utils.collect_test_results_unix('nosetests_gpu.xml', 'nosetests_python2_mkldnn_gpu.xml') } } } }, 'Python3: MKLDNN-CPU': { - node('mxnetlinux-cpu') { + node(NODE_LINUX_CPU) { ws('workspace/ut-python3-mkldnn-cpu') { try { - init_git() - unpack_lib('mkldnn_cpu', mx_mkldnn_lib) + utils.init_git() + utils.unpack_lib('mkldnn_cpu', mx_mkldnn_lib) python3_ut_mkldnn('ubuntu_cpu') - publish_test_coverage() + utils.publish_test_coverage() } finally { - collect_test_results_unix('nosetests_unittest.xml', 'nosetests_python3_mkldnn_cpu_unittest.xml') - collect_test_results_unix('nosetests_mkl.xml', 'nosetests_python3_mkldnn_cpu_mkl.xml') + utils.collect_test_results_unix('nosetests_unittest.xml', 'nosetests_python3_mkldnn_cpu_unittest.xml') + utils.collect_test_results_unix('nosetests_mkl.xml', 'nosetests_python3_mkldnn_cpu_mkl.xml') } } } }, 'Python3: MKLDNN-GPU': { - node('mxnetlinux-gpu') { + node(NODE_LINUX_GPU) { ws('workspace/ut-python3-mkldnn-gpu') { try { - init_git() - unpack_lib('mkldnn_gpu', mx_mkldnn_lib) + utils.init_git() + utils.unpack_lib('mkldnn_gpu', mx_mkldnn_lib) python3_gpu_ut('ubuntu_gpu') - publish_test_coverage() + utils.publish_test_coverage() } finally { - collect_test_results_unix('nosetests_gpu.xml', 'nosetests_python3_mkldnn_gpu.xml') + utils.collect_test_results_unix('nosetests_gpu.xml', 'nosetests_python3_mkldnn_gpu.xml') } } } }, 'Python3: MKLDNN-GPU-NOCUDNN': { - node('mxnetlinux-gpu') { + 
node(NODE_LINUX_GPU) { ws('workspace/ut-python3-mkldnn-gpu-nocudnn') { try { - init_git() - unpack_lib('mkldnn_gpu_nocudnn', mx_mkldnn_lib) + utils.init_git() + utils.unpack_lib('mkldnn_gpu_nocudnn', mx_mkldnn_lib) python3_gpu_ut_nocudnn('ubuntu_gpu') - publish_test_coverage() + utils.publish_test_coverage() } finally { - collect_test_results_unix('nosetests_gpu.xml', 'nosetests_python3_mkldnn_gpu_nocudnn.xml') + utils.collect_test_results_unix('nosetests_gpu.xml', 'nosetests_python3_mkldnn_gpu_nocudnn.xml') } } } }, 'Python3: CentOS 7 CPU': { - node('mxnetlinux-cpu') { + node(NODE_LINUX_CPU) { ws('workspace/build-centos7-cpu') { timeout(time: max_time, unit: 'MINUTES') { try { - init_git() - unpack_lib('centos7_cpu') - docker_run('centos7_cpu', 'unittest_centos7_cpu', false) - publish_test_coverage() + utils.init_git() + utils.unpack_lib('centos7_cpu', mx_lib) + utils.docker_run('centos7_cpu', 'unittest_centos7_cpu', false) + utils.publish_test_coverage() } finally { - collect_test_results_unix('nosetests_unittest.xml', 'nosetests_python3_centos7_cpu_unittest.xml') - collect_test_results_unix('nosetests_train.xml', 'nosetests_python3_centos7_cpu_train.xml') + utils.collect_test_results_unix('nosetests_unittest.xml', 'nosetests_python3_centos7_cpu_unittest.xml') + utils.collect_test_results_unix('nosetests_train.xml', 'nosetests_python3_centos7_cpu_train.xml') } } } } }, 'Python3: CentOS 7 GPU': { - node('mxnetlinux-gpu') { + node(NODE_LINUX_GPU) { ws('workspace/build-centos7-gpu') { timeout(time: max_time, unit: 'MINUTES') { try { - init_git() - unpack_lib('centos7_gpu') - docker_run('centos7_gpu', 'unittest_centos7_gpu', true) - publish_test_coverage() + utils.init_git() + utils.unpack_lib('centos7_gpu', mx_lib) + utils.docker_run('centos7_gpu', 'unittest_centos7_gpu', true) + utils.publish_test_coverage() + } finally { + utils.collect_test_results_unix('nosetests_gpu.xml', 'nosetests_python3_centos7_gpu.xml') + } + } + } + } + }, + 'Python3: TensorRT GPU': { + node(NODE_LINUX_GPU_P3) { + ws('workspace/build-tensorrt') { + timeout(time: max_time, unit: 'MINUTES') { + try { + utils.init_git() + utils.unpack_lib('tensorrt', mx_tensorrt_lib) + utils.docker_run('ubuntu_gpu_tensorrt', 'unittest_ubuntu_tensorrt_gpu', true) + utils.publish_test_coverage() } finally { - collect_test_results_unix('nosetests_gpu.xml', 'nosetests_python3_centos7_gpu.xml') + utils.collect_test_results_unix('nosetests_tensorrt.xml', 'nosetests_python3_tensorrt_gpu.xml') } } } } }, 'Scala: CPU': { - node('mxnetlinux-cpu') { + node(NODE_LINUX_CPU) { ws('workspace/ut-scala-cpu') { timeout(time: max_time, unit: 'MINUTES') { - init_git() - unpack_lib('cpu', mx_dist_lib) - docker_run('ubuntu_cpu', 'unittest_ubuntu_cpu_scala', false) - publish_test_coverage() + utils.init_git() + utils.unpack_lib('cpu', mx_dist_lib) + utils.docker_run('ubuntu_cpu', 'unittest_ubuntu_cpu_scala', false) + utils.publish_test_coverage() } } } }, 'Clojure: CPU': { - node('mxnetlinux-cpu') { + node(NODE_LINUX_CPU) { ws('workspace/ut-clojure-cpu') { timeout(time: max_time, unit: 'MINUTES') { - init_git() - unpack_lib('cpu', mx_dist_lib) - docker_run('ubuntu_cpu', 'unittest_ubuntu_cpu_clojure', false) - publish_test_coverage() + utils.init_git() + utils.unpack_lib('cpu', mx_dist_lib) + utils.docker_run('ubuntu_cpu', 'unittest_ubuntu_cpu_clojure', false) + utils.publish_test_coverage() } } } }, 'Perl: CPU': { - node('mxnetlinux-cpu') { + node(NODE_LINUX_CPU) { ws('workspace/ut-perl-cpu') { timeout(time: max_time, unit: 'MINUTES') { - init_git() - 
unpack_lib('cpu') - docker_run('ubuntu_cpu', 'unittest_ubuntu_cpugpu_perl', false) - publish_test_coverage() + utils.init_git() + utils.unpack_lib('cpu', mx_lib) + utils.docker_run('ubuntu_cpu', 'unittest_ubuntu_cpugpu_perl', false) + utils.publish_test_coverage() } } } }, 'Perl: GPU': { - node('mxnetlinux-gpu') { + node(NODE_LINUX_GPU) { ws('workspace/ut-perl-gpu') { timeout(time: max_time, unit: 'MINUTES') { - init_git() - unpack_lib('gpu') - docker_run('ubuntu_gpu', 'unittest_ubuntu_cpugpu_perl', true) - publish_test_coverage() + utils.init_git() + utils.unpack_lib('gpu', mx_lib) + utils.docker_run('ubuntu_gpu', 'unittest_ubuntu_cpugpu_perl', true) + utils.publish_test_coverage() } } } }, 'Cpp: GPU': { - node('mxnetlinux-gpu') { + node(NODE_LINUX_GPU) { ws('workspace/ut-cpp-gpu') { timeout(time: max_time, unit: 'MINUTES') { - init_git() - unpack_lib('cmake_gpu', mx_cmake_lib) - docker_run('ubuntu_gpu', 'unittest_ubuntu_gpu_cpp', true) - publish_test_coverage() + utils.init_git() + utils.unpack_lib('cmake_gpu', mx_cmake_lib) + utils.docker_run('ubuntu_gpu', 'unittest_ubuntu_gpu_cpp', true) + utils.publish_test_coverage() } } } }, 'Cpp: MKLDNN+GPU': { - node('mxnetlinux-gpu') { + node(NODE_LINUX_GPU) { ws('workspace/ut-cpp-mkldnn-gpu') { timeout(time: max_time, unit: 'MINUTES') { - init_git() - unpack_lib('cmake_mkldnn_gpu', mx_cmake_mkldnn_lib) - docker_run('ubuntu_gpu', 'unittest_ubuntu_gpu_cpp', true) - publish_test_coverage() + utils.init_git() + utils.unpack_lib('cmake_mkldnn_gpu', mx_cmake_mkldnn_lib) + utils.docker_run('ubuntu_gpu', 'unittest_ubuntu_gpu_cpp', true) + utils.publish_test_coverage() } } } }, 'R: CPU': { - node('mxnetlinux-cpu') { + node(NODE_LINUX_CPU) { ws('workspace/ut-r-cpu') { timeout(time: max_time, unit: 'MINUTES') { - init_git() - unpack_lib('cpu') - docker_run('ubuntu_cpu', 'unittest_ubuntu_cpu_R', false) - publish_test_coverage() + utils.init_git() + utils.unpack_lib('cpu', mx_lib) + utils.docker_run('ubuntu_cpu', 'unittest_ubuntu_cpu_R', false) + utils.publish_test_coverage() } } } }, 'R: GPU': { - node('mxnetlinux-gpu') { + node(NODE_LINUX_GPU) { ws('workspace/ut-r-gpu') { timeout(time: max_time, unit: 'MINUTES') { - init_git() - unpack_lib('gpu') - docker_run('ubuntu_gpu', 'unittest_ubuntu_gpu_R', true) - publish_test_coverage() + utils.init_git() + utils.unpack_lib('gpu', mx_lib) + utils.docker_run('ubuntu_gpu', 'unittest_ubuntu_gpu_R', true) + utils.publish_test_coverage() } } } }, 'Python 2: CPU Win':{ - node('mxnetwindows-cpu') { + node(NODE_WINDOWS_CPU) { timeout(time: max_time, unit: 'MINUTES') { ws('workspace/ut-python-cpu') { try { - init_git_win() + utils.init_git_win() unstash 'windows_package_cpu' powershell 'ci/windows/test_py2_cpu.ps1' } finally { - collect_test_results_windows('nosetests_unittest.xml', 'nosetests_unittest_windows_python2_cpu.xml') + utils.collect_test_results_windows('nosetests_unittest.xml', 'nosetests_unittest_windows_python2_cpu.xml') } } } } }, 'Python 3: CPU Win': { - node('mxnetwindows-cpu') { + node(NODE_WINDOWS_CPU) { timeout(time: max_time, unit: 'MINUTES') { ws('workspace/ut-python-cpu') { try { - init_git_win() + utils.init_git_win() unstash 'windows_package_cpu' powershell 'ci/windows/test_py3_cpu.ps1' } finally { - collect_test_results_windows('nosetests_unittest.xml', 'nosetests_unittest_windows_python3_cpu.xml') + utils.collect_test_results_windows('nosetests_unittest.xml', 'nosetests_unittest_windows_python3_cpu.xml') } } } } }, 'Python 2: GPU Win':{ - node('mxnetwindows-gpu') { + node(NODE_WINDOWS_GPU) { 
timeout(time: max_time, unit: 'MINUTES') { ws('workspace/ut-python-gpu') { try { - init_git_win() + utils.init_git_win() unstash 'windows_package_gpu' powershell 'ci/windows/test_py2_gpu.ps1' } finally { - collect_test_results_windows('nosetests_forward.xml', 'nosetests_gpu_forward_windows_python2_gpu.xml') - collect_test_results_windows('nosetests_operator.xml', 'nosetests_gpu_operator_windows_python2_gpu.xml') + utils.collect_test_results_windows('nosetests_forward.xml', 'nosetests_gpu_forward_windows_python2_gpu.xml') + utils.collect_test_results_windows('nosetests_operator.xml', 'nosetests_gpu_operator_windows_python2_gpu.xml') } } } } }, 'Python 3: GPU Win':{ - node('mxnetwindows-gpu') { + node(NODE_WINDOWS_GPU) { timeout(time: max_time, unit: 'MINUTES') { ws('workspace/ut-python-gpu') { try { - init_git_win() + utils.init_git_win() unstash 'windows_package_gpu' powershell 'ci/windows/test_py3_gpu.ps1' } finally { - collect_test_results_windows('nosetests_forward.xml', 'nosetests_gpu_forward_windows_python3_gpu.xml') - collect_test_results_windows('nosetests_operator.xml', 'nosetests_gpu_operator_windows_python3_gpu.xml') + utils.collect_test_results_windows('nosetests_forward.xml', 'nosetests_gpu_forward_windows_python3_gpu.xml') + utils.collect_test_results_windows('nosetests_operator.xml', 'nosetests_gpu_operator_windows_python3_gpu.xml') } } } } }, 'Python 3: MKLDNN-GPU Win':{ - node('mxnetwindows-gpu') { + node(NODE_WINDOWS_GPU) { timeout(time: max_time, unit: 'MINUTES') { ws('workspace/ut-python-gpu') { try { - init_git_win() + utils.init_git_win() unstash 'windows_package_gpu_mkldnn' powershell 'ci/windows/test_py3_gpu.ps1' } finally { - collect_test_results_windows('nosetests_forward.xml', 'nosetests_gpu_forward_windows_python3_gpu_mkldnn.xml') - collect_test_results_windows('nosetests_operator.xml', 'nosetests_gpu_operator_windows_python3_gpu_mkldnn.xml') + utils.collect_test_results_windows('nosetests_forward.xml', 'nosetests_gpu_forward_windows_python3_gpu_mkldnn.xml') + utils.collect_test_results_windows('nosetests_operator.xml', 'nosetests_gpu_operator_windows_python3_gpu_mkldnn.xml') } } } } }, 'Onnx CPU': { - node('mxnetlinux-cpu') { + node(NODE_LINUX_CPU) { ws('workspace/it-onnx-cpu') { timeout(time: max_time, unit: 'MINUTES') { - init_git() - unpack_lib('cpu') - docker_run('ubuntu_cpu', 'integrationtest_ubuntu_cpu_onnx', false) - publish_test_coverage() + utils.init_git() + utils.unpack_lib('cpu', mx_lib) + utils.docker_run('ubuntu_cpu', 'integrationtest_ubuntu_cpu_onnx', false) + utils.publish_test_coverage() } } } }, 'Python GPU': { - node('mxnetlinux-gpu') { + node(NODE_LINUX_GPU) { ws('workspace/it-python-gpu') { timeout(time: max_time, unit: 'MINUTES') { - init_git() - unpack_lib('gpu') - docker_run('ubuntu_gpu', 'integrationtest_ubuntu_gpu_python', true) - publish_test_coverage() + utils.init_git() + utils.unpack_lib('gpu', mx_lib) + utils.docker_run('ubuntu_gpu', 'integrationtest_ubuntu_gpu_python', true) + utils.publish_test_coverage() } } } }, // Disabled due to: https://github.com/apache/incubator-mxnet/issues/11407 // 'Caffe GPU': { - // node('mxnetlinux-gpu') { + // node(NODE_LINUX_GPU) { // ws('workspace/it-caffe') { // timeout(time: max_time, unit: 'MINUTES') { - // init_git() - // unpack_lib('gpu') - // docker_run('ubuntu_gpu', 'integrationtest_ubuntu_gpu_caffe', true) - // publish_test_coverage() + // utils.init_git() + // utils.unpack_lib('gpu', mx_lib) + // utils.docker_run('ubuntu_gpu', 'integrationtest_ubuntu_gpu_caffe', true) + // 
utils.publish_test_coverage() // } // } // } // }, 'cpp-package GPU': { - node('mxnetlinux-gpu') { + node(NODE_LINUX_GPU) { ws('workspace/it-cpp-package') { timeout(time: max_time, unit: 'MINUTES') { - init_git() - unpack_lib('gpu') + utils.init_git() + utils.unpack_lib('gpu', mx_lib) unstash 'cpp_lenet' unstash 'cpp_alexnet' unstash 'cpp_googlenet' @@ -928,20 +872,20 @@ try { unstash 'cpp_mlp_gpu' unstash 'cpp_test_score' unstash 'cpp_test_optimizer' - docker_run('ubuntu_gpu', 'integrationtest_ubuntu_gpu_cpp_package', true) - publish_test_coverage() + utils.docker_run('ubuntu_gpu', 'integrationtest_ubuntu_gpu_cpp_package', true) + utils.publish_test_coverage() } } } }, 'dist-kvstore tests GPU': { - node('mxnetlinux-gpu') { + node(NODE_LINUX_GPU) { ws('workspace/it-dist-kvstore') { timeout(time: max_time, unit: 'MINUTES') { - init_git() - unpack_lib('gpu') - docker_run('ubuntu_gpu', 'integrationtest_ubuntu_gpu_dist_kvstore', true) - publish_test_coverage() + utils.init_git() + utils.unpack_lib('gpu', mx_lib) + utils.docker_run('ubuntu_gpu', 'integrationtest_ubuntu_gpu_dist_kvstore', true) + utils.publish_test_coverage() } } } @@ -951,25 +895,25 @@ try { * https://github.com/apache/incubator-mxnet/issues/11801 'dist-kvstore tests CPU': { - node('mxnetlinux-cpu') { + node(NODE_LINUX_CPU) { ws('workspace/it-dist-kvstore') { timeout(time: max_time, unit: 'MINUTES') { - init_git() - unpack_lib('cpu') - docker_run('ubuntu_cpu', 'integrationtest_ubuntu_cpu_dist_kvstore', false) - publish_test_coverage() + utils.init_git() + utils.unpack_lib('cpu', mx_lib) + utils.docker_run('ubuntu_cpu', 'integrationtest_ubuntu_cpu_dist_kvstore', false) + utils.publish_test_coverage() } } } }, */ 'Scala: GPU': { - node('mxnetlinux-gpu') { + node(NODE_LINUX_GPU) { ws('workspace/ut-scala-gpu') { timeout(time: max_time, unit: 'MINUTES') { - init_git() - unpack_lib('gpu', mx_dist_lib) - docker_run('ubuntu_gpu', 'integrationtest_ubuntu_gpu_scala', true) - publish_test_coverage() + utils.init_git() + utils.unpack_lib('gpu', mx_dist_lib) + utils.docker_run('ubuntu_gpu', 'integrationtest_ubuntu_gpu_scala', true) + utils.publish_test_coverage() } } } @@ -977,34 +921,22 @@ try { } stage('Deploy') { - node('mxnetlinux-cpu') { + node(NODE_LINUX_CPU) { ws('workspace/docs') { timeout(time: max_time, unit: 'MINUTES') { - init_git() - docker_run('ubuntu_cpu', 'deploy_docs', false) - sh "tests/ci_build/deploy/ci_deploy_doc.sh ${env.BRANCH_NAME} ${env.BUILD_NUMBER}" + utils.init_git() + utils.docker_run('ubuntu_cpu', 'deploy_docs', false) + sh "ci/other/ci_deploy_doc.sh ${env.BRANCH_NAME} ${env.BUILD_NUMBER}" } } } } - - // set build status to success at the end - currentBuild.result = "SUCCESS" -} catch (caughtError) { - node("mxnetlinux-cpu") { - sh "echo caught ${caughtError}" - err = caughtError - currentBuild.result = "FAILURE" - } -} finally { - node("mxnetlinux-cpu") { - // Only send email if master or release branches failed - if (currentBuild.result == "FAILURE" && (env.BRANCH_NAME == "master" || env.BRANCH_NAME.startsWith("v"))) { - emailext body: 'Build for MXNet branch ${BRANCH_NAME} has broken. 
Please view the build at ${BUILD_URL}', replyTo: '${EMAIL}', subject: '[BUILD FAILED] Branch ${BRANCH_NAME} build ${BUILD_NUMBER}', to: '${EMAIL}' - } - // Remember to rethrow so the build is marked as failing - if (err) { - throw err - } +} +, +failure_handler: { + // Only send email if master or release branches failed + if (currentBuild.result == "FAILURE" && (env.BRANCH_NAME == "master" || env.BRANCH_NAME.startsWith("v"))) { + emailext body: 'Build for MXNet branch ${BRANCH_NAME} has broken. Please view the build at ${BUILD_URL}', replyTo: '${EMAIL}', subject: '[BUILD FAILED] Branch ${BRANCH_NAME} build ${BUILD_NUMBER}', to: '${EMAIL}' } } +) diff --git a/Makefile b/Makefile index 18661aa69847..7aa7867f7c18 100644 --- a/Makefile +++ b/Makefile @@ -91,6 +91,14 @@ else endif CFLAGS += -I$(TPARTYDIR)/mshadow/ -I$(TPARTYDIR)/dmlc-core/include -fPIC -I$(NNVM_PATH)/include -I$(DLPACK_PATH)/include -I$(TPARTYDIR)/tvm/include -Iinclude $(MSHADOW_CFLAGS) LDFLAGS = -pthread $(MSHADOW_LDFLAGS) $(DMLC_LDFLAGS) + + +ifeq ($(USE_TENSORRT), 1) + CFLAGS += -I$(ROOTDIR) -I$(TPARTYDIR) -DONNX_NAMESPACE=$(ONNX_NAMESPACE) -DMXNET_USE_TENSORRT=1 + LDFLAGS += -lprotobuf -pthread -lonnx -lonnx_proto -lnvonnxparser -lnvonnxparser_runtime -lnvinfer -lnvinfer_plugin +endif +# -L/usr/local/lib + ifeq ($(DEBUG), 1) NVCCFLAGS += -std=c++11 -Xcompiler -D_FORCE_INLINES -g -G -O0 -ccbin $(CXX) $(MSHADOW_NVCCFLAGS) else @@ -526,7 +534,7 @@ cpplint: --exclude_path src/operator/contrib/ctc_include pylint: - pylint --rcfile=$(ROOTDIR)/tests/ci_build/pylintrc --ignore-patterns=".*\.so$$,.*\.dll$$,.*\.dylib$$" python/mxnet tools/caffe_converter/*.py + pylint --rcfile=$(ROOTDIR)/ci/other/pylintrc --ignore-patterns=".*\.so$$,.*\.dll$$,.*\.dylib$$" python/mxnet tools/caffe_converter/*.py doc: docs diff --git a/amalgamation/amalgamation.py b/amalgamation/amalgamation.py index 52d775b76925..a3c28f7118e9 100644 --- a/amalgamation/amalgamation.py +++ b/amalgamation/amalgamation.py @@ -23,13 +23,12 @@ import platform blacklist = [ - 'Windows.h', 'cublas_v2.h', 'cuda/tensor_gpu-inl.cuh', - 'cuda_runtime.h', 'cudnn.h', 'cudnn_lrn-inl.h', 'curand.h', 'curand_kernel.h', - 'glog/logging.h', 'io/azure_filesys.h', 'io/hdfs_filesys.h', 'io/s3_filesys.h', - 'kvstore_dist.h', 'mach/clock.h', 'mach/mach.h', - 'malloc.h', 'mkl.h', 'mkl_cblas.h', 'mkl_vsl.h', 'mkl_vsl_functions.h', - 'nvml.h', 'opencv2/opencv.hpp', 'sys/stat.h', 'sys/types.h', 'cuda.h', 'cuda_fp16.h', - 'omp.h', 'execinfo.h', 'packet/sse-inl.h', 'emmintrin.h', 'thrust/device_vector.h', + 'Windows.h', 'cublas_v2.h', 'cuda/tensor_gpu-inl.cuh', 'cuda_runtime.h', 'cudnn.h', + 'cudnn_lrn-inl.h', 'curand.h', 'curand_kernel.h', 'glog/logging.h', 'io/azure_filesys.h', + 'io/hdfs_filesys.h', 'io/s3_filesys.h', 'kvstore_dist.h', 'mach/clock.h', 'mach/mach.h', + 'malloc.h', 'mkl.h', 'mkl_cblas.h', 'mkl_vsl.h', 'mkl_vsl_functions.h', 'NvInfer.h', 'nvml.h', + 'opencv2/opencv.hpp', 'sys/stat.h', 'sys/types.h', 'cuda.h', 'cuda_fp16.h', 'omp.h', + 'onnx/onnx.pb.h', 'execinfo.h', 'packet/sse-inl.h', 'emmintrin.h', 'thrust/device_vector.h', 'cusolverDn.h', 'internal/concurrentqueue_internal_debug.h', 'relacy/relacy_std.hpp', 'relacy_shims.h', 'ittnotify.h', 'shared_mutex' ] @@ -150,6 +149,7 @@ def expand(x, pending, stage): h not in sysheaders and 'mkl' not in h and 'nnpack' not in h and + 'tensorrt' not in h and not h.endswith('.cuh')): sysheaders.append(h) else: expand.treeDepth += 1 diff --git a/amalgamation/python/mxnet_predict.py b/amalgamation/python/mxnet_predict.py index 
627f375e1411..ca72e9affaa1 100644 --- a/amalgamation/python/mxnet_predict.py +++ b/amalgamation/python/mxnet_predict.py @@ -26,6 +26,7 @@ import os import sys import ctypes +import logging import numpy as np __all__ = ["Predictor", "load_ndarray_file"] @@ -51,15 +52,25 @@ def c_array(ctype, values): def _find_lib_path(): """Find mxnet library.""" curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) - api_path = os.path.join(curr_path, '../../lib/') - dll_path = [curr_path, api_path] - dll_path = [os.path.join(p, 'libmxnet.so') for p in dll_path] + \ - [os.path.join(p, 'libmxnet_predict.so') for p in dll_path] - lib_path = [p for p in dll_path if os.path.exists(p) and os.path.isfile(p)] - if len(lib_path) == 0: - raise RuntimeError('Cannot find the files.\n' + - 'List of candidates:\n' + str('\n'.join(dll_path))) - return lib_path + amalgamation_lib_path = os.path.join(curr_path, '../../lib/libmxnet_predict.so') + if os.path.exists(amalgamation_lib_path) and os.path.isfile(amalgamation_lib_path): + lib_path = [amalgamation_lib_path] + return lib_path + else: + logging.info('Cannot find libmxnet_predict.so. Falling back to searching for the MXNet library via libinfo.py.') + try: + from mxnet.libinfo import find_lib_path + lib_path = find_lib_path() + return lib_path + except ImportError: + libinfo_path = os.path.join(curr_path, '../../python/mxnet/libinfo.py') + if os.path.exists(libinfo_path) and os.path.isfile(libinfo_path): + libinfo = {'__file__': libinfo_path} + exec(compile(open(libinfo_path, "rb").read(), libinfo_path, 'exec'), libinfo, libinfo) + lib_path = libinfo['find_lib_path']() + return lib_path + else: + raise RuntimeError('Cannot find libinfo.py at %s.' % libinfo_path) def _load_lib(): @@ -159,6 +170,39 @@ def forward(self, **kwargs): mx_uint(v.size))) _check_call(_LIB.MXPredForward(self.handle)) + def reshape(self, input_shapes): + """Change the input shape of the predictor. + + Parameters + ---------- + input_shapes : dict of str to tuple + The new shape of input data. + + Examples + -------- + >>> predictor.reshape({'data':data_shape_tuple}) + """ + indptr = [0] + sdata = [] + keys = [] + for k, v in input_shapes.items(): + if not isinstance(v, tuple): + raise ValueError("Expect input_shapes to be dict str->tuple") + keys.append(c_str(k)) + sdata.extend(v) + indptr.append(len(sdata)) + + new_handle = PredictorHandle() + _check_call(_LIB.MXPredReshape( + mx_uint(len(indptr) - 1), + c_array(ctypes.c_char_p, keys), + c_array(mx_uint, indptr), + c_array(mx_uint, sdata), + self.handle, + ctypes.byref(new_handle))) + _check_call(_LIB.MXPredFree(self.handle)) + self.handle = new_handle + def get_output(self, index): """Get the index-th output. diff --git a/ci/Jenkinsfile_docker_cache b/ci/Jenkinsfile_docker_cache index 550425bb932a..77f0122f944e 100644 --- a/ci/Jenkinsfile_docker_cache +++ b/ci/Jenkinsfile_docker_cache @@ -22,60 +22,33 @@ // timeout in minutes total_timeout = 300 -git_timeout = 15 -// assign any caught errors here -err = null -// initialize source codes -def init_git() { - deleteDir() - retry(5) { - try { - // Make sure wait long enough for api.github.com request quota.
Important: Don't increase the amount of - // retries as this will increase the amount of requests and worsen the throttling - timeout(time: git_timeout, unit: 'MINUTES') { - checkout scm - sh 'git submodule update --init --recursive' - sh 'git clean -x -d -f' - } - } catch (exc) { - deleteDir() - error "Failed to fetch source codes with ${exc}" - sleep 2 - } - } +node('restricted-mxnetlinux-cpu') { + // Loading the utilities requires a node context unfortunately + checkout scm + utils = load('ci/Jenkinsfile_utils.groovy') } +utils.assign_node_labels(linux_cpu: 'restricted-mxnetlinux-cpu', linux_gpu: 'restricted-mxnetlinux-gpu', linux_gpu_p3: 'restricted-mxnetlinux-gpu-p3', windows_cpu: 'restricted-mxnetwindows-cpu', windows_gpu: 'restricted-mxnetwindows-gpu') - -try { +utils.main_wrapper( +core_logic: { stage("Docker cache build & publish") { - node('restricted-mxnetlinux-cpu') { + node(NODE_LINUX_CPU) { ws('workspace/docker_cache') { timeout(time: total_timeout, unit: 'MINUTES') { - init_git() + utils.init_git() sh "ci/docker_cache.py --docker-registry ${env.DOCKER_CACHE_REGISTRY}" } } } } - - // set build status to success at the end - currentBuild.result = "SUCCESS" -} catch (caughtError) { - node("restricted-mxnetlinux-cpu") { - sh "echo caught ${caughtError}" - err = caughtError - currentBuild.result = "FAILURE" - } -} finally { - node("restricted-mxnetlinux-cpu") { - // Only send email if master failed - if (currentBuild.result == "FAILURE") { - emailext body: 'Generating the Docker Cache has failed. Please view the build at ${BUILD_URL}', replyTo: '${EMAIL}', subject: '[DOCKER CACHE FAILED] Run ${BUILD_NUMBER}', to: '${EMAIL}' - } - // Remember to rethrow so the build is marked as failing - if (err) { - throw err - } +} +, +failure_handler: +{ + if (currentBuild.result == "FAILURE") { + emailext body: 'Generating the Docker Cache has failed. Please view the build at ${BUILD_URL}', replyTo: '${EMAIL}', subject: '[DOCKER CACHE FAILED] Run ${BUILD_NUMBER}', to: '${EMAIL}' } } +) + diff --git a/ci/Jenkinsfile_utils.groovy b/ci/Jenkinsfile_utils.groovy new file mode 100644 index 000000000000..dfa2519bd0e5 --- /dev/null +++ b/ci/Jenkinsfile_utils.groovy @@ -0,0 +1,153 @@ +// -*- mode: groovy -*- + +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// initialize source codes +def init_git() { + deleteDir() + retry(5) { + try { + // Make sure wait long enough for api.github.com request quota. 
Important: Don't increase the amount of + // retries as this will increase the amount of requests and worsen the throttling + timeout(time: 15, unit: 'MINUTES') { + checkout scm + sh 'git submodule update --init --recursive' + sh 'git clean -xdff' + } + } catch (exc) { + deleteDir() + error "Failed to fetch source codes with ${exc}" + sleep 2 + } + } +} + +def init_git_win() { + deleteDir() + retry(5) { + try { + // Make sure wait long enough for api.github.com request quota. Important: Don't increase the amount of + // retries as this will increase the amount of requests and worsen the throttling + timeout(time: 15, unit: 'MINUTES') { + checkout scm + bat 'git submodule update --init --recursive' + bat 'git clean -xdff' + } + } catch (exc) { + deleteDir() + error "Failed to fetch source codes with ${exc}" + sleep 2 + } + } +} + +// pack libraries for later use +def pack_lib(name, libs) { + sh """ +echo "Packing ${libs} into ${name}" +echo ${libs} | sed -e 's/,/ /g' | xargs md5sum +""" + stash includes: libs, name: name +} + +// unpack libraries saved before +def unpack_lib(name, libs) { + unstash name + sh """ +echo "Unpacked ${libs} from ${name}" +echo ${libs} | sed -e 's/,/ /g' | xargs md5sum +""" +} + +def publish_test_coverage() { + // Fall back to our own copy of the bash helper if it failed to download the public version + sh '(curl --retry 10 -s https://codecov.io/bash | bash -s -) || (curl --retry 10 -s https://s3-us-west-2.amazonaws.com/mxnet-ci-prod-slave-data/codecov-bash.txt | bash -s -)' +} + +def collect_test_results_unix(original_file_name, new_file_name) { + if (fileExists(original_file_name)) { + // Rename file to make it distinguishable. Unfortunately, it's not possible to get STAGE_NAME in a parallel stage + // Thus, we have to pick a name manually and rename the files so that they can be stored separately. + sh 'cp ' + original_file_name + ' ' + new_file_name + archiveArtifacts artifacts: new_file_name + } +} + +def collect_test_results_windows(original_file_name, new_file_name) { + // Rename file to make it distinguishable. Unfortunately, it's not possible to get STAGE_NAME in a parallel stage + // Thus, we have to pick a name manually and rename the files so that they can be stored separately. + if (fileExists(original_file_name)) { + bat 'xcopy ' + original_file_name + ' ' + new_file_name + '*' + archiveArtifacts artifacts: new_file_name + } +} + + +def docker_run(platform, function_name, use_nvidia, shared_mem = '500m') { + def command = "ci/build.py --docker-registry ${env.DOCKER_CACHE_REGISTRY} %USE_NVIDIA% --platform %PLATFORM% --docker-build-retries 3 --shm-size %SHARED_MEM% /work/runtime_functions.sh %FUNCTION_NAME%" + command = command.replaceAll('%USE_NVIDIA%', use_nvidia ? 
'--nvidiadocker' : '') + command = command.replaceAll('%PLATFORM%', platform) + command = command.replaceAll('%FUNCTION_NAME%', function_name) + command = command.replaceAll('%SHARED_MEM%', shared_mem) + + sh command +} + + + +def assign_node_labels(args) { + NODE_LINUX_CPU = args.linux_cpu + NODE_LINUX_GPU = args.linux_gpu + NODE_LINUX_GPU_P3 = args.linux_gpu_p3 + NODE_WINDOWS_CPU = args.windows_cpu + NODE_WINDOWS_GPU = args.windows_gpu +} + +def main_wrapper(args) { + // Main Jenkinsfile pipeline wrapper handler that allows to wrap core logic into a format + // that supports proper failure handling + // args: + // - core_logic: Jenkins pipeline containing core execution logic + // - failure_handler: Failure handler + + // assign any caught errors here + err = null + try { + args['core_logic']() + + // set build status to success at the end + currentBuild.result = "SUCCESS" + } catch (caughtError) { + node(NODE_LINUX_CPU) { + sh "echo caught ${caughtError}" + err = caughtError + currentBuild.result = "FAILURE" + } + } finally { + node(NODE_LINUX_CPU) { + // Call failure handler + args['failure_handler']() + + // Remember to rethrow so the build is marked as failing + if (err) { + throw err + } + } + } +} +return this diff --git a/ci/build.py b/ci/build.py index a9d6a63537f2..0a1ad4cf5751 100755 --- a/ci/build.py +++ b/ci/build.py @@ -43,6 +43,43 @@ CCACHE_MAXSIZE = '500G' + + +def retry(ExceptionToCheck, tries=4, delay_s=1, backoff=2): + """Retry calling the decorated function using an exponential backoff. + + http://www.saltycrane.com/blog/2009/11/trying-out-retry-decorator-python/ + original from: http://wiki.python.org/moin/PythonDecoratorLibrary#Retry + + :param ExceptionToCheck: the exception to check. may be a tuple of + exceptions to check + :type ExceptionToCheck: Exception or tuple + :param tries: number of times to try (not retry) before giving up + :type tries: int + :param delay_s: initial delay between retries in seconds + :type delay_s: int + :param backoff: backoff multiplier e.g. value of 2 will double the delay + each retry + :type backoff: int + """ + import time + from functools import wraps + def decorated_retry(f): + @wraps(f) + def f_retry(*args, **kwargs): + mtries, mdelay = tries, delay_s + while mtries > 1: + try: + return f(*args, **kwargs) + except ExceptionToCheck as e: + logging.warning("Exception: %s, Retrying in %d seconds...", str(e), mdelay) + time.sleep(mdelay) + mtries -= 1 + mdelay *= backoff + return f(*args, **kwargs) + return f_retry # true decorator + return decorated_retry + def under_ci() -> bool: """:return: True if we run in Jenkins.""" return 'JOB_NAME' in os.environ @@ -77,9 +114,8 @@ def build_docker(platform: str, docker_binary: str, registry: str, num_retries: :param num_retries: Number of retries to build the docker image :return: Id of the top level image """ - tag = get_docker_tag(platform=platform, registry=registry) - logging.info("Building container tagged '%s' with %s", tag, docker_binary) + logging.info("Building docker container tagged '%s' with %s", tag, docker_binary) # # We add a user with the same group as the executing non-root user so files created in the # container match permissions of the local user. Same for the group. @@ -91,40 +127,24 @@ def build_docker(platform: str, docker_binary: str, registry: str, num_retries: # docker pull see: docker_cache.load_docker_cache # # This doesn't work with multi head docker files. 
- # - - for i in range(num_retries): - logging.info('%d out of %d tries to build the docker image.', i + 1, num_retries) - - cmd = [docker_binary, "build", - "-f", get_dockerfile(platform), - "--build-arg", "USER_ID={}".format(os.getuid()), - "--build-arg", "GROUP_ID={}".format(os.getgid()), - "--cache-from", tag, - "-t", tag, - "docker"] + # + cmd = [docker_binary, "build", + "-f", get_dockerfile(platform), + "--build-arg", "USER_ID={}".format(os.getuid()), + "--build-arg", "GROUP_ID={}".format(os.getgid()), + "--cache-from", tag, + "-t", tag, + "docker"] + + @retry(subprocess.CalledProcessError, tries=num_retries) + def run_cmd(): logging.info("Running command: '%s'", ' '.join(cmd)) - try: - check_call(cmd) - # Docker build was successful. Call break to break out of the retry mechanism - break - except subprocess.CalledProcessError as e: - saved_exception = e - logging.error('Failed to build docker image') - # Building the docker image failed. Call continue to trigger the retry mechanism - continue - else: - # Num retries exceeded - logging.exception('Exception during build of docker image', saved_exception) - logging.fatal('Failed to build the docker image, aborting...') - sys.exit(1) + check_call(cmd) + run_cmd() # Get image id by reading the tag. It's guaranteed (except race condition) that the tag exists. Otherwise, the # check_call would have failed - image_id = _get_local_image_id(docker_binary=docker_binary, docker_tag=tag) - if not image_id: - raise FileNotFoundError('Unable to find docker image id matching with {}'.format(tag)) - return image_id + return _get_local_image_id(docker_binary=docker_binary, docker_tag=tag) def _get_local_image_id(docker_binary, docker_tag): @@ -136,6 +156,8 @@ def _get_local_image_id(docker_binary, docker_tag): cmd = [docker_binary, "images", "-q", docker_tag] image_id_b = subprocess.check_output(cmd) image_id = image_id_b.decode('utf-8').strip() + if not image_id: + raise RuntimeError('Unable to find docker image id matching with tag {}'.format(docker_tag)) return image_id @@ -186,7 +208,7 @@ def container_run(platform: str, '-e', "CCACHE_LOGFILE=/tmp/ccache.log", # a container-scoped log, useful for ccache verification.
tag] runlist.extend(command) - cmd = '\\\n\t'.join(runlist) + cmd = ' \\\n\t'.join(runlist) ret = 0 if not dry_run and not interactive: logging.info("Running %s in container %s", command, tag) @@ -199,14 +221,14 @@ def container_run(platform: str, # -ti can't be after the tag, as is interpreted as a command so hook it up after the -u argument idx = into_cmd.index('-u') + 2 into_cmd[idx:idx] = ['-ti'] - cmd = '\\\n\t'.join(into_cmd) + cmd = ' \\\n\t'.join(into_cmd) logging.info("Executing:\n%s\n", cmd) docker_run_cmd = ' '.join(into_cmd) ret = call(into_cmd) if not dry_run and not interactive and ret != 0: logging.error("Running of command in container failed (%s):\n%s\n", ret, cmd) - logging.error("You can get into the container by adding the -i option") + logging.error("You can get into the container by adding the -i option to this script") raise subprocess.CalledProcessError(ret, cmd) return docker_run_cmd @@ -303,7 +325,6 @@ def use_cache(): command = list(chain(*args.command)) docker_binary = get_docker_binary(args.nvidiadocker) shared_memory_size = args.shared_memory_size - num_docker_build_retires = args.docker_build_retries if args.list: list_platforms() @@ -312,7 +333,7 @@ def use_cache(): tag = get_docker_tag(platform=platform, registry=args.docker_registry) if use_cache(): load_docker_cache(tag=tag, docker_registry=args.docker_registry) - build_docker(platform, docker_binary, registry=args.docker_registry, num_retries=num_docker_build_retires) + build_docker(platform, docker_binary, registry=args.docker_registry, num_retries=args.docker_build_retries) if args.build_only: logging.warning("Container was just built. Exiting due to build-only.") return 0 @@ -346,7 +367,7 @@ def use_cache(): tag = get_docker_tag(platform=platform, registry=args.docker_registry) if use_cache(): load_docker_cache(tag=tag, docker_registry=args.docker_registry) - build_docker(platform, docker_binary, args.docker_registry, num_retries=num_docker_build_retires) + build_docker(platform, docker_binary, args.docker_registry, num_retries=args.docker_build_retries) if args.build_only: continue build_platform = "build_{}".format(platform) diff --git a/ci/docker/Dockerfile.build.android_armv7 b/ci/docker/Dockerfile.build.android_armv7 old mode 100755 new mode 100644 diff --git a/ci/docker/Dockerfile.build.android_armv8 b/ci/docker/Dockerfile.build.android_armv8 old mode 100755 new mode 100644 diff --git a/ci/docker/Dockerfile.build.armv6 b/ci/docker/Dockerfile.build.armv6 old mode 100755 new mode 100644 diff --git a/ci/docker/Dockerfile.build.armv7 b/ci/docker/Dockerfile.build.armv7 old mode 100755 new mode 100644 diff --git a/ci/docker/Dockerfile.build.armv8 b/ci/docker/Dockerfile.build.armv8 old mode 100755 new mode 100644 diff --git a/ci/docker/Dockerfile.build.centos7_cpu b/ci/docker/Dockerfile.build.centos7_cpu old mode 100755 new mode 100644 diff --git a/ci/docker/Dockerfile.build.centos7_gpu b/ci/docker/Dockerfile.build.centos7_gpu old mode 100755 new mode 100644 diff --git a/ci/docker/Dockerfile.build.jetson b/ci/docker/Dockerfile.build.jetson old mode 100755 new mode 100644 diff --git a/ci/docker/Dockerfile.build.ubuntu_base_cpu b/ci/docker/Dockerfile.build.ubuntu_base_cpu old mode 100755 new mode 100644 diff --git a/ci/docker/Dockerfile.build.ubuntu_base_gpu b/ci/docker/Dockerfile.build.ubuntu_base_gpu old mode 100755 new mode 100644 diff --git a/ci/docker/Dockerfile.build.ubuntu_blc b/ci/docker/Dockerfile.build.ubuntu_blc old mode 100755 new mode 100644 index 294740ce1392..208cba2111f6 --- 
a/ci/docker/Dockerfile.build.ubuntu_blc +++ b/ci/docker/Dockerfile.build.ubuntu_blc @@ -24,8 +24,13 @@ WORKDIR /work/deps COPY install/ubuntu_core.sh /work/ RUN /work/ubuntu_core.sh -COPY install/ubuntu_python.sh /work/ -RUN /work/ubuntu_python.sh + +COPY install/ubuntu_python2.sh /work/ +RUN /work/ubuntu_python2.sh + +COPY install/ubuntu_python3.sh /work/ +RUN /work/ubuntu_python3.sh + COPY install/ubuntu_npm_blc.sh /work/ RUN /work/ubuntu_npm_blc.sh diff --git a/ci/docker/Dockerfile.build.ubuntu_build_cuda b/ci/docker/Dockerfile.build.ubuntu_build_cuda old mode 100755 new mode 100644 index 9ed0cbbe3e52..19e9265f88d0 --- a/ci/docker/Dockerfile.build.ubuntu_build_cuda +++ b/ci/docker/Dockerfile.build.ubuntu_build_cuda @@ -27,20 +27,30 @@ WORKDIR /work/deps COPY install/ubuntu_core.sh /work/ RUN /work/ubuntu_core.sh + COPY install/deb_ubuntu_ccache.sh /work/ RUN /work/deb_ubuntu_ccache.sh -COPY install/ubuntu_python.sh /work/ -RUN /work/ubuntu_python.sh + +COPY install/ubuntu_python2.sh /work/ +RUN /work/ubuntu_python2.sh + +COPY install/ubuntu_python3.sh /work/ +RUN /work/ubuntu_python3.sh + COPY install/ubuntu_scala.sh /work/ COPY install/sbt.gpg /work/ RUN /work/ubuntu_scala.sh + COPY install/ubuntu_r.sh /work/ COPY install/r.gpg /work/ RUN /work/ubuntu_r.sh + COPY install/ubuntu_perl.sh /work/ RUN /work/ubuntu_perl.sh + COPY install/ubuntu_clang.sh /work/ RUN /work/ubuntu_clang.sh + COPY install/ubuntu_mklml.sh /work/ RUN /work/ubuntu_mklml.sh diff --git a/ci/docker/Dockerfile.build.ubuntu_cpu b/ci/docker/Dockerfile.build.ubuntu_cpu old mode 100755 new mode 100644 index 58a8e9a50d7d..08fb04df03e2 --- a/ci/docker/Dockerfile.build.ubuntu_cpu +++ b/ci/docker/Dockerfile.build.ubuntu_cpu @@ -28,8 +28,11 @@ RUN /work/ubuntu_core.sh COPY install/deb_ubuntu_ccache.sh /work/ RUN /work/deb_ubuntu_ccache.sh -COPY install/ubuntu_python.sh /work/ -RUN /work/ubuntu_python.sh +COPY install/ubuntu_python2.sh /work/ +RUN /work/ubuntu_python2.sh + +COPY install/ubuntu_python3.sh /work/ +RUN /work/ubuntu_python3.sh COPY install/ubuntu_scala.sh /work/ COPY install/sbt.gpg /work/ @@ -58,6 +61,7 @@ COPY install/ubuntu_onnx.sh /work/ RUN /work/ubuntu_onnx.sh COPY install/ubuntu_docs.sh /work/ +COPY install/docs_requirements /work/ RUN /work/ubuntu_docs.sh ARG USER_ID=0 diff --git a/ci/docker/Dockerfile.build.ubuntu_gpu b/ci/docker/Dockerfile.build.ubuntu_gpu old mode 100755 new mode 100644 index de38948e623b..d99dafb0bd0c --- a/ci/docker/Dockerfile.build.ubuntu_gpu +++ b/ci/docker/Dockerfile.build.ubuntu_gpu @@ -28,8 +28,11 @@ RUN /work/ubuntu_core.sh COPY install/deb_ubuntu_ccache.sh /work/ RUN /work/deb_ubuntu_ccache.sh -COPY install/ubuntu_python.sh /work/ -RUN /work/ubuntu_python.sh +COPY install/ubuntu_python2.sh /work/ +RUN /work/ubuntu_python2.sh + +COPY install/ubuntu_python3.sh /work/ +RUN /work/ubuntu_python3.sh COPY install/ubuntu_scala.sh /work/ COPY install/sbt.gpg /work/ @@ -61,6 +64,7 @@ COPY install/ubuntu_onnx.sh /work/ RUN /work/ubuntu_onnx.sh COPY install/ubuntu_docs.sh /work/ +COPY install/docs_requirements /work/ RUN /work/ubuntu_docs.sh COPY install/ubuntu_tutorials.sh /work/ diff --git a/ci/docker/Dockerfile.build.ubuntu_gpu_tensorrt b/ci/docker/Dockerfile.build.ubuntu_gpu_tensorrt new file mode 100644 index 000000000000..3f0bbc666f25 --- /dev/null +++ b/ci/docker/Dockerfile.build.ubuntu_gpu_tensorrt @@ -0,0 +1,47 @@ +# -*- mode: dockerfile -*- +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# Dockerfile to run MXNet on Ubuntu 16.04 for GPU with TensorRT + +FROM nvidia/cuda:9.0-cudnn7-devel + +WORKDIR /work/deps + +COPY install/ubuntu_core.sh /work/ +RUN /work/ubuntu_core.sh + +COPY install/deb_ubuntu_ccache.sh /work/ +RUN /work/deb_ubuntu_ccache.sh + +COPY install/ubuntu_python2.sh /work/ +RUN /work/ubuntu_python2.sh + +COPY install/ubuntu_python3.sh /work/ +RUN /work/ubuntu_python3.sh + +COPY install/tensorrt.sh /work +RUN /work/tensorrt.sh + +ARG USER_ID=0 +COPY install/ubuntu_adduser.sh /work/ +RUN /work/ubuntu_adduser.sh + +COPY runtime_functions.sh /work/ + +WORKDIR /work/mxnet +ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib diff --git a/ci/docker/Dockerfile.build.ubuntu_nightly_cpu b/ci/docker/Dockerfile.build.ubuntu_nightly_cpu old mode 100755 new mode 100644 index 58ff33e3013b..834710c4ceb6 --- a/ci/docker/Dockerfile.build.ubuntu_nightly_cpu +++ b/ci/docker/Dockerfile.build.ubuntu_nightly_cpu @@ -28,8 +28,11 @@ RUN /work/ubuntu_core.sh COPY install/deb_ubuntu_ccache.sh /work/ RUN /work/deb_ubuntu_ccache.sh -COPY install/ubuntu_python.sh /work/ -RUN /work/ubuntu_python.sh +COPY install/ubuntu_python2.sh /work/ +RUN /work/ubuntu_python2.sh + +COPY install/ubuntu_python3.sh /work/ +RUN /work/ubuntu_python3.sh COPY install/ubuntu_scala.sh /work/ COPY install/sbt.gpg /work/ @@ -55,6 +58,7 @@ COPY install/ubuntu_onnx.sh /work/ RUN /work/ubuntu_onnx.sh COPY install/ubuntu_docs.sh /work/ +COPY install/docs_requirements /work/ RUN /work/ubuntu_docs.sh COPY install/ubuntu_nightly_tests.sh /work/ diff --git a/ci/docker/Dockerfile.build.ubuntu_nightly_gpu b/ci/docker/Dockerfile.build.ubuntu_nightly_gpu old mode 100755 new mode 100644 index 017bade1775d..fb34307063a0 --- a/ci/docker/Dockerfile.build.ubuntu_nightly_gpu +++ b/ci/docker/Dockerfile.build.ubuntu_nightly_gpu @@ -28,8 +28,11 @@ RUN /work/ubuntu_core.sh COPY install/deb_ubuntu_ccache.sh /work/ RUN /work/deb_ubuntu_ccache.sh -COPY install/ubuntu_python.sh /work/ -RUN /work/ubuntu_python.sh +COPY install/ubuntu_python2.sh /work/ +RUN /work/ubuntu_python2.sh + +COPY install/ubuntu_python3.sh /work/ +RUN /work/ubuntu_python3.sh COPY install/ubuntu_scala.sh /work/ COPY install/sbt.gpg /work/ @@ -61,6 +64,7 @@ COPY install/ubuntu_onnx.sh /work/ RUN /work/ubuntu_onnx.sh COPY install/ubuntu_docs.sh /work/ +COPY install/docs_requirements /work/ RUN /work/ubuntu_docs.sh COPY install/ubuntu_tutorials.sh /work/ diff --git a/ci/docker/Dockerfile.build.ubuntu_rat b/ci/docker/Dockerfile.build.ubuntu_rat old mode 100755 new mode 100644 diff --git a/docs/build_version_doc/requirements.txt b/ci/docker/install/docs_requirements similarity index 51% rename from docs/build_version_doc/requirements.txt rename to ci/docker/install/docs_requirements index 4f3f4d065c9b..7407223b3eed 100644 --- a/docs/build_version_doc/requirements.txt +++
b/ci/docker/install/docs_requirements @@ -1,15 +1,16 @@ -beautifulsoup4 -breathe +beautifulsoup4==4.6.3 +breathe==4.10.0 cpplint==1.3.0 CommonMark==0.5.4 h5py==2.8.0rc1 -mock==1.0.1 -nose -nose-timer +mock==2.0.0 +nose==1.3.7 +nose-timer==0.7.3 numpy<1.15.0,>=1.8.2 pylint==1.8.3 -pypandoc -recommonmark==0.4.0 +pypandoc==1.4 +recommonmark==0.4.0 requests<2.19.0,>=2.18.4 scipy==1.0.1 +six==1.11.0 sphinx==1.5.6 diff --git a/ci/docker/install/tensorrt.sh b/ci/docker/install/tensorrt.sh new file mode 100755 index 000000000000..a6258d94f62f --- /dev/null +++ b/ci/docker/install/tensorrt.sh @@ -0,0 +1,45 @@ +#!/bin/bash + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Install gluoncv since we're testing Gluon models as well +pip2 install gluoncv==0.2.0 +pip3 install gluoncv==0.2.0 + +# Install Protobuf +# Install protoc 3.5 and build protobuf here (for onnx and onnx-tensorrt) +pushd . +cd .. +apt-get update +apt-get install -y automake libtool +git clone --recursive -b 3.5.1.1 https://github.com/google/protobuf.git +cd protobuf +./autogen.sh +./configure +make -j$(nproc) +make install +ldconfig +popd + +# Install TensorRT +echo "TensorRT build enabled. Installing TensorRT." +wget -qO tensorrt.deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64/nvinfer-runtime-trt-repo-ubuntu1604-4.0.1-ga-cuda9.0_1-1_amd64.deb +dpkg -i tensorrt.deb +apt-get update +apt-get install -y --allow-downgrades libnvinfer-dev +rm tensorrt.deb diff --git a/ci/docker/install/ubuntu_docs.sh b/ci/docker/install/ubuntu_docs.sh index ee121962ee08..a709b3de7843 100755 --- a/ci/docker/install/ubuntu_docs.sh +++ b/ci/docker/install/ubuntu_docs.sh @@ -27,15 +27,7 @@ apt-get install -y \ doxygen \ pandoc -echo 'Installing python packages...' -pip install --upgrade pip && pip install \ - beautifulsoup4 \ - breathe \ - CommonMark==0.5.4 \ - h5py \ - mock==1.0.1 \ - pypandoc \ - recommonmark==0.4.0 \ - sphinx==1.5.6 +pip3 install -r /work/docs_requirements +pip2 install -r /work/docs_requirements echo 'Dependency installation complete.' diff --git a/ci/docker/install/ubuntu_nightly_tests.sh b/ci/docker/install/ubuntu_nightly_tests.sh index df56cf5a980b..0e6b437a1d89 100755 --- a/ci/docker/install/ubuntu_nightly_tests.sh +++ b/ci/docker/install/ubuntu_nightly_tests.sh @@ -30,3 +30,8 @@ apt-get -y install time # Install for RAT License Check Nightly Test apt-get install -y subversion maven -y #>/dev/null + +# Packages needed for the Straight Dope Nightly tests. 
+pip2 install pandas +pip3 install pandas + diff --git a/ci/docker/install/ubuntu_python.sh b/ci/docker/install/ubuntu_python2.sh similarity index 84% rename from ci/docker/install/ubuntu_python.sh rename to ci/docker/install/ubuntu_python2.sh index e71cac8a3898..f0526e2d8300 100755 --- a/ci/docker/install/ubuntu_python.sh +++ b/ci/docker/install/ubuntu_python2.sh @@ -22,12 +22,10 @@ set -ex # install libraries for mxnet's python package on ubuntu -apt-get install -y python-dev python3-dev virtualenv +apt-get install -y python-dev virtualenv wget # the version of the pip shipped with ubuntu may be too lower, install a recent version here wget -nv https://bootstrap.pypa.io/get-pip.py -python3 get-pip.py python2 get-pip.py -pip2 install nose cpplint==1.3.0 pylint==1.8.3 'numpy<1.15.0,>=1.8.2' nose-timer 'requests<2.19.0,>=2.18.4' h5py==2.8.0rc1 scipy==1.0.1 boto3 -pip3 install nose cpplint==1.3.0 pylint==1.8.3 'numpy<1.15.0,>=1.8.2' nose-timer 'requests<2.19.0,>=2.18.4' h5py==2.8.0rc1 scipy==1.0.1 boto3 +pip2 install nose cpplint==1.3.0 pylint==1.8.3 'numpy<1.15.0,>=1.8.2' nose-timer 'requests<2.19.0,>=2.18.4' h5py==2.8.0rc1 scipy==1.0.1 boto3 mock diff --git a/ci/docker/install/ubuntu_python3.sh b/ci/docker/install/ubuntu_python3.sh new file mode 100755 index 000000000000..1dad5a7aa207 --- /dev/null +++ b/ci/docker/install/ubuntu_python3.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env bash + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# build and install are separated so changes to build don't invalidate +# the whole docker cache for the image + +set -ex +# install libraries for mxnet's python package on ubuntu +apt-get install -y python3-dev virtualenv wget + +# the version of the pip shipped with ubuntu may be too old, install a recent version here +wget -nv https://bootstrap.pypa.io/get-pip.py +python3 get-pip.py + +pip3 install nose cpplint==1.3.0 pylint==1.8.3 'numpy<1.15.0,>=1.8.2' nose-timer 'requests<2.19.0,>=2.18.4' h5py==2.8.0rc1 scipy==1.0.1 boto3 mock diff --git a/ci/docker/install/ubuntu_rat.sh b/ci/docker/install/ubuntu_rat.sh index 94596ef011f0..b131a0bb5586 100755 --- a/ci/docker/install/ubuntu_rat.sh +++ b/ci/docker/install/ubuntu_rat.sh @@ -24,10 +24,11 @@ apt-get update apt-get install -y subversion maven openjdk-8-jdk openjdk-8-jre echo "download RAT" -svn co http://svn.apache.org/repos/asf/creadur/rat/trunk/ +#svn co http://svn.apache.org/repos/asf/creadur/rat/trunk/ +svn co http://svn.apache.org/repos/asf/creadur/rat/branches/0.12-release/ echo "cd into directory" -cd trunk +cd 0.12-release echo "mvn install" mvn -Dmaven.test.skip=true install diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index 1c861beb916c..e4aac8b18bdd 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -414,6 +414,60 @@ build_ubuntu_gpu() { build_ubuntu_gpu_cuda91_cudnn7 } +build_ubuntu_gpu_tensorrt() { + + set -ex + + build_ccache_wrappers + + # Build ONNX + pushd . + echo "Installing ONNX." + cd 3rdparty/onnx-tensorrt/third_party/onnx + rm -rf build + mkdir -p build + cd build + cmake \ + -DCMAKE_CXX_FLAGS=-I/usr/include/python${PYVER}\ + -DBUILD_SHARED_LIBS=ON ..\ + -G Ninja + ninja -v + export LIBRARY_PATH=`pwd`:`pwd`/onnx/:$LIBRARY_PATH + export CPLUS_INCLUDE_PATH=`pwd`:$CPLUS_INCLUDE_PATH + popd + + # Build ONNX-TensorRT + pushd . + cd 3rdparty/onnx-tensorrt/ + mkdir -p build + cd build + cmake ..
+ make -j$(nproc) + export LIBRARY_PATH=`pwd`:$LIBRARY_PATH + popd + + mkdir -p /work/mxnet/lib/ + cp 3rdparty/onnx-tensorrt/third_party/onnx/build/*.so /work/mxnet/lib/ + cp -L 3rdparty/onnx-tensorrt/build/libnvonnxparser_runtime.so.0 /work/mxnet/lib/ + cp -L 3rdparty/onnx-tensorrt/build/libnvonnxparser.so.0 /work/mxnet/lib/ + + rm -rf build + make \ + DEV=1 \ + USE_BLAS=openblas \ + USE_CUDA=1 \ + USE_CUDA_PATH=/usr/local/cuda \ + USE_CUDNN=1 \ + USE_OPENCV=0 \ + USE_DIST_KVSTORE=0 \ + USE_TENSORRT=1 \ + USE_JEMALLOC=0 \ + USE_GPERFTOOLS=0 \ + ONNX_NAMESPACE=onnx \ + CUDA_ARCH="-gencode arch=compute_70,code=compute_70"\ + -j$(nproc) +} + build_ubuntu_gpu_mkldnn() { set -ex @@ -527,9 +581,7 @@ sanity_check() { unittest_ubuntu_python2_cpu() { set -ex export PYTHONPATH=./python/ - # MXNET_MKLDNN_DEBUG is buggy and produces false positives - # https://github.com/apache/incubator-mxnet/issues/10026 - #export MXNET_MKLDNN_DEBUG=1 # Ignored if not present + export MXNET_MKLDNN_DEBUG=1 export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 nosetests-2.7 $NOSE_COVERAGE_ARGUMENTS --with-xunit --xunit-file nosetests_unittest.xml --verbose tests/python/unittest nosetests-2.7 $NOSE_COVERAGE_ARGUMENTS --with-xunit --xunit-file nosetests_train.xml --verbose tests/python/train @@ -539,9 +591,7 @@ unittest_ubuntu_python2_cpu() { unittest_ubuntu_python3_cpu() { set -ex export PYTHONPATH=./python/ - # MXNET_MKLDNN_DEBUG is buggy and produces false positives - # https://github.com/apache/incubator-mxnet/issues/10026 - #export MXNET_MKLDNN_DEBUG=1 # Ignored if not present + export MXNET_MKLDNN_DEBUG=1 # Ignored if not present export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 nosetests-3.4 $NOSE_COVERAGE_ARGUMENTS --with-xunit --xunit-file nosetests_unittest.xml --verbose tests/python/unittest nosetests-3.4 $NOSE_COVERAGE_ARGUMENTS --with-xunit --xunit-file nosetests_quantization.xml --verbose tests/python/quantization @@ -550,9 +600,7 @@ unittest_ubuntu_python3_cpu() { unittest_ubuntu_python3_cpu_mkldnn() { set -ex export PYTHONPATH=./python/ - # MXNET_MKLDNN_DEBUG is buggy and produces false positives - # https://github.com/apache/incubator-mxnet/issues/10026 - #export MXNET_MKLDNN_DEBUG=1 # Ignored if not present + export MXNET_MKLDNN_DEBUG=1 # Ignored if not present export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 nosetests-3.4 $NOSE_COVERAGE_ARGUMENTS --with-xunit --xunit-file nosetests_unittest.xml --verbose tests/python/unittest nosetests-3.4 $NOSE_COVERAGE_ARGUMENTS --with-xunit --xunit-file nosetests_mkl.xml --verbose tests/python/mkl @@ -561,9 +609,7 @@ unittest_ubuntu_python3_cpu_mkldnn() { unittest_ubuntu_python2_gpu() { set -ex export PYTHONPATH=./python/ - # MXNET_MKLDNN_DEBUG is buggy and produces false positives - # https://github.com/apache/incubator-mxnet/issues/10026 - #export MXNET_MKLDNN_DEBUG=1 # Ignored if not present + export MXNET_MKLDNN_DEBUG=1 # Ignored if not present export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 nosetests-2.7 $NOSE_COVERAGE_ARGUMENTS --with-xunit --xunit-file nosetests_gpu.xml --verbose tests/python/gpu } @@ -595,9 +641,7 @@ tutorialtest_ubuntu_python2_gpu() { unittest_ubuntu_python3_gpu() { set -ex export PYTHONPATH=./python/ - # MXNET_MKLDNN_DEBUG is buggy and produces false positives - # https://github.com/apache/incubator-mxnet/issues/10026 - #export MXNET_MKLDNN_DEBUG=1 # Ignored if not present + export MXNET_MKLDNN_DEBUG=1 # Ignored if not present export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 nosetests-3.4 $NOSE_COVERAGE_ARGUMENTS --with-xunit --xunit-file 
nosetests_gpu.xml --verbose tests/python/gpu } @@ -610,14 +654,21 @@ unittest_ubuntu_python3_gpu_nocudnn() { nosetests-3.4 $NOSE_COVERAGE_ARGUMENTS --with-xunit --xunit-file nosetests_gpu.xml --verbose tests/python/gpu } +unittest_ubuntu_tensorrt_gpu() { + set -ex + export PYTHONPATH=./python/ + export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 + export LD_LIBRARY_PATH=/work/mxnet/lib:$LD_LIBRARY_PATH + python tests/python/tensorrt/lenet5_train.py + nosetests-3.4 $NOSE_COVERAGE_ARGUMENTS --with-xunit --xunit-file nosetests_trt_gpu.xml --verbose tests/python/tensorrt/ +} + # quantization gpu currently only runs on P3 instances # need to separte it from unittest_ubuntu_python2_gpu() unittest_ubuntu_python2_quantization_gpu() { set -ex export PYTHONPATH=./python/ - # MXNET_MKLDNN_DEBUG is buggy and produces false positives - # https://github.com/apache/incubator-mxnet/issues/10026 - #export MXNET_MKLDNN_DEBUG=1 # Ignored if not present + export MXNET_MKLDNN_DEBUG=1 # Ignored if not present export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 nosetests-2.7 $NOSE_COVERAGE_ARGUMENTS --with-xunit --xunit-file nosetests_quantization_gpu.xml --verbose tests/python/quantization_gpu } @@ -627,9 +678,7 @@ unittest_ubuntu_python2_quantization_gpu() { unittest_ubuntu_python3_quantization_gpu() { set -ex export PYTHONPATH=./python/ - # MXNET_MKLDNN_DEBUG is buggy and produces false positives - # https://github.com/apache/incubator-mxnet/issues/10026 - #export MXNET_MKLDNN_DEBUG=1 # Ignored if not present + export MXNET_MKLDNN_DEBUG=1 # Ignored if not present export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 nosetests-3.4 $NOSE_COVERAGE_ARGUMENTS --with-xunit --xunit-file nosetests_quantization_gpu.xml --verbose tests/python/quantization_gpu } @@ -788,11 +837,14 @@ build_docs() { pushd . cd /work/mxnet/docs/build_version_doc # Parameters are set in the Jenkins pipeline: restricted-website-build - # $1 is the list of branches to build; $2 is the list of tags to display + # $1: the list of branches/tags to build + # $2: the list of tags to display # So you can build from the 1.2.0 branch, but display 1.2.1 on the site - ./build_all_version.sh $1 $2 - # $3 is the default version tag for the website; $4 is the base URL - ./update_all_version.sh $2 $3 $4 + # $3: the fork URL + ./build_all_version.sh $1 $2 $3 + # $4: the default version tag for the website + # $5: the base URL + ./update_all_version.sh $2 $4 $5 cd VersionedWeb tar -zcvf ../artifacts.tgz . popd @@ -806,7 +858,7 @@ nightly_test_rat_check() { set -e pushd . - cd /work/deps/trunk/apache-rat/target + cd /work/deps/0.12-release/apache-rat/target # Use shell number 5 to duplicate the log output. 
It get sprinted and stored in $OUTPUT at the same time https://stackoverflow.com/a/12451419 exec 5>&1 @@ -942,6 +994,7 @@ broken_link_checker() { ./tests/nightly/broken_link_checker_test/broken_link_checker.sh } + ############################################################## # MAIN # @@ -961,3 +1014,5 @@ EOF declare -F | cut -d' ' -f3 echo fi + + diff --git a/tests/ci_build/deploy/ci_deploy_doc.sh b/ci/other/ci_deploy_doc.sh similarity index 100% rename from tests/ci_build/deploy/ci_deploy_doc.sh rename to ci/other/ci_deploy_doc.sh diff --git a/tests/ci_build/pylintrc b/ci/other/pylintrc similarity index 100% rename from tests/ci_build/pylintrc rename to ci/other/pylintrc diff --git a/ci/test_docker_cache.py b/ci/test_docker_cache.py old mode 100644 new mode 100755 diff --git a/ci/windows/test_py2_cpu.ps1 b/ci/windows/test_py2_cpu.ps1 index 1623d2956103..aa38b81e392e 100644 --- a/ci/windows/test_py2_cpu.ps1 +++ b/ci/windows/test_py2_cpu.ps1 @@ -16,6 +16,7 @@ # under the License. 7z x -y windows_package.7z +$env:MXNET_LIBRARY_PATH=join-path $pwd.Path windows_package\lib\libmxnet.dll $env:PYTHONPATH=join-path $pwd.Path windows_package\python $env:MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 c:\Anaconda3\envs\py2\Scripts\pip install -r tests\requirements.txt diff --git a/ci/windows/test_py2_gpu.ps1 b/ci/windows/test_py2_gpu.ps1 index 13cd5366e0db..5f8de5ac4f9e 100644 --- a/ci/windows/test_py2_gpu.ps1 +++ b/ci/windows/test_py2_gpu.ps1 @@ -16,6 +16,7 @@ # under the License. 7z x -y windows_package.7z +$env:MXNET_LIBRARY_PATH=join-path $pwd.Path windows_package\lib\libmxnet.dll $env:PYTHONPATH=join-path $pwd.Path windows_package\python $env:MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 c:\Anaconda3\envs\py2\Scripts\pip install -r tests\requirements.txt diff --git a/ci/windows/test_py3_cpu.ps1 b/ci/windows/test_py3_cpu.ps1 index 98d4e410e8f5..0dd48de26b32 100644 --- a/ci/windows/test_py3_cpu.ps1 +++ b/ci/windows/test_py3_cpu.ps1 @@ -16,6 +16,7 @@ # under the License. 7z x -y windows_package.7z +$env:MXNET_LIBRARY_PATH=join-path $pwd.Path windows_package\lib\libmxnet.dll $env:PYTHONPATH=join-path $pwd.Path windows_package\python $env:MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 c:\Anaconda3\envs\py3\Scripts\pip install -r tests\requirements.txt diff --git a/ci/windows/test_py3_gpu.ps1 b/ci/windows/test_py3_gpu.ps1 index b94b4f389be8..4a0feb1ede88 100644 --- a/ci/windows/test_py3_gpu.ps1 +++ b/ci/windows/test_py3_gpu.ps1 @@ -16,6 +16,7 @@ # under the License. 7z x -y windows_package.7z +$env:MXNET_LIBRARY_PATH=join-path $pwd.Path windows_package\lib\libmxnet.dll $env:PYTHONPATH=join-path $pwd.Path windows_package\python $env:MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 c:\Anaconda3\envs\py3\Scripts\pip install -r tests\requirements.txt diff --git a/cpp-package/include/mxnet-cpp/symbol.h b/cpp-package/include/mxnet-cpp/symbol.h index 1c825c1502af..a25824cad602 100644 --- a/cpp-package/include/mxnet-cpp/symbol.h +++ b/cpp-package/include/mxnet-cpp/symbol.h @@ -178,6 +178,8 @@ class Symbol { std::vector ListOutputs() const; /*! \return get the descriptions of auxiliary data for this symbol */ std::vector ListAuxiliaryStates() const; + /*! \return get the name of the symbol */ + std::string GetName() const; /*! * \brief infer and construct all the arrays to bind to executor by providing * some known arrays. 
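The new `Symbol::GetName()` accessor mirrors what the Python frontend already exposes as `Symbol.name`; a quick sanity check (a minimal sketch, assuming a standard MXNet Python install):

```python
import mxnet as mx

fc = mx.sym.FullyConnected(data=mx.sym.Variable('data'), num_hidden=10, name='fc1')
assert fc.name == 'fc1'  # the same string MXSymbolGetName hands back to the C++ wrapper
```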
diff --git a/cpp-package/include/mxnet-cpp/symbol.hpp b/cpp-package/include/mxnet-cpp/symbol.hpp index 11590fad6041..b82e060ca8da 100644 --- a/cpp-package/include/mxnet-cpp/symbol.hpp +++ b/cpp-package/include/mxnet-cpp/symbol.hpp @@ -172,6 +172,14 @@ inline std::vector Symbol::ListAuxiliaryStates() const { return ret; } +inline std::string Symbol::GetName() const { + int success; + const char* out_name; + CHECK_EQ(MXSymbolGetName(GetHandle(), &out_name, &success), 0); + CHECK_EQ(success, 1); + return std::string(out_name); +} + inline void Symbol::InferShape( const std::map > &arg_shapes, std::vector > *in_shape, diff --git a/docs/Jenkinsfile b/docs/Jenkinsfile index ef0755faac7c..e20d984f0b5f 100644 --- a/docs/Jenkinsfile +++ b/docs/Jenkinsfile @@ -21,61 +21,34 @@ // See documents at https://jenkins.io/doc/book/pipeline/jenkinsfile/ // timeout in minutes -max_time = 60 -// assign any caught errors here -err = null +max_time = 120 -// initialize source code -def init_git() { - deleteDir() - retry(5) { - try { - // Make sure wait long enough for api.github.com request quota. Important: Don't increase the amount of - // retries as this will increase the amount of requests and worsen the throttling - timeout(time: 15, unit: 'MINUTES') { - checkout scm - sh 'git submodule update --init --recursive' - sh 'git clean -d -f' - } - } catch (exc) { - deleteDir() - error "Failed to fetch source codes with ${exc}" - sleep 2 - } - } +node('restricted-mxnetlinux-cpu') { + // Loading the utilities requires a node context unfortunately + checkout scm + utils = load('ci/Jenkinsfile_utils.groovy') } +utils.assign_node_labels(linux_cpu: 'restricted-mxnetlinux-cpu', linux_gpu: 'restricted-mxnetlinux-gpu', linux_gpu_p3: 'restricted-mxnetlinux-gpu-p3', windows_cpu: 'restricted-mxnetwindows-cpu', windows_gpu: 'restricted-mxnetwindows-gpu') -try { +utils.main_wrapper( +core_logic: { stage('Build Docs') { - node('restricted-mxnetlinux-cpu') { + node(NODE_LINUX_CPU) { ws('workspace/docs') { - init_git() + utils.init_git() timeout(time: max_time, unit: 'MINUTES') { - sh "ci/build.py -p ubuntu_cpu --docker-registry ${env.DOCKER_CACHE_REGISTRY} --docker-build-retries 3 /work/runtime_functions.sh build_docs ${params.tags_to_build} ${params.tag_list} ${params.tag_default} ${params.domain}" + sh "ci/build.py -p ubuntu_cpu --docker-registry ${env.DOCKER_CACHE_REGISTRY} --docker-build-retries 3 /work/runtime_functions.sh build_docs ${params.tags_to_build} ${params.tag_list} ${params.fork} ${params.tag_default} ${params.domain}" archiveArtifacts 'docs/build_version_doc/artifacts.tgz' build 'restricted-website-publish' } } } } - - // set build status to success at the end - currentBuild.result = "SUCCESS" -} catch (caughtError) { - node("restricted-mxnetlinux-cpu") { - sh "echo caught ${caughtError}" - err = caughtError - currentBuild.result = "FAILURE" - } -} finally { - node("restricted-mxnetlinux-cpu") { - // Only send email if master failed - if (currentBuild.result == "FAILURE") { - emailext body: 'Generating the website has failed. Please view the build at ${BUILD_URL}', replyTo: '${EMAIL}', subject: '[WEBSITE FAILED] Build ${BUILD_NUMBER}', to: '${EMAIL}' - } - // Remember to rethrow so the build is marked as failing - if (err) { - throw err - } +} +, +failure_handler: { + if (currentBuild.result == "FAILURE") { + emailext body: 'Generating the website has failed. 
Please view the build at ${BUILD_URL}', replyTo: '${EMAIL}', subject: '[WEBSITE FAILED] Build ${BUILD_NUMBER}', to: '${EMAIL}' } } +) diff --git a/docs/Jenkinsfile-dev b/docs/Jenkinsfile-dev new file mode 100644 index 000000000000..169ebe13e4b5 --- /dev/null +++ b/docs/Jenkinsfile-dev @@ -0,0 +1,54 @@ +// -*- mode: groovy -*- + +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Jenkins pipeline +// See documents at https://jenkins.io/doc/book/pipeline/jenkinsfile/ + +// timeout in minutes +max_time = 120 + +node('mxnetlinux-cpu') { + // Loading the utilities requires a node context unfortunately + checkout scm + utils = load('ci/Jenkinsfile_utils.groovy') +} +utils.assign_node_labels(linux_cpu: 'mxnetlinux-cpu', linux_gpu: 'mxnetlinux-gpu', linux_gpu_p3: 'mxnetlinux-gpu-p3', windows_cpu: 'mxnetwindows-cpu', windows_gpu: 'mxnetwindows-gpu') + +utils.main_wrapper( +core_logic: { + stage('Build Docs') { + node(NODE_LINUX_CPU) { + ws('workspace/docs') { + utils.init_git() + timeout(time: max_time, unit: 'MINUTES') { + sh "ci/build.py -p ubuntu_cpu --docker-registry ${env.DOCKER_CACHE_REGISTRY} --docker-build-retries 3 /work/runtime_functions.sh build_docs ${params.tags_to_build} ${params.tag_list} ${params.fork} ${params.tag_default} ${params.domain}" + archiveArtifacts 'docs/build_version_doc/artifacts.tgz' + build 'test-website-publish' + } + } + } + } +} +, +failure_handler: { + if (currentBuild.result == "FAILURE") { + // Do nothing. + } +} +) diff --git a/docs/Makefile b/docs/Makefile index 4673d0ed1b5d..90603b0875fc 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -72,6 +72,8 @@ livehtml: sphinx-autobuild --ignore "web-data/*" -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html html: + export BUILD_VER=$(BUILD_VER) + @echo "Env var set for BUILD_VER: $(BUILD_VER)" $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html @echo @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 
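The `BUILD_VER` variable exported above drives the version-aware docs build; a minimal sketch of the selection logic (it assumes the `settings.ini` scheme that docs/mxdoc.py introduces further below):

```python
import os
from six.moves import configparser

build_ver = os.getenv('BUILD_VER', 'default')
parser = configparser.SafeConfigParser()
parser.read('settings.ini')

section = 'document_sets_' + build_ver
if section not in parser.sections():
    section = 'document_sets_default'  # fall back for unknown versions

print(parser.getboolean(section, 'scala_docs'))  # whether to build the Scala docs
```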
diff --git a/docs/_static/js/sidebar.js b/docs/_static/js/sidebar.js index 00196cefe34f..549f10ef4ec7 100644 --- a/docs/_static/js/sidebar.js +++ b/docs/_static/js/sidebar.js @@ -1,5 +1,5 @@ /*Preprocess*/ -var LANG = ['python', 'scala', 'clojure', 'r', 'julia', 'c++', 'perl']; +var LANG = ['python', 'c++', 'clojure', 'julia', 'perl', 'r', 'scala']; var TITLE_WITH_LANG = ['/tutorials/', '/faq/', '/architecture/', '/community/']; for(var i = 0; i < LANG.length; ++i) { TITLE_WITH_LANG.push('/api/' + LANG[i] + '/'); @@ -27,23 +27,27 @@ function render_left_helper(toc) { /*Render content tree of different pages*/ function render_lefttoc() { var url = window.location.href, indexTrailing = 'index.html'; + if (url.indexOf('/get_started/') != -1) { + var leftToc = ""; + render_left_helper($($.parseHTML(leftToc)), 'Get Started'); + keepExpand(); + $('.sphinxsidebar').css("visibility", "visible"); + return; + } // If current page is not index page if (url.indexOf(indexTrailing) == -1) { for(var i = 0; i < TITLE_WITH_LANG.length; ++i) { var path = TITLE_WITH_LANG[i]; - // if it is an API page if (url.indexOf(path) != -1) { urlElem = url.split('/'); version = ''; for (var j = 0; j < urlElem.length; ++j) { if(urlElem[j] == 'versions') { - // get what version this is version = '/versions/' + urlElem[j + 1]; break; } } var protocol = location.protocol.concat("//"); - // regenerate the url with the version in it var urlPath = protocol + window.location.host + version + path; $.get(urlPath + indexTrailing, null, function(data) { var lastToc = $($.parseHTML(data)).find('.leftsidebar > .sphinxsidebarwrapper > ul.current > li.current > ul') @@ -65,7 +69,7 @@ function render_lefttoc() { } else { var toc = $('.leftsidebar > .sphinxsidebarwrapper > ul.current > li.current > ul').clone(); - //render_left_helper(toc); + render_left_helper(toc); $('.sphinxsidebar').css("visibility", "visible"); } } @@ -218,7 +222,7 @@ $(document).ready(function () { } } render_righttoc(); - //if ($('.leftsidebar').length) render_lefttoc(); + if ($('.leftsidebar').length) render_lefttoc(); } if ($('div.sphinxsidebar').css('visibility') == 'hidden') $('.content').css('width', '100%'); if (url.indexOf('/api/') != -1) return; diff --git a/docs/_static/mxnet-theme/navbar.html b/docs/_static/mxnet-theme/navbar.html index 6588ec0cca85..f7f5997ab4ef 100644 --- a/docs/_static/mxnet-theme/navbar.html +++ b/docs/_static/mxnet-theme/navbar.html @@ -20,10 +20,11 @@

@@ -1853,9 +1854,43 @@ MXNet supports the Debian based Raspbian ARM based operating system so you can r These instructions will walk through how to build MXNet for the Raspberry Pi and install the Python bindings for the library. +You can do a dockerized cross compilation build on your local machine or a native build on-device. + The complete MXNet library and its requirements can take almost 200MB of RAM, and loading large models with the library can take over 1GB of RAM. Because of this, we recommend running MXNet on the Raspberry Pi 3 or an equivalent device that has more than 1 GB of RAM and a Secure Digital (SD) card that has at least 4 GB of free memory. -**Install MXNet** +**Cross compilation build (Experimental)** + +## Docker installation +**Step 1** Install Docker on your machine by following the [docker installation instructions](https://docs.docker.com/engine/installation/linux/ubuntu/#install-using-the-repository). + +*Note* - You can install the Community Edition (CE). + +**Step 2** [Optional] Post installation steps to manage Docker as a non-root user. + +Follow the four steps in this [docker documentation](https://docs.docker.com/engine/installation/linux/linux-postinstall/#manage-docker-as-a-non-root-user) to allow managing docker containers without *sudo*. + +## Build + +The following command will build a container with dependencies and tools and then compile MXNet for +ARMv7. The resulting artifact will be located in `build/mxnet-x.x.x-py2.py3-none-any.whl`; copy this +file to your Raspberry Pi. + +```bash +ci/build.py -p armv7 +``` + +## Install + +Create a virtualenv and install the package we created previously. + +```bash +virtualenv -p `which python3` mxnet_py3 +source mxnet_py3/bin/activate +pip install mxnet-x.x.x-py2.py3-none-any.whl +``` + + +**Native Build** Installing MXNet is a two-step process: @@ -1874,35 +1909,46 @@ On Raspbian versions Wheezy and later, you need the following dependencies: - A C++ compiler that supports C++ 11. The C++ compiler compiles and builds MXNet source code. Supported compilers include the following: -- [G++ (4.8 or later)](https://gcc.gnu.org/gcc-4.8/) +- [G++ (4.8 or later)](https://gcc.gnu.org/gcc-4.8/). Make sure to use gcc 4 and not 5 or 6 as there + are known bugs with these compilers. Install these dependencies using the following commands in any directory: ```bash sudo apt-get update - sudo apt-get -y install git cmake build-essential g++-4.8 c++-4.8 liblapack* libblas* libopencv* + sudo apt-get -y install git cmake ninja-build build-essential g++-4.9 c++-4.9 liblapack* libblas* libopencv* libopenblas* python3-dev virtualenv ``` -Clone the MXNet source code repository using the following ```git``` command in your home directory: +Clone the MXNet source code repository using the following `git` command in your home directory: ```bash git clone https://github.com/apache/incubator-mxnet.git --recursive cd incubator-mxnet ``` -If you aren't processing images with MXNet on the Raspberry Pi, you can minimize the size of the compiled library by building MXNet without the Open Source Computer Vision (OpenCV) library with the following commands: +Build: ```bash - export USE_OPENCV = 0 - make + mkdir -p build && cd build + cmake \ + -DUSE_SSE=OFF \ + -DUSE_CUDA=OFF \ + -DUSE_OPENCV=ON \ + -DUSE_OPENMP=ON \ + -DUSE_MKL_IF_AVAILABLE=OFF \ + -DUSE_SIGNAL_HANDLER=ON \ + -DCMAKE_BUILD_TYPE=Release \ + -GNinja ..
+ ninja -j1 ``` +Some compilation units require memory close to 1GB, so it's recommended that you enable swap as +explained below and be cautious about increasing the number of jobs when building (-j). -Otherwise, you can build the complete MXNet library with the following command: -```bash - make -``` - -Executing either of these commands start the build process, which can take up to a couple hours, and creates a file called ```libmxnet.so``` in the mxnet/lib directory. +Executing these commands starts the build process, which can take up to a couple of hours, and creates a file called `libmxnet.so` in the build directory. -If you are getting build errors in which the compiler is being killed, it is likely that the compiler is running out of memory (especially if you are on Raspberry Pi 1, 2 or Zero, which have less than 1GB of RAM), this can often be rectified by increasing the swapfile size on the Pi by editing the file /etc/dphys-swapfile and changing the line CONF_SWAPSIZE=100 to CONF_SWAPSIZE=1024, then running: +If you are getting build errors in which the compiler is being killed, it is likely that the +compiler is running out of memory (especially if you are on Raspberry Pi 1, 2 or Zero, which have +less than 1GB of RAM). This can often be rectified by increasing the swapfile size on the Pi by +editing the file /etc/dphys-swapfile and changing the line CONF_SWAPSIZE=100 to CONF_SWAPSIZE=1024, +then running: ```bash sudo /etc/init.d/dphys-swapfile stop sudo /etc/init.d/dphys-swapfile start @@ -1921,6 +1967,12 @@ To install Python bindings run the following commands in the MXNet directory: Note that the `-e` flag is optional. It is equivalent to `--editable` and means that if you edit the source files, these changes will be reflected in the package installed. +Alternatively, you can create a .whl package installable with pip using the following command: +```bash +ci/docker/runtime_functions.sh build_wheel python/ $(realpath build) +``` + + +You are now ready to run MXNet on your Raspberry Pi device. You can get started by following the tutorial on [Real-time Object Detection with MXNet On The Raspberry Pi](http://mxnet.io/tutorials/embedded/wine_detector.html). *Note - Because the complete MXNet library takes up a significant amount of the Raspberry Pi's limited RAM, when loading training data or large models into memory, you might have to turn off the GUI and terminate running processes to free RAM.* @@ -1928,7 +1980,9 @@
+ +# Nvidia Jetson TX family MXNet supports the Ubuntu Arch64 based operating system so you can run MXNet on NVIDIA Jetson Devices. @@ -1965,7 +2019,7 @@ Install these dependencies using the following commands in any directory: sudo pip install graphviz jupyter ``` -Clone the MXNet source code repository using the following ```git``` command in your home directory: +Clone the MXNet source code repository using the following `git` command in your home directory: ```bash git clone https://github.com/apache/incubator-mxnet.git --recursive cd incubator-mxnet @@ -1986,7 +2040,7 @@ Now you can build the complete MXNet library with the following command: make -j $(nproc) ``` -Executing this command creates a file called ```libmxnet.so``` in the mxnet/lib directory. +Executing this command creates a file called `libmxnet.so` in the mxnet/lib directory. **Step 2** Install MXNet Python Bindings @@ -2022,8 +2076,8 @@ You are now ready to run MXNet on your NVIDIA Jetson TX2 device. # Validate MXNet Installation
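For reference, the short validation program that the following subsections describe is, as a minimal sketch:

```python
import mxnet as mx

a = mx.nd.ones((2, 3))
b = a * 2 + 1
print(b.asnumpy())
# expected output:
# [[ 3.  3.  3.]
#  [ 3.  3.  3.]]
```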
@@ -2078,16 +2132,16 @@ Run a short *MXNet* python program to create a 2X3 matrix of ones, multiply each array([[ 3., 3., 3.], [ 3., 3., 3.]], dtype=float32) ``` -
Run a short *MXNet* python program to create a 2X3 matrix of ones, multiply each element in the matrix by 2 followed by adding 1. We expect the output to be a 2X3 matrix with all elements being 3. @@ -2108,8 +2162,8 @@ array([[ 3., 3., 3.],
@@ -2124,6 +2178,7 @@ Will be available soon. From the MXNet root directory run: `python example/image-classification/train_mnist.py --network lenet --gpus 0` to test GPU training.
@@ -2147,15 +2202,16 @@ Will be available soon. From the MXNet root directory run: `python example/image-classification/train_mnist.py --network lenet --gpus 0` to test GPU training.
@@ -2210,9 +2266,9 @@ Run a short *MXNet* python program to create a 2X3 matrix of ones *a* on a *GPU* array([[ 3., 3., 3.], [ 3., 3., 3.]], dtype=float32) ``` -
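The corresponding GPU variant of the check described above, as a minimal sketch (it assumes a CUDA-enabled MXNet build):

```python
import mxnet as mx

a = mx.nd.ones((2, 3), ctx=mx.gpu())
b = a * 2 + 1
print(b.asnumpy())
# expected output:
# [[ 3.  3.  3.]
#  [ 3.  3.  3.]]
```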
@@ -2221,8 +2277,8 @@ array([[ 3., 3., 3.],
@@ -2261,8 +2317,8 @@ root@4919c4f58cac:/# exit
@@ -2357,8 +2413,8 @@ array([[ 3., 3., 3.],
Run a short *MXNet* R program to create a 2X3 matrix of ones, multiply each element in the matrix by 2 followed by adding 1. We expect the output to be a 2X3 matrix with all elements being 3. @@ -2384,8 +2440,8 @@ You should see the following output:
Run a short *MXNet* R program to create a 2X3 matrix of ones *a* on a *GPU*, multiply each element in the matrix by 2 followed by adding 1. We expect the output to be a 2X3 matrix with all elements being 3. We use *mx.gpu()*, to set *MXNet* context to be GPUs. @@ -2411,40 +2467,42 @@ You should see the following output:
Run the MXNet-Scala demo project to validate your Maven package installation. -
Will be available soon. -
Run the MXNet-Scala demo project to validate your Maven package installation. -
Will be available soon. -
@@ -2464,19 +2522,6 @@ Will be available soon.
- -Will be available soon. - -
- -Will be available soon. - -
diff --git a/docs/mxdoc.py b/docs/mxdoc.py index 8332ae22d48f..33f64750e816 100644 --- a/docs/mxdoc.py +++ b/docs/mxdoc.py @@ -23,9 +23,31 @@ import sys from recommonmark import transform import pypandoc -# import StringIO from io for python3 compatibility -from io import StringIO import contextlib +# Use six for Python 2 / 3 compatibility +from six import StringIO +from six.moves import configparser + +_BUILD_VER = os.getenv('BUILD_VER', 'default') +print("Building version {}".format(_BUILD_VER)) +_DOC_SET = 'document_sets_' + _BUILD_VER + +parser = configparser.SafeConfigParser() +parser.read('settings.ini') + +if _DOC_SET not in parser.sections(): + _DOC_SET = 'document_sets_default' + +for section in [ _DOC_SET ]: + print("Document sets to generate:") + for candidate in [ 'scala_docs', 'clojure_docs', 'doxygen_docs', 'r_docs' ]: + print('%-12s : %s' % (candidate, parser.get(section, candidate))) + +_MXNET_DOCS_BUILD_MXNET = parser.getboolean('mxnet', 'build_mxnet') +_SCALA_DOCS = parser.getboolean(_DOC_SET, 'scala_docs') +_CLOJURE_DOCS = parser.getboolean(_DOC_SET, 'clojure_docs') +_DOXYGEN_DOCS = parser.getboolean(_DOC_SET, 'doxygen_docs') +_R_DOCS = parser.getboolean(_DOC_SET, 'r_docs') # white list to evaluate the code block output, such as ['tutorials/gluon'] _EVAL_WHILTELIST = [] @@ -72,7 +94,7 @@ def build_mxnet(app): def build_r_docs(app): """build r pdf""" r_root = app.builder.srcdir + '/../R-package' - pdf_path = root_path + '/docs/api/r/mxnet-r-reference-manual.pdf' + pdf_path = app.builder.srcdir + '/api/r/mxnet-r-reference-manual.pdf' _run_cmd('cd ' + r_root + '; R -e "roxygen2::roxygenize()"; R CMD Rd2pdf . --no-preview -o ' + pdf_path) dest_path = app.builder.outdir + '/api/r/' @@ -383,13 +405,21 @@ def setup(app): # If MXNET_DOCS_BUILD_MXNET is set something different than 1 # Skip the build step - if os.getenv('MXNET_DOCS_BUILD_MXNET', '1') == '1': + if os.getenv('MXNET_DOCS_BUILD_MXNET', '1') == '1' or _MXNET_DOCS_BUILD_MXNET: + print("Building MXNet!") app.connect("builder-inited", build_mxnet) - app.connect("builder-inited", generate_doxygen) - app.connect("builder-inited", build_scala_docs) - app.connect("builder-inited", build_clojure_docs) - # skipped to build r, it requires to install latex, which is kinds of too heavy - # app.connect("builder-inited", build_r_docs) + if _DOXYGEN_DOCS: + print("Building Doxygen!") + app.connect("builder-inited", generate_doxygen) + if _SCALA_DOCS: + print("Building Scala Docs!") + app.connect("builder-inited", build_scala_docs) + if _CLOJURE_DOCS: + print("Building Clojure Docs!") + app.connect("builder-inited", build_clojure_docs) + if _R_DOCS: + print("Building R Docs!") + app.connect("builder-inited", build_r_docs) app.connect('source-read', convert_table) app.connect('source-read', add_buttons) app.add_config_value('recommonmark_config', { diff --git a/docs/settings.ini b/docs/settings.ini new file mode 100644 index 000000000000..f999b3efde23 --- /dev/null +++ b/docs/settings.ini @@ -0,0 +1,68 @@ +[mxnet] +build_mxnet = 0 + +[document_sets_default] +clojure_docs = 1 +doxygen_docs = 1 +r_docs = 0 +scala_docs = 1 + +[document_sets_1.2.0] +clojure_docs = 0 +doxygen_docs = 1 +r_docs = 0 +scala_docs = 1 + +[document_sets_v1.2.0] +clojure_docs = 1 +doxygen_docs = 1 +r_docs = 0 +scala_docs = 1 + +[document_sets_1.1.0] +clojure_docs = 0 +doxygen_docs = 1 +r_docs = 0 +scala_docs = 0 + +[document_sets_v1.1.0] +clojure_docs = 0 +doxygen_docs = 1 +r_docs = 0 +scala_docs = 0 + +[document_sets_1.0.0] +clojure_docs = 0 +doxygen_docs = 1 
+r_docs = 0 +scala_docs = 0 + +[document_sets_v1.0.0] +clojure_docs = 0 +doxygen_docs = 1 +r_docs = 0 +scala_docs = 0 + +[document_sets_0.12.0] +clojure_docs = 0 +doxygen_docs = 1 +r_docs = 0 +scala_docs = 0 + +[document_sets_v0.12.0] +clojure_docs = 0 +doxygen_docs = 1 +r_docs = 0 +scala_docs = 0 + +[document_sets_0.11.0] +clojure_docs = 0 +doxygen_docs = 1 +r_docs = 0 +scala_docs = 0 + +[document_sets_v0.11.0] +clojure_docs = 0 +doxygen_docs = 1 +r_docs = 0 +scala_docs = 0 diff --git a/docs/tutorials/index.md b/docs/tutorials/index.md index 3f552a998382..82e8ac9e41bd 100644 --- a/docs/tutorials/index.md +++ b/docs/tutorials/index.md @@ -173,6 +173,17 @@ Select API: 
+## Perl Tutorials + +* Getting Started + * [Machine learning in Perl](http://blogs.perl.org/users/sergey_kolychev/2017/02/machine-learning-in-perl.html) + * [Calculator and Robo-Shakespeare](http://blogs.perl.org/users/sergey_kolychev/2017/04/machine-learning-in-perl-part2-a-calculator-handwritten-digits-and-roboshakespeare.html) +* Gluon + * [DCGAN](http://blogs.perl.org/users/sergey_kolychev/2017/10/machine-learning-in-perl-part3-deep-convolutional-generative-adversarial-network.html) + * [Image classification and Style transfer](http://blogs.perl.org/users/sergey_kolychev/2018/07/machine-learning-in-perl-kyuubi-goes-to-a-modelzoo-during-the-starry-night.html) + +
+ ## Contributing Tutorials We really appreciate contributions, and tutorials are a great way to share your knowledge and help the community. After you have followed [these steps](https://github.com/apache/incubator-mxnet/tree/master/example#contributing), please submit a pull request on Github. diff --git a/docs/tutorials/r/fiveMinutesNeuralNetwork.md b/docs/tutorials/r/fiveMinutesNeuralNetwork.md index 9104e8f05c2f..a2ce5ecd3761 100644 --- a/docs/tutorials/r/fiveMinutesNeuralNetwork.md +++ b/docs/tutorials/r/fiveMinutesNeuralNetwork.md @@ -1,18 +1,21 @@ Develop a Neural Network with MXNet in Five Minutes ============================================= -This tutorial is designed for new users of the `mxnet` package for R. It shows how to construct a neural network to do regression in 5 minutes. It shows how to perform classification and regression tasks, respectively. The data we use is in the `mlbench` package. +This tutorial is designed for new users of the `mxnet` package for R. It shows how to construct a neural network and how to perform both classification and regression tasks, all in five minutes. The data we use is in the `mlbench` package. Instructions to install R and MXNet's R package in different environments can be found [here](http://mxnet.incubator.apache.org/install/index.html?platform=Linux&language=R&processor=CPU). ## Classification - - + ``` + ## Loading required package: mlbench + ``` ```r - require(mlbench) + if (!require(mlbench)) { + install.packages('mlbench') + } ``` ``` - ## Loading required package: mlbench + ## Loading required package: mxnet ``` ```r @@ -20,8 +23,7 @@ This tutorial is designed for new users of the `mxnet` package for R. It shows h ``` ``` - ## Loading required package: mxnet - ## Loading required package: methods + ## Loading required datasets ``` ```r @@ -235,7 +237,8 @@ Currently, we have four predefined metrics: "accuracy", "rmse", "mae", and "rmsl ```r demo.metric.mae <- mx.metric.custom("mae", function(label, pred) { - res <- mean(abs(label-pred)) + pred <- mx.nd.reshape(pred, shape = 0) + res <- mx.nd.mean(mx.nd.abs(label-pred)) return(res) }) ``` @@ -253,56 +256,56 @@ This is an example of the mean absolute error metric. Simply plug it into the tr ``` ## Auto detect layout of input matrix, use rowmajor.
## Start training with 1 devices - ## [1] Train-mae=13.1889538083225 - ## [2] Train-mae=9.81431959337658 - ## [3] Train-mae=9.21576419870059 - ## [4] Train-mae=8.38071537613869 - ## [5] Train-mae=7.45462437611487 - ## [6] Train-mae=6.93423301743136 - ## [7] Train-mae=6.91432357016537 - ## [8] Train-mae=7.02742733055105 - ## [9] Train-mae=7.00618194618469 - ## [10] Train-mae=6.92541576984028 - ## [11] Train-mae=6.87530243690643 - ## [12] Train-mae=6.84757369098564 - ## [13] Train-mae=6.82966501611388 - ## [14] Train-mae=6.81151759574811 - ## [15] Train-mae=6.78394182841811 - ## [16] Train-mae=6.75914719419347 - ## [17] Train-mae=6.74180388773481 - ## [18] Train-mae=6.725853071279 - ## [19] Train-mae=6.70932178215848 - ## [20] Train-mae=6.6928868798746 - ## [21] Train-mae=6.6769521329138 - ## [22] Train-mae=6.66184809505939 - ## [23] Train-mae=6.64754504809777 - ## [24] Train-mae=6.63358514060577 - ## [25] Train-mae=6.62027640889088 - ## [26] Train-mae=6.60738245232238 - ## [27] Train-mae=6.59505546771818 - ## [28] Train-mae=6.58346195800437 - ## [29] Train-mae=6.57285477783945 - ## [30] Train-mae=6.56259003960424 - ## [31] Train-mae=6.5527790788975 - ## [32] Train-mae=6.54353428422991 - ## [33] Train-mae=6.5344172368447 - ## [34] Train-mae=6.52557652526432 - ## [35] Train-mae=6.51697905850079 - ## [36] Train-mae=6.50847898812758 - ## [37] Train-mae=6.50014844106303 - ## [38] Train-mae=6.49207674844397 - ## [39] Train-mae=6.48412070125341 - ## [40] Train-mae=6.47650500999557 - ## [41] Train-mae=6.46893867486053 - ## [42] Train-mae=6.46142131653097 - ## [43] Train-mae=6.45395035048326 - ## [44] Train-mae=6.44652914123403 - ## [45] Train-mae=6.43916216409869 - ## [46] Train-mae=6.43183777381976 - ## [47] Train-mae=6.42455544223388 - ## [48] Train-mae=6.41731406417158 - ## [49] Train-mae=6.41011292926139 - ## [50] Train-mae=6.40312503493494 + ## [1] Train-mae=14.953625731998 + ## [2] Train-mae=11.4802955521478 + ## [3] Train-mae=8.50700579749213 + ## [4] Train-mae=7.30591265360514 + ## [5] Train-mae=7.38049803839789 + ## [6] Train-mae=7.36036252975464 + ## [7] Train-mae=7.06519222259521 + ## [8] Train-mae=6.9962231847975 + ## [9] Train-mae=6.96296903822157 + ## [10] Train-mae=6.9046172036065 + ## [11] Train-mae=6.87867620256212 + ## [12] Train-mae=6.85872554779053 + ## [13] Train-mae=6.81936407089233 + ## [14] Train-mae=6.79135354359945 + ## [15] Train-mae=6.77438741260105 + ## [16] Train-mae=6.75365140702989 + ## [17] Train-mae=6.73369296391805 + ## [18] Train-mae=6.71600982877943 + ## [19] Train-mae=6.69932826360067 + ## [20] Train-mae=6.6852519777086 + ## [21] Train-mae=6.67343420452542 + ## [22] Train-mae=6.66315894656711 + ## [23] Train-mae=6.65314838621351 + ## [24] Train-mae=6.64388704299927 + ## [25] Train-mae=6.63480265935262 + ## [26] Train-mae=6.62583245171441 + ## [27] Train-mae=6.61697626113892 + ## [28] Train-mae=6.60842116673787 + ## [29] Train-mae=6.60040124257406 + ## [30] Train-mae=6.59264140658908 + ## [31] Train-mae=6.58551020092434 + ## [32] Train-mae=6.57864215638902 + ## [33] Train-mae=6.57178926467896 + ## [34] Train-mae=6.56495311525133 + ## [35] Train-mae=6.55813185373942 + ## [36] Train-mae=6.5513252152337 + ## [37] Train-mae=6.54453214009603 + ## [38] Train-mae=6.53775374094645 + ## [39] Train-mae=6.53098879920112 + ## [40] Train-mae=6.52423816257053 + ## [41] Train-mae=6.51764053768582 + ## [42] Train-mae=6.51121346155802 + ## [43] Train-mae=6.5047902001275 + ## [44] Train-mae=6.49837123023139 + ## [45] Train-mae=6.49216641320123 + ## [46] Train-mae=6.48598252402412 
+ ## [47] Train-mae=6.4798010720147 + ## [48] Train-mae=6.47362396452162 + ## [49] Train-mae=6.46745183732775 + ## [50] Train-mae=6.46128723356459 ``` Congratulations! You've learned the basics for using MXNet in R. To learn how to use MXNet's advanced features, see the other tutorials. diff --git a/example/autoencoder/model.py b/example/autoencoder/model.py index c1b72216ef98..9b6185c9fd18 100644 --- a/example/autoencoder/model.py +++ b/example/autoencoder/model.py @@ -22,7 +22,7 @@ import numpy as np try: import cPickle as pickle -except ModuleNotFoundError: +except ImportError: import pickle diff --git a/example/deep-embedded-clustering/model.py b/example/deep-embedded-clustering/model.py index 777634e3cf88..9b6185c9fd18 100644 --- a/example/deep-embedded-clustering/model.py +++ b/example/deep-embedded-clustering/model.py @@ -22,7 +22,7 @@ import numpy as np try: import cPickle as pickle -except ModuleNotFoundError: +except ImportError: import pickle @@ -75,4 +75,4 @@ def load(self, fname): self.args[key][:] = v def setup(self, *args, **kwargs): - raise NotImplementedError("must override this") \ No newline at end of file + raise NotImplementedError("must override this") diff --git a/example/fcn-xs/image_segmentaion.py b/example/fcn-xs/image_segmentaion.py index 75df2d128a22..562db14d36be 100644 --- a/example/fcn-xs/image_segmentaion.py +++ b/example/fcn-xs/image_segmentaion.py @@ -93,8 +93,8 @@ def main(): model_prefix = "FCN8s_VGG16" epoch = 19 - # By default, MXNet will run on the CPU. Uncomment the line below to execute on the GPU - # ctx = mx.gpu() + # By default, MXNet will run on the CPU. Change to ctx = mx.gpu() to run on GPU. + ctx = mx.cpu() fcnxs, fcnxs_args, fcnxs_auxs = mx.model.load_checkpoint(model_prefix, epoch) fcnxs_args["data"] = mx.nd.array(get_data(args.input), ctx) diff --git a/example/gluon/image_classification.py b/example/gluon/image_classification.py index fe0a346f42dd..44a2afea3681 100644 --- a/example/gluon/image_classification.py +++ b/example/gluon/image_classification.py @@ -30,7 +30,8 @@ from mxnet.metric import Accuracy, TopKAccuracy, CompositeEvalMetric import numpy as np -from data import * +from data import (get_cifar10_iterator, get_imagenet_iterator, + get_caltech101_iterator, dummy_iterator) # logging logging.basicConfig(level=logging.INFO) diff --git a/example/neural-style/end_to_end/model_vgg19.py b/example/neural-style/end_to_end/model_vgg19.py index 0d369ae08f58..1bc38766beb4 100644 --- a/example/neural-style/end_to_end/model_vgg19.py +++ b/example/neural-style/end_to_end/model_vgg19.py @@ -90,6 +90,7 @@ def get_executor_with_style(style, content, input_size, ctx): arg_dict=arg_dict) def get_executor_content(content, input_size, ctx): + out = mx.sym.Group([content]) arg_shapes, output_shapes, aux_shapes = content.infer_shape(data=(1, 3, input_size[0], input_size[1])) arg_names = out.list_arguments() arg_dict = dict(zip(arg_names, [mx.nd.zeros(shape, ctx=ctx) for shape in arg_shapes])) diff --git a/example/profiler/README.md b/example/profiler/README.md index 7d3c42b629d9..1b9279ccf227 100644 --- a/example/profiler/README.md +++ b/example/profiler/README.md @@ -5,8 +5,12 @@ Please refer to [this link](http://mxnet.incubator.apache.org/faq/perf.html?high for visualizing profiling results and make sure that you have installed a version of MXNet compiled with `USE_PROFILER=1`. -- profiler_executor.py. To run this example, simply type `python profiler_executor.py` in terminal. -It will generate a json file named `profile_executor_5iter.json`. 
+- profiler_executor.py. To run this example, + - clone mxnet-memonger (`git clone https://github.com/dmlc/mxnet-memonger.git`), + - add the mxnet-memonger folder to PYTHONPATH: + `export PYTHONPATH=$PYTHONPATH:/path/to/mxnet-memonger` + - type `python profiler_executor.py` in terminal. + It will generate a json file named `profile_executor_5iter.json`. - profiler_imageiter.py. You first need to create a file named `test.rec`, which is an image dataset file before running this example. @@ -20,4 +24,4 @@ that you have installed a GPU enabled version of MXNet before running this examp `python profiler_matmul.py` and it will generate `profile_matmul_20iter.json`. - profiler_ndarray.py. This examples profiles a series of `NDArray` operations. Simply type -`python profiler_ndarray.py` in terminal and it will generate `profile_ndarray.json`. \ No newline at end of file +`python profiler_ndarray.py` in terminal and it will generate `profile_ndarray.json`. diff --git a/example/profiler/profiler_executor.py b/example/profiler/profiler_executor.py index 8ab417a97442..91532535bd05 100644 --- a/example/profiler/profiler_executor.py +++ b/example/profiler/profiler_executor.py @@ -21,6 +21,7 @@ import time import numpy as np from mxnet import profiler +import memonger def parse_args(): @@ -86,7 +87,8 @@ def get_symbol(): def get_module(ctx, sym, provide_data, provide_label, batch_size=None, is_train=True, use_memonger=False): if use_memonger: - sym = search_plan(sym, data=data_shapes) + name, data_shapes = provide_data[0] + sym = memonger.search_plan(sym, data=data_shapes) mod = mx.mod.Module(symbol=sym, data_names=[name for name, _ in provide_data], label_names=[name for name, _ in provide_label], diff --git a/example/reinforcement-learning/a3c/a3c.py b/example/reinforcement-learning/a3c/a3c.py index f74ce77b652d..c100f61304d7 100644 --- a/example/reinforcement-learning/a3c/a3c.py +++ b/example/reinforcement-learning/a3c/a3c.py @@ -203,7 +203,7 @@ def test(): mx.gpu(int(i)) for i in args.gpus.split(',')] # module - dataiter = robo_data.RobosimsDataIter('scenes', args.batch_size, args.input_length, web_viz=True) + dataiter = rl_data.GymDataIter('scenes', args.batch_size, args.input_length, web_viz=True) print(dataiter.provide_data) net = sym.get_symbol_thor(dataiter.act_dim) module = mx.mod.Module(net, data_names=[d[0] for d in dataiter.provide_data], label_names=('policy_label', 'value_label'), context=devs) diff --git a/example/reinforcement-learning/dqn/atari_game.py b/example/reinforcement-learning/dqn/atari_game.py index 43c298a73891..96de65f9047b 100644 --- a/example/reinforcement-learning/dqn/atari_game.py +++ b/example/reinforcement-learning/dqn/atari_game.py @@ -22,7 +22,7 @@ import cv2 import logging import os -from utils import * +from utils import get_numpy_rng from replay_memory import ReplayMemory from game import Game from game import DEFAULT_MAX_EPISODE_STEP diff --git a/example/reinforcement-learning/dqn/base.py b/example/reinforcement-learning/dqn/base.py index f3cd962ef5bf..bd78b4b6ac51 100644 --- a/example/reinforcement-learning/dqn/base.py +++ b/example/reinforcement-learning/dqn/base.py @@ -23,8 +23,9 @@ import os import pickle from collections import OrderedDict +from utils import (get_bucket_key, save_params, + save_misc, load_params) import logging -from utils import * logger = logging.getLogger(__name__) diff --git a/example/reinforcement-learning/dqn/dqn_demo.py b/example/reinforcement-learning/dqn/dqn_demo.py index aef44f87ebf4..7462bb67b42a 100755 ---
a/example/reinforcement-learning/dqn/dqn_demo.py +++ b/example/reinforcement-learning/dqn/dqn_demo.py @@ -23,9 +23,11 @@ from base import Base from operators import * from atari_game import AtariGame -from utils import * +from utils import get_numpy_rng, parse_ctx import logging import argparse +import sys +import time root = logging.getLogger() root.setLevel(logging.DEBUG) diff --git a/example/reinforcement-learning/dqn/dqn_run_test.py b/example/reinforcement-learning/dqn/dqn_run_test.py index e8f36b979767..1e02f3ae860c 100755 --- a/example/reinforcement-learning/dqn/dqn_run_test.py +++ b/example/reinforcement-learning/dqn/dqn_run_test.py @@ -28,7 +28,7 @@ import sys from base import Base from atari_game import AtariGame -from utils import * +from utils import get_numpy_rng from operators import * root = logging.getLogger() diff --git a/example/reinforcement-learning/dqn/replay_memory.py b/example/reinforcement-learning/dqn/replay_memory.py index 02691a01888a..af3efea0ba54 100644 --- a/example/reinforcement-learning/dqn/replay_memory.py +++ b/example/reinforcement-learning/dqn/replay_memory.py @@ -21,7 +21,7 @@ import mxnet.ndarray as nd import numpy import copy -from utils import * +from utils import get_numpy_rng class ReplayMemory(object): diff --git a/example/rnn/word_lm/train.py b/example/rnn/word_lm/train.py index aa641358cf90..c48fe800ce9a 100644 --- a/example/rnn/word_lm/train.py +++ b/example/rnn/word_lm/train.py @@ -20,7 +20,7 @@ import argparse, math import logging from data import Corpus, CorpusIter -from model import * +from model import rnn, softmax_ce_loss from module import * from mxnet.model import BatchEndParam diff --git a/example/sparse/factorization_machine/metric.py b/example/sparse/factorization_machine/metric.py index 05ef04a0c48e..a8c52c781c0f 100644 --- a/example/sparse/factorization_machine/metric.py +++ b/example/sparse/factorization_machine/metric.py @@ -107,7 +107,9 @@ def update(self, labels, preds): label_sum = label_weight.sum() if label_sum == 0 or label_sum == label_weight.size: raise Exception("AUC with one class is undefined") - + + label_one_num = np.count_nonzero(label_weight) + label_zero_num = len(label_weight) - label_one_num total_area = label_zero_num * label_one_num height = 0 width = 0 diff --git a/example/sparse/factorization_machine/train.py b/example/sparse/factorization_machine/train.py index c73ce88d41c2..b30f9cc81acf 100644 --- a/example/sparse/factorization_machine/train.py +++ b/example/sparse/factorization_machine/train.py @@ -18,7 +18,7 @@ import mxnet as mx from metric import * from mxnet.test_utils import * -from model import * +from model import factorization_machine_model import argparse, os parser = argparse.ArgumentParser(description="Run factorization machine with criteo dataset", diff --git a/example/sparse/wide_deep/train.py b/example/sparse/wide_deep/train.py index 89befb5aa82d..6fd81b7fa480 100644 --- a/example/sparse/wide_deep/train.py +++ b/example/sparse/wide_deep/train.py @@ -17,8 +17,8 @@ import mxnet as mx from mxnet.test_utils import * -from data import * -from model import * +from data import get_uci_adult +from model import wide_deep_model import argparse import os diff --git a/example/ssd/dataset/pycocotools/coco.py b/example/ssd/dataset/pycocotools/coco.py index 19a7b8b7f64d..470f086f0b02 100755 --- a/example/ssd/dataset/pycocotools/coco.py +++ b/example/ssd/dataset/pycocotools/coco.py @@ -255,22 +255,7 @@ def showAnns(self, anns): color.append(c) else: # mask - t = self.imgs[ann['image_id']] - if 
type(ann['segmentation']['counts']) == list: - # rle = maskUtils.frPyObjects([ann['segmentation']], t['height'], t['width']) - raise NotImplementedError("maskUtils disabled!") - else: - rle = [ann['segmentation']] - # m = maskUtils.decode(rle) raise NotImplementedError("maskUtils disabled!") - img = np.ones( (m.shape[0], m.shape[1], 3) ) - if ann['iscrowd'] == 1: - color_mask = np.array([2.0,166.0,101.0])/255 - if ann['iscrowd'] == 0: - color_mask = np.random.random((1, 3)).tolist()[0] - for i in range(3): - img[:,:,i] = color_mask[i] - ax.imshow(np.dstack( (img, m*0.5) )) if 'keypoints' in ann and type(ann['keypoints']) == list: # turn skeleton into zero-based index sks = np.array(self.loadCats(ann['category_id'])[0]['skeleton'])-1 @@ -430,6 +415,4 @@ def annToMask(self, ann): :return: binary mask (numpy 2D array) """ rle = self.annToRLE(ann) - # m = maskUtils.decode(rle) raise NotImplementedError("maskUtils disabled!") - return m diff --git a/example/ssd/symbol/common.py b/example/ssd/symbol/common.py index 4a0458f87288..a2fb4e69d18c 100644 --- a/example/ssd/symbol/common.py +++ b/example/ssd/symbol/common.py @@ -206,6 +206,9 @@ def multibox_layer(from_layers, num_classes, sizes=[.2, .95], assert sizes[0] > 0 and sizes[0] < 1 assert sizes[1] > 0 and sizes[1] < 1 and sizes[1] > sizes[0] tmp = np.linspace(sizes[0], sizes[1], num=(len(from_layers)-1)) + # Ref for start_offset value: + # https://arxiv.org/abs/1512.02325 + start_offset = 0.1 min_sizes = [start_offset] + tmp.tolist() max_sizes = tmp.tolist() + [tmp[-1]+start_offset] sizes = zip(min_sizes, max_sizes) diff --git a/include/mxnet/base.h b/include/mxnet/base.h index a652fe5b7073..75784a391b47 100644 --- a/include/mxnet/base.h +++ b/include/mxnet/base.h @@ -222,6 +222,14 @@ struct Context { * \return The number of GPUs that are available. */ inline static int32_t GetGPUCount(); + /*! + * \brief get the free and total available memory on a GPU + * \param dev the GPU number to query + * \param free_mem pointer to the integer holding free GPU memory + * \param total_mem pointer to the integer holding total GPU memory + * \return No return value + */ + inline static void GetGPUMemoryInformation(int dev, int *free, int *total); /*! * Create a pinned CPU context. * \param dev_id the device id for corresponding GPU. @@ -326,6 +334,35 @@ inline int32_t Context::GetGPUCount() { #endif } +inline void Context::GetGPUMemoryInformation(int dev, int *free_mem, + int *total_mem) { +#if MXNET_USE_CUDA + + size_t memF, memT; + cudaError_t e; + + int curDevice; + e = cudaGetDevice(&curDevice); + CHECK_EQ(e, cudaSuccess) << " CUDA: " << cudaGetErrorString(e); + + e = cudaSetDevice(dev); + CHECK_EQ(e, cudaSuccess) << " CUDA: " << cudaGetErrorString(e); + + e = cudaMemGetInfo(&memF, &memT); + CHECK_EQ(e, cudaSuccess) << " CUDA: " << cudaGetErrorString(e); + + e = cudaSetDevice(curDevice); + CHECK_EQ(e, cudaSuccess) << " CUDA: " << cudaGetErrorString(e); + + *free_mem = static_cast<int>(memF); + *total_mem = static_cast<int>(memT); + +#else + LOG(FATAL) + << "This call is only supported for MXNet built with CUDA support."; +#endif +} + inline Context Context::FromString(const std::string& str) { Context ret; try { diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h index 6bbe9dfe8f0a..00439962a944 100644 --- a/include/mxnet/c_api.h +++ b/include/mxnet/c_api.h @@ -437,6 +437,15 @@ MXNET_DLL int MXEngineSetBulkSize(int bulk_size, int* prev_bulk_size); */ MXNET_DLL int MXGetGPUCount(int* out); +/*!
+ * \brief get the free and total available memory on a GPU + * \param dev the GPU number to query + * \param free_mem pointer to the integer holding free GPU memory + * \param total_mem pointer to the integer holding total GPU memory + * \return 0 when success, -1 when failure happens + */ +MXNET_DLL int MXGetGPUMemoryInformation(int dev, int *free_mem, int *total_mem); + /*! * \brief get the MXNet library version as an integer * \param pointer to the integer holding the version number @@ -1761,6 +1770,13 @@ MXNET_DLL int MXExecutorReshape(int partial_shaping, NDArrayHandle** aux_states, ExecutorHandle shared_exec, ExecutorHandle *out); + +/*! + * \brief get optimized graph from graph executor + */ +MXNET_DLL int MXExecutorGetOptimizedSymbol(ExecutorHandle handle, + SymbolHandle *out); + /*! * \brief set a call back to notify the completion of operation */ diff --git a/include/mxnet/executor.h b/include/mxnet/executor.h index 842653f86537..0ab04b86a0a1 100644 --- a/include/mxnet/executor.h +++ b/include/mxnet/executor.h @@ -166,6 +166,7 @@ class Executor { std::unordered_map* shared_data_arrays = nullptr, Executor* shared_exec = nullptr); + /*! * \brief the prototype of user-defined monitor callback */ diff --git a/perl-package/AI-MXNet-Gluon-Contrib/Changes b/perl-package/AI-MXNet-Gluon-Contrib/Changes index 7b3b27a3722c..81e55aa753ab 100644 --- a/perl-package/AI-MXNet-Gluon-Contrib/Changes +++ b/perl-package/AI-MXNet-Gluon-Contrib/Changes @@ -1,5 +1,11 @@ Revision history for Perl extension AI::MXNet::Gluon::Contrib +1.32 Sun Jul 15 12:12:15 PDT 2018 + - Missing POD fixes. + +1.31 Sat Jul 14 08:33:21 PDT 2018 + - Fixed CPAN indexing issue. + 1.3 Tue Jul 10 21:19:13 PDT 2018 - Initial release diff --git a/perl-package/AI-MXNet-Gluon-Contrib/META.json b/perl-package/AI-MXNet-Gluon-Contrib/META.json index 52c32309879d..ec65bb01348e 100644 --- a/perl-package/AI-MXNet-Gluon-Contrib/META.json +++ b/perl-package/AI-MXNet-Gluon-Contrib/META.json @@ -38,5 +38,5 @@ } }, "release_status" : "stable", - "version" : "1.3" + "version" : "1.32" } diff --git a/perl-package/AI-MXNet-Gluon-Contrib/META.yml b/perl-package/AI-MXNet-Gluon-Contrib/META.yml index b059f0f51bfd..aaa194debae9 100644 --- a/perl-package/AI-MXNet-Gluon-Contrib/META.yml +++ b/perl-package/AI-MXNet-Gluon-Contrib/META.yml @@ -18,4 +18,4 @@ no_index: - inc requires: AI::MXNet: '1.31' -version: '1.3' +version: '1.32' diff --git a/perl-package/AI-MXNet-Gluon-Contrib/Makefile.PL b/perl-package/AI-MXNet-Gluon-Contrib/Makefile.PL index b27d59b3b558..6c58d6ea8669 100644 --- a/perl-package/AI-MXNet-Gluon-Contrib/Makefile.PL +++ b/perl-package/AI-MXNet-Gluon-Contrib/Makefile.PL @@ -39,7 +39,7 @@ my %WriteMakefileArgs = ( "AI::MXNet" => "1.31", }, "TEST_REQUIRES" => {}, - "VERSION" => "1.3", + "VERSION" => "1.32", "test" => { "TESTS" => "t/*.t" } diff --git a/perl-package/AI-MXNet-Gluon-Contrib/README b/perl-package/AI-MXNet-Gluon-Contrib/README index 1481c3e66c1a..6c0efcc3c897 100644 --- a/perl-package/AI-MXNet-Gluon-Contrib/README +++ b/perl-package/AI-MXNet-Gluon-Contrib/README @@ -1,5 +1,5 @@ This archive contains the distribution AI-MXNet-Gluon-Contrib, -version 1.3: +version 1.32: Perl interface to MXNet Gluon Contib modules, a collection of supplemental Gluon blocks. 
diff --git a/perl-package/AI-MXNet-Gluon-Contrib/lib/AI/MXNet/Gluon/Contrib.pm b/perl-package/AI-MXNet-Gluon-Contrib/lib/AI/MXNet/Gluon/Contrib.pm index f88fb8a7b595..029bc4b65a68 100644 --- a/perl-package/AI-MXNet-Gluon-Contrib/lib/AI/MXNet/Gluon/Contrib.pm +++ b/perl-package/AI-MXNet-Gluon-Contrib/lib/AI/MXNet/Gluon/Contrib.pm @@ -15,10 +15,15 @@ # specific language governing permissions and limitations # under the License. +package AI::MXNet::Gluon::Contrib; use strict; use warnings; use AI::MXNet; use AI::MXNet::Gluon::Contrib::NN::BasicLayers; -our $VERSION = '1.3'; +our $VERSION = '1.32'; +=head1 NAME + + AI::MXNet::Gluon::Contrib - A collection of supplemental Gluon blocks. +=cut 1; \ No newline at end of file diff --git a/perl-package/AI-MXNet-Gluon-Contrib/lib/AI/MXNet/Gluon/Contrib/NN/BasicLayers.pm b/perl-package/AI-MXNet-Gluon-Contrib/lib/AI/MXNet/Gluon/Contrib/NN/BasicLayers.pm index 455284e30483..5f57e031032c 100644 --- a/perl-package/AI-MXNet-Gluon-Contrib/lib/AI/MXNet/Gluon/Contrib/NN/BasicLayers.pm +++ b/perl-package/AI-MXNet-Gluon-Contrib/lib/AI/MXNet/Gluon/Contrib/NN/BasicLayers.pm @@ -17,6 +17,13 @@ use strict; use warnings; +package AI::MXNet::Gluon::Contrib::NN::BasicLayers; + +=head1 NAME + + AI::MXNet::Gluon::Contrib::NN::BasicLayers - An additional collection of Gluon's building blocks. +=cut + use AI::MXNet::Function::Parameters; package AI::MXNet::Gluon::NN::Concurrent; use AI::MXNet::Gluon::Mouse; diff --git a/perl-package/AI-MXNet-Gluon-ModelZoo/Changes b/perl-package/AI-MXNet-Gluon-ModelZoo/Changes index c233f92458db..377dff5be8de 100644 --- a/perl-package/AI-MXNet-Gluon-ModelZoo/Changes +++ b/perl-package/AI-MXNet-Gluon-ModelZoo/Changes @@ -1,5 +1,8 @@ Revision history for Perl extension AI::MXNet::Gluon::ModelZoo +1.32 Sun Aug 5 14:25:31 PDT 2018 + - Updated vgg16/19 models + 1.3 Tue Jul 10 21:19:13 PDT 2018 - Initial release diff --git a/perl-package/AI-MXNet-Gluon-ModelZoo/META.json b/perl-package/AI-MXNet-Gluon-ModelZoo/META.json index c0e1ad3af8ae..9ea969e9f5fb 100644 --- a/perl-package/AI-MXNet-Gluon-ModelZoo/META.json +++ b/perl-package/AI-MXNet-Gluon-ModelZoo/META.json @@ -39,5 +39,5 @@ } }, "release_status" : "stable", - "version" : "1.3" + "version" : "1.32" } diff --git a/perl-package/AI-MXNet-Gluon-ModelZoo/META.yml b/perl-package/AI-MXNet-Gluon-ModelZoo/META.yml index 2493af60bbec..a04484a898a9 100644 --- a/perl-package/AI-MXNet-Gluon-ModelZoo/META.yml +++ b/perl-package/AI-MXNet-Gluon-ModelZoo/META.yml @@ -19,4 +19,4 @@ no_index: requires: AI::MXNet: '1.31' AI::MXNet::Gluon::Contrib: '1.3' -version: '1.3' +version: '1.32' diff --git a/perl-package/AI-MXNet-Gluon-ModelZoo/Makefile.PL b/perl-package/AI-MXNet-Gluon-ModelZoo/Makefile.PL index 8427aef3dbc4..d15dfce99b8e 100644 --- a/perl-package/AI-MXNet-Gluon-ModelZoo/Makefile.PL +++ b/perl-package/AI-MXNet-Gluon-ModelZoo/Makefile.PL @@ -40,7 +40,7 @@ my %WriteMakefileArgs = ( "AI::MXNet::Gluon::Contrib" => "1.3" }, "TEST_REQUIRES" => {}, - "VERSION" => "1.3", + "VERSION" => "1.32", "test" => { "TESTS" => "t/*.t" } diff --git a/perl-package/AI-MXNet-Gluon-ModelZoo/README b/perl-package/AI-MXNet-Gluon-ModelZoo/README index d6d697292dbe..6b8e04b971ec 100644 --- a/perl-package/AI-MXNet-Gluon-ModelZoo/README +++ b/perl-package/AI-MXNet-Gluon-ModelZoo/README @@ -1,5 +1,5 @@ This archive contains the distribution AI-MXNet-Gluon-ModelZoo, -version 1.3: +version 1.32: Perl interface to MXNet Gluon ModelZoo, a collection of pretrained machine learning models for computer vision. 
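A minimal sketch of pulling one of the updated VGG models through the Perl model zoo (get_model is the export visible in the ModelZoo.pm diff below; the 'vgg16' name matches the refreshed sha1 table, and the call form mirrors the Python get_model API, so treat it as illustrative):

    use AI::MXNet::Gluon::ModelZoo qw(get_model);
    # Downloads the updated pretrained weights on first use.
    my $vgg16 = get_model('vgg16', pretrained => 1);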
diff --git a/perl-package/AI-MXNet-Gluon-ModelZoo/lib/AI/MXNet/Gluon/ModelZoo.pm b/perl-package/AI-MXNet-Gluon-ModelZoo/lib/AI/MXNet/Gluon/ModelZoo.pm index 64ccd4601cf4..c9e6e7753045 100644 --- a/perl-package/AI-MXNet-Gluon-ModelZoo/lib/AI/MXNet/Gluon/ModelZoo.pm +++ b/perl-package/AI-MXNet-Gluon-ModelZoo/lib/AI/MXNet/Gluon/ModelZoo.pm @@ -26,7 +26,7 @@ use AI::MXNet::Gluon::ModelZoo::Vision; use Exporter; use base qw(Exporter); @AI::MXNet::Gluon::ModelZoo::EXPORT_OK = qw(get_model); -our $VERSION = '1.3'; +our $VERSION = '1.32'; =head1 NAME diff --git a/perl-package/AI-MXNet-Gluon-ModelZoo/lib/AI/MXNet/Gluon/ModelZoo/ModelStore.pm b/perl-package/AI-MXNet-Gluon-ModelZoo/lib/AI/MXNet/Gluon/ModelZoo/ModelStore.pm index 9269ee735663..bb258b4d9cdf 100644 --- a/perl-package/AI-MXNet-Gluon-ModelZoo/lib/AI/MXNet/Gluon/ModelZoo/ModelStore.pm +++ b/perl-package/AI-MXNet-Gluon-ModelZoo/lib/AI/MXNet/Gluon/ModelZoo/ModelStore.pm @@ -60,10 +60,10 @@ my %_model_sha1 = map { $_->[1] => $_->[0] } ( ['ee79a8098a91fbe05b7a973fed2017a6117723a8', 'vgg11_bn'], ['6bc5de58a05a5e2e7f493e2d75a580d83efde38c', 'vgg13'], ['7d97a06c3c7a1aecc88b6e7385c2b373a249e95e', 'vgg13_bn'], - ['649467530119c0f78c4859999e264e7bf14471a9', 'vgg16'], - ['6b9dbe6194e5bfed30fd7a7c9a71f7e5a276cb14', 'vgg16_bn'], - ['f713436691eee9a20d70a145ce0d53ed24bf7399', 'vgg19'], - ['9730961c9cea43fd7eeefb00d792e386c45847d6', 'vgg19_bn'] + ['e660d4569ccb679ec68f1fd3cce07a387252a90a', 'vgg16'], + ['7f01cf050d357127a73826045c245041b0df7363', 'vgg16_bn'], + ['ad2f660d101905472b83590b59708b71ea22b2e5', 'vgg19'], + ['f360b758e856f1074a85abd5fd873ed1d98297c3', 'vgg19_bn'] ); my $apache_repo_url = 'http://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/'; diff --git a/perl-package/AI-MXNet/Changes b/perl-package/AI-MXNet/Changes index b522759529a0..8b9463ee84e8 100644 --- a/perl-package/AI-MXNet/Changes +++ b/perl-package/AI-MXNet/Changes @@ -1,4 +1,10 @@ Revision history for Perl extension AI::MXNet + +1.32 Sun Aug 5 14:25:31 PDT 2018 + - Several new metric classes + - Expanded documentation + - Bugfixes. 
+ 1.31 Tue Jul 10 21:19:13 PDT 2018 - Memory leak fix for Gluon API - Added summary function for Gluon models diff --git a/perl-package/AI-MXNet/META.json b/perl-package/AI-MXNet/META.json index a43f77d3662a..7d0ab96c0593 100644 --- a/perl-package/AI-MXNet/META.json +++ b/perl-package/AI-MXNet/META.json @@ -30,7 +30,7 @@ }, "runtime" : { "requires" : { - "AI::MXNetCAPI" : "1.3", + "AI::MXNetCAPI" : "1.32", "AI::NNVMCAPI" : "1.3", "Function::Parameters" : "1.0705", "Hash::Ordered" : "0.012", @@ -45,5 +45,5 @@ } }, "release_status" : "stable", - "version" : "1.31" + "version" : "1.32" } diff --git a/perl-package/AI-MXNet/META.yml b/perl-package/AI-MXNet/META.yml index 642f370ee81f..ee5d677a8139 100644 --- a/perl-package/AI-MXNet/META.yml +++ b/perl-package/AI-MXNet/META.yml @@ -17,7 +17,7 @@ no_index: - t - inc requires: - AI::MXNetCAPI: '1.3' + AI::MXNetCAPI: '1.32' AI::NNVMCAPI: '1.3' Function::Parameters: '1.0705' Hash::Ordered: '0.012' @@ -25,4 +25,4 @@ requires: Mouse: v2.1.0 PDL: '2.007' PDL::CCS: '1.23.4' -version: '1.31' +version: '1.32' diff --git a/perl-package/AI-MXNet/Makefile.PL b/perl-package/AI-MXNet/Makefile.PL index f8f0d9c63fe0..59036d905f82 100644 --- a/perl-package/AI-MXNet/Makefile.PL +++ b/perl-package/AI-MXNet/Makefile.PL @@ -46,7 +46,7 @@ my %WriteMakefileArgs = ( "GraphViz" => "2.14" }, "TEST_REQUIRES" => {}, - "VERSION" => "1.31", + "VERSION" => "1.32", "test" => { "TESTS" => "t/*.t" } diff --git a/perl-package/AI-MXNet/README b/perl-package/AI-MXNet/README index e34970a79723..2f1010a43f9a 100644 --- a/perl-package/AI-MXNet/README +++ b/perl-package/AI-MXNet/README @@ -1,5 +1,5 @@ This archive contains the distribution AI-MXNet, -version 1.31: +version 1.32: Perl interface to MXNet machine learning library diff --git a/perl-package/AI-MXNet/lib/AI/MXNet.pm b/perl-package/AI-MXNet/lib/AI/MXNet.pm index 4e40fd7298b2..651ca92ad69a 100644 --- a/perl-package/AI-MXNet/lib/AI/MXNet.pm +++ b/perl-package/AI-MXNet/lib/AI/MXNet.pm @@ -51,7 +51,7 @@ use AI::MXNet::Gluon; use AI::MXNet::NDArray::Sparse; use AI::MXNet::Symbol::Sparse; use AI::MXNet::Engine; -our $VERSION = '1.31'; +our $VERSION = '1.32'; sub import { @@ -132,7 +132,7 @@ AI::MXNet - Perl interface to MXNet machine learning library ## Convolutional NN for recognizing hand-written digits in MNIST dataset ## It's considered "Hello, World" for Neural Networks - ## For more info about the MNIST problem please refer to http://neuralnetworksanddeeplearning.com/chap1.html + ## For more info about the MNIST problem please refer to L<http://neuralnetworksanddeeplearning.com/chap1.html> use strict; use warnings; @@ -187,9 +187,104 @@ AI::MXNet - Perl interface to MXNet machine learning library my $res = $mod->score($val_dataiter, mx->metric->create('acc')); ok($res->{accuracy} > 0.8); + ## Gluon MNIST example + + my $net = nn->Sequential(); + $net->name_scope(sub { + $net->add(nn->Dense(128, activation=>'relu')); + $net->add(nn->Dense(64, activation=>'relu')); + $net->add(nn->Dense(10)); + }); + $net->hybridize; + + # data + sub transformer + { + my ($data, $label) = @_; + $data = $data->reshape([-1])->astype('float32')/255; + return ($data, $label); + } + my $train_data = gluon->data->DataLoader( + gluon->data->vision->MNIST('./data', train=>1, transform => \&transformer), + batch_size=>$batch_size, shuffle=>1, last_batch=>'discard' + ); + + ## training + sub train + { + my ($epochs, $ctx) = @_; + # Collect all parameters from net and its children, then initialize them.
+ $net->initialize(mx->init->Xavier(magnitude=>2.24), ctx=>$ctx); + # Trainer is for updating parameters with gradient. + my $trainer = gluon->Trainer($net->collect_params(), 'sgd', { learning_rate => $lr, momentum => $momentum }); + my $metric = mx->metric->Accuracy(); + my $loss = gluon->loss->SoftmaxCrossEntropyLoss(); + + for my $epoch (0..$epochs-1) + { + # reset data iterator and metric at beginning of epoch. + $metric->reset(); + enumerate(sub { + my ($i, $d) = @_; + my ($data, $label) = @$d; + $data = $data->as_in_context($ctx); + $label = $label->as_in_context($ctx); + # Start recording computation graph with record() section. + # Recorded graphs can then be differentiated with backward. + my $output; + autograd->record(sub { + $output = $net->($data); + my $L = $loss->($output, $label); + $L->backward; + }); + # take a gradient step with batch_size equal to data.shape[0] + $trainer->step($data->shape->[0]); + # update metric at last. + $metric->update([$label], [$output]); + + if($i % $log_interval == 0 and $i > 0) + { + my ($name, $acc) = $metric->get(); + print "[Epoch $epoch Batch $i] Training: $name=$acc\n"; + } + }, \@{ $train_data }); + + my ($name, $acc) = $metric->get(); + print "[Epoch $epoch] Training: $name=$acc\n"; + + my ($val_name, $val_acc) = test($ctx); + print "[Epoch $epoch] Validation: $val_name=$val_acc\n" + } + $net->save_parameters('mnist.params'); + } + + train($epochs, $cuda ? mx->gpu(0) : mx->cpu); + =head1 DESCRIPTION Perl interface to MXNet machine learning library. + MXNet supports the Perl programming language. + The MXNet Perl package brings flexible and efficient GPU computing and + state-of-the-art deep learning to Perl. + It enables you to write seamless tensor/matrix computation with multiple GPUs in Perl. + It also lets you construct and customize state-of-the-art deep learning models in Perl, + and apply them to tasks such as image classification and data science challenges. + + One important thing to internalize is that the Perl interface is written to be as close as possible to the Python API, + so most, if not all, of Python's documentation and examples should just work in Perl after a few changes + in order to make the code a bit more Perlish. In a nutshell, just add $ sigils and replace . = \n with -> => ; + and in 99% of cases that's all that is needed (see the short translation sketch below). + In addition, please refer to the very detailed MXNet Python API documentation at L<http://mxnet.io/>. + + AI::MXNet supports the new imperative, PyTorch-like Gluon MXNet interface. + Please get acquainted with this new interface at L<AI::MXNet::Gluon>. + + For specific Perl Gluon usage please refer to the Perl examples and tests directories on GitHub, + but be assured that the Python and Perl usage are extremely close, in order to make the use + of the Python Gluon docs and examples as easy as possible. + + AI::MXNet is seamlessly glued with L<PDL>: the C++ level state can be easily initialized from PDL + and the results can be transferred back to PDL objects, allowing you to use all the glory and power of the PDL!
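+
+ For example, a minimal translation sketch of that rule ($model and $train_iter are hypothetical variables, not lifted from the docs):
+
+     ## Python: model.fit(train_iter, num_epoch=10)
+     ## Perl:
+     $model->fit($train_iter, num_epoch => 10);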
=head1 BUGS AND INCOMPATIBILITIES @@ -198,9 +293,9 @@ AI::MXNet - Perl interface to MXNet machine learning library =head1 SEE ALSO - http://mxnet.io/ - https://github.com/dmlc/mxnet/tree/master/perl-package - Function::Parameters, Mouse + L<http://mxnet.io/> + L<https://github.com/dmlc/mxnet/tree/master/perl-package> + L<Function::Parameters>, L<Mouse> =head1 AUTHOR @@ -208,6 +303,6 @@ AI::MXNet - Perl interface to MXNet machine learning library =head1 COPYRIGHT & LICENSE - This library is licensed under Apache 2.0 license https://www.apache.org/licenses/LICENSE-2.0 + This library is licensed under the Apache 2.0 license, L<https://www.apache.org/licenses/LICENSE-2.0> =cut diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/AutoGrad.pm b/perl-package/AI-MXNet/lib/AI/MXNet/AutoGrad.pm index 160cd968f95b..c1e5f06e12bd 100644 --- a/perl-package/AI-MXNet/lib/AI/MXNet/AutoGrad.pm +++ b/perl-package/AI-MXNet/lib/AI/MXNet/AutoGrad.pm @@ -46,16 +46,38 @@ EOP AI::MXNet::AutoGrad - Autograd for NDArray. =cut +=head1 DESCRIPTION + + Automatic differentiation of gradients for dynamic graphs, primarily used with Gluon. + +=cut + +=head1 SYNOPSIS + + use AI::MXNet qw(mx); + my $x = mx->nd->ones([1]); + $x->attach_grad; + my $z; + mx->autograd->record(sub { + $z = mx->nd->elemwise_add($x->exp, $x); + }); + my $dx = mx->autograd->grad($z, $x, create_graph=>1); + ok(abs($dx->asscalar - 3.71828175) < 1e-7); + $dx->backward; + ok(abs($x->grad->asscalar - 2.71828175) < 1e-7); + +=cut + =head2 set_is_training Set status to training/not training. When training, graph will be constructed - for gradient computation. Operators will also run with ctx.is_train=True. For example, + for gradient computation. Operators will also run with $is_train=1. For example, Dropout will drop inputs randomly when is_train=True while simply passing through - if is_train=False. + if $is_train=0. Parameters ---------- - is_train: bool + $is_train: Bool Returns ------- @@ -75,7 +97,7 @@ method set_is_training(Bool $is_train) Parameters ---------- - is_recoding: bool + $is_recording: Bool Returns ------- @@ -163,9 +185,9 @@ method mark_variables( Output NDArray(s) :$head_grads=: Maybe[AI::MXNet::NDArray|ArrayRef[AI::MXNet::NDArray|Undef]] Gradients with respect to heads. - :$retain_graph=0: bool, optional + :$retain_graph=0: Bool, optional Whether to retain graph. - :$train_mode=1: bool, optional + :$train_mode=1: Bool, optional Whether to do backward for training or predicting. =cut method backward( @@ -196,11 +218,11 @@ method backward( Parameters ---------- - outputs: array ref of NDArray + outputs: ArrayRef[AI::MXNet::NDArray] Returns ------- - gradients: array ref of NDArray + gradients: ArrayRef[AI::MXNet::NDArray] =cut @@ -215,14 +237,14 @@ method compute_gradient(ArrayRef[AI::MXNet::NDArray] $outputs) Parameters ---------- - func: a perl sub + $func: CodeRef The forward (loss) function. - argnum: an int or a array ref of int + $argnum: Maybe[Int|ArrayRef[Int]] The index of argument to calculate gradient for. Returns ------- - grad_and_loss_func: a perl sub + grad_and_loss_func: CodeRef A function that would compute both the gradient of arguments and loss value. =cut @@ -256,29 +278,29 @@ method grad_and_loss(CodeRef $func, Maybe[Int|ArrayRef[Int]] $argnum=) returned as new NDArrays instead of stored into `variable.grad`. Supports recording gradient graph for computing higher order gradients. - .. Note: Currently only a very limited set of operators support higher order + Note: Currently only a very limited set of operators support higher order gradients.
Parameters ---------- - $heads: NDArray or array ref of NDArray + $heads: AI::MXNet::NDArray|ArrayRef[AI::MXNet::NDArray] Output NDArray(s) - $variables: NDArray or list of NDArray + $variables: AI::MXNet::NDArray|ArrayRef[AI::MXNet::NDArray] Input variables to compute gradients for. - :$head_grads=: NDArray or list of NDArray or undef + :$head_grads=: Maybe[AI::MXNet::NDArray|ArrayRef[AI::MXNet::NDArray|Undef]] Gradients with respect to heads. - :$retain_graph=: bool + :$retain_graph=: Bool Whether to keep computation graph to differentiate again, instead of clearing history and release memory. Defaults to the same value as create_graph. - :$create_graph=0: bool - Whether to record gradient graph for computing higher order - $train_mode=1: bool, optional + :$create_graph=0: Bool + Whether to record gradient graph for computing higher order gradients. + :$train_mode=1: Bool, optional Whether to do backward for training or prediction. Returns ------- - NDArray or list of NDArray: + AI::MXNet::NDArray|ArrayRef[AI::MXNet::NDArray]: Gradients with respect to variables. Examples -------- @@ -349,7 +371,7 @@ method grad( Executes $sub within an autograd training scope context. Parameters ---------- - CodeRef $sub: a perl sub + $sub: CodeRef =cut method train_mode(CodeRef $sub) @@ -365,7 +387,7 @@ method train_mode(CodeRef $sub) Executes $sub within an autograd predicting scope context. Parameters ---------- - CodeRef $sub: a perl sub + $sub: CodeRef =cut method predict_mode(CodeRef $sub) @@ -382,8 +404,8 @@ method predict_mode(CodeRef $sub) and captures code that needs gradients to be calculated. Parameters ---------- - CodeRef $sub: a perl sub - Maybe[Bool] :$train_mode=1 + $sub: CodeRef + :$train_mode=1 : Maybe[Bool] =cut method record(CodeRef $sub, Maybe[Bool] :$train_mode=1) @@ -409,8 +431,8 @@ method record(CodeRef $sub, Maybe[Bool] :$train_mode=1) and captures code that needs gradients to be calculated. Parameters ---------- - CodeRef $sub: a perl sub - Maybe[Bool] :$train_mode=0 + $sub: CodeRef + :$train_mode=0 : Maybe[Bool] =cut method pause(CodeRef $sub, Maybe[Bool] :$train_mode=0) @@ -436,11 +458,11 @@ method pause(CodeRef $sub, Maybe[Bool] :$train_mode=0) Parameters ---------- - x : NDArray - Array representing the head of computation graph. + $x : AI::MXNet::NDArray + AI::MXNet::NDArray representing the head of computation graph. Returns ------- - Symbol + AI::MXNet::Symbol The retrieved Symbol. =cut diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Base.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Base.pm index f7daea2a7871..3f6bd8341325 100644 --- a/perl-package/AI-MXNet/lib/AI/MXNet/Base.pm +++ b/perl-package/AI-MXNet/lib/AI/MXNet/Base.pm @@ -21,7 +21,7 @@ use warnings; use PDL; use PDL::Types (); use PDL::CCS::Nd; -use AI::MXNetCAPI 1.3; +use AI::MXNetCAPI 1.32; use AI::NNVMCAPI 1.3; use AI::MXNet::Types; use Time::HiRes; diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/CachedOp.pm b/perl-package/AI-MXNet/lib/AI/MXNet/CachedOp.pm index 27ec6dc0d2a5..7e73ded8ad07 100644 --- a/perl-package/AI-MXNet/lib/AI/MXNet/CachedOp.pm +++ b/perl-package/AI-MXNet/lib/AI/MXNet/CachedOp.pm @@ -22,6 +22,11 @@ package AI::MXNet::CachedOp; AI::MXNet::CachedOp - A wrapper around CachedOpHandle =cut +=head1 DESCRIPTION + + Internal module, used as a part of AI::MXNet::Gluon::HybridBlock.
+=cut + use strict; use warnings; use AI::MXNet::Base; diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Callback.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Callback.pm index da3309700394..b2a0b2948154 100644 --- a/perl-package/AI-MXNet/lib/AI/MXNet/Callback.pm +++ b/perl-package/AI-MXNet/lib/AI/MXNet/Callback.pm @@ -25,7 +25,38 @@ use overload "&{}" => sub { my $self = shift; sub { $self->call(@_) } }; =head1 NAME - AI::MXNet::Callback - A collection of predefined callback functions + AI::MXNet::Callback - A collection of predefined callback functions. +=cut + +=head1 DESCRIPTION + + A collection of predefined callback functions, mainly to be used in AI::MXNet::Module::Base::fit. +=cut + +=head1 SYNOPSIS + + my $model = mx->mod->Module( + symbol => $net, + context => $contexts + ); + $model->fit( + $data_iter, + eval_metric => mx->metric->Perplexity, + kvstore => $kv_store, + optimizer => $optimizer, + optimizer_params => { + learning_rate => $lr, + momentum => $mom, + wd => $wd, + clip_gradient => 5, + rescale_grad => 1/$batch_size, + lr_scheduler => AI::MXNet::FactorScheduler->new(step => 1000, factor => 0.99) + }, + initializer => mx->init->Xavier(factor_type => "in", magnitude => 2.34), + num_epoch => $num_epoch, + batch_end_callback => mx->callback->Speedometer($batch_size, $disp_batches), + ($chkp_epoch ? (epoch_end_callback => [mx->callback->module_checkpoint($model, $chkp_prefix, $chkp_epoch), \&sample]) : ()) + ); =cut =head2 module_checkpoint @@ -36,9 +67,9 @@ use overload "&{}" => sub { my $self = shift; sub { $self->call(@_) } }; ---------- $mod : subclass of AI::MXNet::Module::Base The module to checkpoint. - $prefix : str + $prefix : Str The file prefix to checkpoint to - $period=1 : int + $period=1 : Int How many epochs to wait before checkpointing. Default is 1. $save_optimizer_states=0 : Bool Whether to save optimizer states for later training. diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Context.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Context.pm index e116e6e7a8d1..826e7baf905b 100644 --- a/perl-package/AI-MXNet/lib/AI/MXNet/Context.pm +++ b/perl-package/AI-MXNet/lib/AI/MXNet/Context.pm @@ -78,6 +78,13 @@ use overload This class governs the device context of AI::MXNet::NDArray objects. =cut +=head1 SYNOPSIS + + use AI::MXNet qw(mx); + print nd->array([[1,2],[3,4]], ctx => mx->cpu)->aspdl; + my $arr_gpu = nd->random->uniform(shape => [10, 10], ctx => mx->gpu(0)); +=cut + =head2 Constructing a context. diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Contrib.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Contrib.pm index 9f6a0ab01600..c470acab60e7 100644 --- a/perl-package/AI-MXNet/lib/AI/MXNet/Contrib.pm +++ b/perl-package/AI-MXNet/lib/AI/MXNet/Contrib.pm @@ -21,6 +21,32 @@ use warnings; use AI::MXNet::Contrib::Symbol; use AI::MXNet::Contrib::NDArray; +=head1 NAME + + AI::MXNet::Contrib - An interface to experimental operators defined in C++ space. 
+=cut + +=head1 SYNOPSIS + + my $embed; + if($sparse_embedding) + { + my $embed_weight = mx->sym->Variable('embed_weight', stype=>'row_sparse'); + $embed = mx->sym->contrib->SparseEmbedding( + data=>$data, input_dim=>$num_words, + weight=>$embed_weight, output_dim=>$num_embed, + name=>'embed' + ); + } + else + { + $embed = mx->sym->Embedding( + data=>$data, input_dim=>$num_words, + output_dim=>$num_embed, name=>'embed' + ); + } +=cut + sub sym { 'AI::MXNet::Contrib::Symbol' } sub symbol { 'AI::MXNet::Contrib::Symbol' } sub nd { 'AI::MXNet::Contrib::NDArray' } diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Contrib/NDArray.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Contrib/NDArray.pm index 0c1547e990a5..574ecc443f79 100644 --- a/perl-package/AI-MXNet/lib/AI/MXNet/Contrib/NDArray.pm +++ b/perl-package/AI-MXNet/lib/AI/MXNet/Contrib/NDArray.pm @@ -21,4 +21,14 @@ use warnings; use parent 'AI::MXNet::AutoLoad'; sub config { ('contrib', 'AI::MXNet::NDArray') } +=head1 NAME + + AI::MXNet::Contrib::NDArray - An interface to experimental NDArray operators defined in C++ space. +=cut + +=head1 SYNOPSIS + + mx->contrib->ndarray->fft(nd->random->normal(0, 1, [3, 4], ctx => mx->gpu)); +=cut + 1; diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Contrib/Symbol.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Contrib/Symbol.pm index d84f831c0fa6..d5a041a085fe 100644 --- a/perl-package/AI-MXNet/lib/AI/MXNet/Contrib/Symbol.pm +++ b/perl-package/AI-MXNet/lib/AI/MXNet/Contrib/Symbol.pm @@ -21,4 +21,30 @@ use warnings; use parent 'AI::MXNet::AutoLoad'; sub config { ('contrib', 'AI::MXNet::Symbol') } +=head1 NAME + + AI::MXNet::Contrib::Symbol - An interface to experimental symbol operators defined in C++ space. +=cut + +=head1 SYNOPSIS + + my $embed; + if($sparse_embedding) + { + my $embed_weight = mx->sym->Variable('embed_weight', stype=>'row_sparse'); + $embed = mx->sym->contrib->SparseEmbedding( + data=>$data, input_dim=>$num_words, + weight=>$embed_weight, output_dim=>$num_embed, + name=>'embed' + ); + } + else + { + $embed = mx->sym->Embedding( + data=>$data, input_dim=>$num_words, + output_dim=>$num_embed, name=>'embed' + ); + } +=cut + 1; diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/CudaModule.pm b/perl-package/AI-MXNet/lib/AI/MXNet/CudaModule.pm index 5fa66b26472e..b3272fe8b048 100644 --- a/perl-package/AI-MXNet/lib/AI/MXNet/CudaModule.pm +++ b/perl-package/AI-MXNet/lib/AI/MXNet/CudaModule.pm @@ -34,6 +34,11 @@ our %DTYPE_CPP_TO_STR = qw( int64_t int64 ); +=head1 NAME + + AI::MXNet::CudaModule - Interface to runtime cuda kernel compile module. +=cut + =head1 DESCRIPTION Interface to runtime cuda kernel compile module. @@ -81,12 +86,12 @@ our %DTYPE_CPP_TO_STR = qw( Parameters ---------- - source : str + source : Str Complete source code. - options : array ref of str + options : Str|ArrayRef[Str] Compiler flags. For example, use "-I/usr/local/cuda/include" to add cuda headers to include path. - exports : array ref of str + exports : Str|ArrayRef[Str] Export kernel names. =cut @@ -124,9 +129,9 @@ sub DEMOLISH Parameters ---------- - name : str + $name : Str String name of the kernel. - signature : str + $signature : Str Function signature for the kernel. For example, if a kernel is declared as:: @@ -196,7 +201,7 @@ use AI::MXNet::Base; =head1 NAME - AI::MXNet::CudaKernel + AI::MXNet::CudaKernel - Constructs a CUDA kernel. =cut =head1 DESCRIPTION @@ -228,15 +233,15 @@ sub DEMOLISH Parameters ---------- - $args : array ref of NDArray or numbers + $args : ArrayRef[AI::MXNet::NDArray|Num] List of arguments for kernel.
NDArrays are expected for pointer types (e.g. `float*`, `double*`) while numbers are expected for non-pointer types (e.g. `int`, `float`). $ctx : AI::MXNet::Context The context to launch kernel on. Must be GPU context. - $grid_dims : array ref of 3 integers + $grid_dims : array ref of 3 integers (CudaKernelShape) Grid dimensions for CUDA kernel. - $block_dims : array ref of 3 integers + $block_dims : array ref of 3 integers (CudaKernelShape) Block dimensions for CUDA kernel. $shared_mem=0 : integer, optional Size of dynamically allocated shared memory. Defaults to 0. diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Engine.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Engine.pm index c4ee262dfc96..1d73e5584268 100644 --- a/perl-package/AI-MXNet/lib/AI/MXNet/Engine.pm +++ b/perl-package/AI-MXNet/lib/AI/MXNet/Engine.pm @@ -20,9 +20,28 @@ use strict; use warnings; use AI::MXNet::Function::Parameters; use AI::MXNet::Base; + =head1 NAME - AI::MXNet::Engine - Engine properties management. + AI::MXNet::Engine - Allows management of properties of the MXNet engine. +=cut + +=head1 SYNOPSIS + + my $x; + mx->engine->bulk(10, sub { + $x = mx->nd->ones([10]); + $x *= 2; + $x += 1; + $x->wait_to_read(); + $x += 1; + ok(($x->aspdl == 4)->all); + for my $i (1..100) + { + $x += 1; + } + }); + ok(($x->aspdl == 104)->all); =cut =head2 set_bulk_size diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Executor.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Executor.pm index edcaabea1f47..573abbf588f2 100644 --- a/perl-package/AI-MXNet/lib/AI/MXNet/Executor.pm +++ b/perl-package/AI-MXNet/lib/AI/MXNet/Executor.pm @@ -32,7 +32,6 @@ has '_symbol' => (is => 'rw', init_arg => 'symbol', isa => 'AI::MXN has '_ctx' => (is => 'rw', init_arg => 'ctx', isa => 'AI::MXNet::Context' ); has '_grad_req' => (is => 'rw', init_arg => 'grad_req', isa => 'Maybe[Str|ArrayRef[Str]|HashRef[Str]]'); has '_group2ctx' => (is => 'rw', init_arg => 'group2ctx', isa => 'Maybe[HashRef[AI::MXNet::Context]]'); -has '_monitor_callback' => (is => 'rw', isa => 'CodeRef'); has [qw/_arg_dict _grad_dict _aux_dict @@ -42,6 +41,18 @@ has [qw/_arg_dict =head1 NAME AI::MXNet::Executor - The actual executing object of MXNet. +=cut + +=head1 SYNOPSIS + + my $executor = $sym->bind( + ctx => mx->Context('cpu'), + args => [$lhs_arr, $rhs_arr], + args_grad => [$lhs_grad, $rhs_grad] + ); + $executor->forward(1); + print $executor->outputs->[0]->aspdl; +=cut =head2 new @@ -138,7 +149,7 @@ method _get_outputs() Parameters ---------- - $is_train=0: bool, optional + $is_train=0: Bool, optional whether this forward is for evaluation purpose. If True, a backward call is expected to follow. Otherwise following backward is invalid. @@ -200,12 +211,12 @@ method forward(Int $is_train=0, %kwargs) Parameters ---------- - out_grads : NDArray or an array ref of NDArrays or hash ref of NDArrays, optional. + $out_grads : NDArray or an array ref of NDArrays or hash ref of NDArrays, optional. The gradient on the outputs to be propagated back. This parameter is only needed when bind is called on outputs that are not a loss function. - is_train : bool, default 1 + $is_train : Bool, default 1 Whether this backward is for training or inference. Note that in rare cases you want to call backward with is_train=0 to get gradient during inference. @@ -241,17 +252,16 @@ method backward( Parameters ---------- - callback : subref + $callback : CodeRef Takes a string and an NDArrayHandle.
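+
+ For example, a minimal illustrative sketch ($executor here is a hypothetical, already bound executor); per the parameters above, the callback receives the node name string and a raw NDArrayHandle:
+
+     $executor->set_monitor_callback(sub {
+         my ($name, $handle) = @_;
+         print "monitor: $name\n";
+     });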
=cut method set_monitor_callback(CodeRef $callback) { - $self->_monitor_callback($callback); check_call( AI::MXNetCAPI::ExecutorSetMonitorCallback( $self->handle, - $self->_monitor_callback + $callback ) ); } @@ -262,7 +272,7 @@ method set_monitor_callback(CodeRef $callback) Returns ------- - arg_dict : HashRef[AI::MXNet::NDArray] + $arg_dict : HashRef[AI::MXNet::NDArray] The map that maps a name of the arguments to the NDArrays. =cut @@ -285,7 +295,7 @@ method arg_dict() Returns ------- - grad_dict : HashRef[AI::MXNet::NDArray] + $grad_dict : HashRef[AI::MXNet::NDArray] The map that maps a name of the arguments to the gradient NDArrays. =cut @@ -308,7 +318,7 @@ method grad_dict() Returns ------- - aux_dict : HashRef[AI::MXNet::NDArray] + $aux_dict : HashRef[AI::MXNet::NDArray] The map that maps a name of the auxiliary states to the NDArrays. =cut @@ -331,7 +341,7 @@ method aux_dict() Returns ------- - output_dict : HashRef[AI::MXNet::NDArray] + $output_dict : HashRef[AI::MXNet::NDArray] The map that maps a name of the outputs to the NDArrays. =cut @@ -354,13 +364,13 @@ method output_dict() Parameters ---------- - arg_params : HashRef[AI::MXNet::NDArray] + $arg_params : HashRef[AI::MXNet::NDArray] Parameters, hash ref of name to NDArray of arguments - aux_params : Maybe[HashRef[AI::MXNet::NDArray]], optional + $aux_params= : Maybe[HashRef[AI::MXNet::NDArray]], optional Parameters, hash ref of name to NDArray of auxiliary states. - allow_extra_params : boolean, optional + $allow_extra_params= : Bool, optional Whether to allow extra parameters that are not needed by symbol If this is True, no error will be thrown when arg_params or aux_params contain extra parameters that is not needed by the executor. @@ -415,9 +425,9 @@ method copy_params_from( ---------- $kwargs : HashRef[Shape] new shape for arguments. - :$partial_shaping : bool + :$partial_shaping : Bool Whether to allow changing the shape of unspecified arguments. - :$allow_up_sizing : bool + :$allow_up_sizing : Bool Whether to allow allocating new ndarrays that's larger than the original. Returns @@ -501,7 +511,7 @@ method reshape(HashRef[Shape] $kwargs, Int :$partial_shaping=0, Int :$allow_up_s Returns ------- - debug_str : string + $debug_str : Str Debug string of the executor. =cut diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Gluon.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Gluon.pm index 7f92378c0823..92c8386c0d14 100644 --- a/perl-package/AI-MXNet/lib/AI/MXNet/Gluon.pm +++ b/perl-package/AI-MXNet/lib/AI/MXNet/Gluon.pm @@ -57,4 +57,106 @@ sub data { 'AI::MXNet::Gluon::Data' } sub utils { 'AI::MXNet::Gluon::Utils' } sub model_zoo { require AI::MXNet::Gluon::ModelZoo; 'AI::MXNet::Gluon::ModelZoo' } +=head1 NAME + + AI::MXNet::Gluon - High-level interface for MXNet. +=cut + +=head1 DESCRIPTION + + The AI::MXNet::Gluon package is a high-level interface for MXNet designed to be easy to use, + while keeping most of the flexibility of a low level API. + AI::MXNet::Gluon supports both imperative and symbolic programming, + making it easy to train complex models imperatively in Perl. + + Based on the Gluon API specification, + the Gluon API in Apache MXNet provides a clear, concise, and simple API for deep learning. + It makes it easy to prototype, build, and train deep learning models without sacrificing training speed. + + Advantages. + + Simple, Easy-to-Understand Code: Gluon offers a full set of plug-and-play neural network building blocks, + including predefined layers, optimizers, and initializers.
+ + Flexible, Imperative Structure: Gluon does not require the neural network model to be rigidly defined, + but rather brings the training algorithm and model closer together to provide flexibility in the development process. + + Dynamic Graphs: Gluon enables developers to define neural network models that are dynamic, + meaning they can be built on the fly, with any structure, and using any of Perl’s native control flow. + + High Performance: Gluon provides all of the above benefits without impacting the training speed that the underlying engine provides. + + + Simple, Easy-to-Understand Code + Use plug-and-play neural network building blocks, including predefined layers, optimizers, and initializers: + + use AI::MXNet qw(mx); + use AI::MXNet::Gluon qw(gluon); + + my $net = gluon->nn->Sequential; + # When instantiated, Sequential stores a chain of neural network layers. + # Once presented with data, Sequential executes each layer in turn, using + # the output of one layer as the input for the next + $net->name_scope(sub { + $net->add(gluon->nn->Dense(256, activation=>"relu")); # 1st layer (256 nodes) + $net->add(gluon->nn->Dense(256, activation=>"relu")); # 2nd hidden layer + $net->add(gluon->nn->Dense($num_outputs)); + }); + + Flexible, Imperative Structure. + + Prototype, build, and train neural networks in a fully imperative manner using the AI::MXNet package and the Gluon trainer method: + + use AI::MXNet::Base; # provides helpers, such as zip, enumerate, etc. + use AI::MXNet::AutoGrad qw(autograd); + my $epochs = 10; + + for(1..$epochs) + { + for(zip($train_data)) + { + my ($data, $label) = @$_; + autograd->record(sub { + my $output = $net->($data); # the forward iteration + my $loss = gluon->loss->softmax_cross_entropy($output, $label); + $loss->backward; + }); + $trainer->step($data->shape->[0]); ## batch size + } + } + + Dynamic Graphs. + + Build neural networks on the fly for use cases where neural networks must change in size and shape during model training: + + use AI::MXNet::Function::Parameters; + + method forward(GluonClass $F, GluonInput $inputs, GluonInput :$tree) + { + my $children_outputs = [ + map { $self->forward($F, $inputs, $_) } @{ $tree->children } + ]; + # Recursively builds the neural network based on each input sentence’s + # syntactic structure during the model definition and training process + ... + } + + High Performance + + Easily cache the neural network to achieve high performance by defining your neural network with HybridSequential + and calling the hybridize method: + + use AI::MXNet::Gluon::NN qw(nn); + + my $net = nn->HybridSequential; + $net->name_scope(sub { + $net->add(nn->Dense(256, activation=>"relu")); + $net->add(nn->Dense(128, activation=>"relu")); + $net->add(nn->Dense(2)); + }); + + $net->hybridize(); + See more at L<http://mxnet.io/> +=cut + 1; diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/Block.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/Block.pm index be819ac9d4e5..1b35e7864c12 100644 --- a/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/Block.pm +++ b/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/Block.pm @@ -855,20 +855,20 @@ package AI::MXNet::Gluon::HybridBlock; =head2 DESCRIPTION - `HybridBlock` supports forwarding with both Symbol and NDArray. + HybridBlock supports forwarding with both Symbol and NDArray. - Forward computation in `HybridBlock` must be static to work with `Symbol`s, - i.e. you cannot call `.asnumpy()`, `.shape`, `.dtype`, etc on tensors. + Forward computation in HybridBlock must be static to work with Symbols, + i.e.
you cannot call aspdl, shape, dtype, etc on tensors. Also, you cannot use branching or loop logic that bases on non-constant expressions like random numbers or intermediate results, since they change the graph structure for each iteration. - Before activating with `hybridize()`, `HybridBlock` works just like normal - `Block`. After activation, `HybridBlock` will create a symbolic graph + Before activating with hybridize(), HybridBlock works just like normal + Block. After activation, HybridBlock will create a symbolic graph representing the forward computation and cache it. On subsequent forwards, - the cached graph will be used instead of `hybrid_forward`. + the cached graph will be used instead of hybrid_forward. - Refer `Hybrid tutorial `_ to see + Refer to the Hybrid tutorial at L<http://mxnet.io/tutorials/gluon/hybrid.html> to see the end-to-end usage. =cut @@ -1141,7 +1141,7 @@ method _call_cached_op(@args) =head2 forward Defines the forward computation. Arguments can be either - `NDArray` or `Symbol`. + NDArray or Symbol. =cut method forward($x, @args) @@ -1225,12 +1225,12 @@ method hybrid_forward($F, $x, @args) or the C++ interface. When there are only one input, it will have name 'data'. When there - Are more than one inputs, they will be named as `data0`, `data1`, etc. + are more than one input, they will be named as 'data0', 'data1', etc. Parameters ---------- $path : str - Path to save model. Two files `path-symbol.json` and `path-xxxx.params` + Path to save model. Two files 'path-symbol.json' and 'path-xxxx.params' will be created, where xxxx is the 4 digits epoch number. :$epoch=0 : Int Epoch number of saved model. @@ -1298,20 +1298,20 @@ extends 'AI::MXNet::Gluon::HybridBlock'; Examples -------- - >>> # To extract the feature from fc1 and fc2 layers of AlexNet: - >>> alexnet = gluon.model_zoo.vision.alexnet(pretrained=True, ctx=mx.cpu(), - prefix='model_') - >>> inputs = mx.sym.var('data') - >>> out = alexnet(inputs) - >>> internals = out.get_internals() - >>> print(internals.list_outputs()) + >>> # To extract the feature from fc1 and fc2 layers of AlexNet + >>> $alexnet = gluon->model_zoo->vision->alexnet(pretrained=>1, ctx=>mx->cpu(), + prefix=>'model_'); + >>> $inputs = mx->sym->var('data'); + >>> $out = $alexnet->($inputs); + >>> $internals = $out->get_internals(); + >>> print($internals->list_outputs()); ['data', ..., 'model_dense0_relu_fwd_output', ..., 'model_dense1_relu_fwd_output', ...]
- >>> outputs = [internals['model_dense0_relu_fwd_output'], - internals['model_dense1_relu_fwd_output']] + >>> $outputs = [$internals->slice('model_dense0_relu_fwd_output'), + $internals->slice('model_dense1_relu_fwd_output')]; >>> # Create SymbolBlock that shares parameters with alexnet - >>> feat_model = gluon.SymbolBlock(outputs, inputs, params=alexnet.collect_params()) - >>> x = mx.nd.random_normal(shape=(16, 3, 224, 224)) - >>> print(feat_model(x)) + >>> $feat_model = gluon->SymbolBlock($outputs, $inputs, params=>$alexnet->collect_params()); + >>> $x = mx->nd->random_normal(shape=>[16, 3, 224, 224]); + >>> print($feat_model->($x)); =cut has [qw/outputs inputs/] => (is => 'rw', isa => 'AI::MXNet::Symbol|ArrayRef[AI::MXNet::Symbol]'); diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/Parameter.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/Parameter.pm index c39d5d461c2c..475c2a93647e 100644 --- a/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/Parameter.pm +++ b/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/Parameter.pm @@ -934,7 +934,6 @@ use overload my $content = join("\n", map { AI::MXNet::Base::_indent(" $_", 2) } $self->values); return "$name(\n$content\n)"; }, - '%{}' => sub { my %tmp = shift->_params->as_list; \%tmp }, '@{}' => sub { my @tmp = shift->_params->as_list; \@tmp }, fallback => 1; @@ -1316,7 +1315,7 @@ method load( ); next; } - $self->{ $name }->_load_init($arg_dict{$name}, $ctx); + $self->_params->get($name)->_load_init($arg_dict{$name}, $ctx); } } diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/IO.pm b/perl-package/AI-MXNet/lib/AI/MXNet/IO.pm index fc3f960cc496..297ceb8c0b24 100644 --- a/perl-package/AI-MXNet/lib/AI/MXNet/IO.pm +++ b/perl-package/AI-MXNet/lib/AI/MXNet/IO.pm @@ -24,7 +24,15 @@ use Scalar::Util qw/blessed/; =head1 NAME - AI::MXNet::IO - NDArray interface of mxnet. + AI::MXNet::IO - Data loading interface of MXNet +=cut + +=head1 DESCRIPTION + + This document summarizes supported data formats and iterator APIs to read the data, including: + mx->io Data iterators for common data formats. + mx->recordio Data iterators for the RecordIO data format. + mx->image Image Iterators and image augmentation functions. =cut # Convert data into canonical form. @@ -626,6 +634,21 @@ extends 'AI::MXNet::DataIter'; AI::MXNet::MXDataIter - A data iterator pre-built in C++ layer of MXNet. =cut +=head1 DESCRIPTION + + Here is the list of currently available predefined iterators; for more custom iterators + please check out the examples directory. + Also please refer to the online docs at L<http://mxnet.io/>. + mx->io->CSVIter Returns the CSV file iterator. + mx->io->LibSVMIter Returns the LibSVM iterator which returns data with csr storage type. + mx->io->ImageRecordIter Iterates over image RecordIO files. + mx->io->ImageRecordUInt8Iter Iterates over image RecordIO files (uint8 data). + mx->io->MNISTIter Iterates over the MNIST dataset. + mx->recordio->MXRecordIO Reads/writes RecordIO data format, supporting sequential read and write. + mx->recordio->MXIndexedRecordIO Reads/writes RecordIO data format, supporting random access. + mx->image->ImageIter Image data iterator with a large number of augmentation choices.
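+
+ For example, a minimal sketch of constructing one of these iterators (the file name and shapes are hypothetical):
+
+     my $data_iter = mx->io->CSVIter(
+         data_csv   => 'data.csv', # hypothetical file with 3 values per row
+         data_shape => [3],
+         batch_size => 30
+     );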
+=cut
+
 has 'handle' => (is => 'ro', isa => 'DataIterHandle', required => 1);
 has '_debug_skip_load' => (is => 'rw', isa => 'Int', default => 0);
 has '_debug_at_begin' => (is => 'rw', isa => 'Int', default => 0);
diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Image.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Image.pm
index 4f670b0e8e36..9c7fa120f343 100644
--- a/perl-package/AI-MXNet/lib/AI/MXNet/Image.pm
+++ b/perl-package/AI-MXNet/lib/AI/MXNet/Image.pm
@@ -622,18 +622,18 @@ method CastAug()
 =cut

 method CreateAugmenter(
-Shape          :$data_shape,
-Bool           :$resize=0,
-Bool           :$rand_crop=0,
-Bool           :$rand_resize=0,
-Bool           :$rand_mirror=0,
-Maybe[Num|PDL] :$mean=,
-Maybe[Num|PDL] :$std=,
-Num            :$brightness=0,
-Num            :$contrast=0,
-Num            :$saturation=0,
-Num            :$pca_noise=0,
-Int            :$inter_method=2
+    Shape          :$data_shape,
+    Bool           :$resize=0,
+    Bool           :$rand_crop=0,
+    Bool           :$rand_resize=0,
+    Bool           :$rand_mirror=0,
+    Maybe[Num|PDL] :$mean=,
+    Maybe[Num|PDL] :$std=,
+    Num            :$brightness=0,
+    Num            :$contrast=0,
+    Num            :$saturation=0,
+    Num            :$pca_noise=0,
+    Int            :$inter_method=2
 )
 {
     my @auglist;
diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Initializer.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Initializer.pm
index 7c481efcc53f..fe8dce32e2d8 100644
--- a/perl-package/AI-MXNet/lib/AI/MXNet/Initializer.pm
+++ b/perl-package/AI-MXNet/lib/AI/MXNet/Initializer.pm
@@ -73,6 +73,24 @@ has '_print_func' => (is => 'rw', isa => 'CodeRef', lazy => 1,

     AI::MXNet::Initializer - Base class for all Initializers

+=head1 DESCRIPTION
+
+    The base class AI::MXNet::Initializer defines the default behaviors to initialize various parameters,
+    such as setting a bias to 1, except for the weight. Other classes then define how to initialize the weights.
+    Currently the following classes are available:
+    mx->init->Uniform    Initializes weights with random values uniformly sampled from a given range.
+    mx->init->Normal     Initializes weights with random values sampled from a normal distribution with a mean of zero and standard deviation of sigma.
+    mx->init->Load       Initializes variables by loading data from a file or dict.
+    mx->init->Mixed      Initializes parameters using multiple initializers.
+    mx->init->Zero       Initializes weights to zero.
+    mx->init->One        Initializes weights to one.
+    mx->init->Constant   Initializes the weights to a given value.
+    mx->init->Orthogonal Initializes the weight as an orthogonal matrix.
+    mx->init->Xavier     Returns an initializer performing “Xavier” initialization for weights.
+    mx->init->MSRAPrelu  Initializes the weight according to an MSRA paper.
+    mx->init->Bilinear   Initializes weights for upsampling layers.
+    mx->init->FusedRNN   Initializes parameters for fused RNN layers.
+
 =head2 register

     Register an initializer class to the AI::MXNet::Initializer factory.
@@ -372,7 +390,7 @@ method call(Str $name, AI::MXNet::NDArray $arr)

 =head1 NAME

-    AI::MXNet::Mixed - A container for multiple initializer patterns.
+    AI::MXNet::Mixed - A container with multiple initializer patterns.
 =cut

 =head2 new
diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/KVStore.pm b/perl-package/AI-MXNet/lib/AI/MXNet/KVStore.pm
index de66d91552c2..bb6631f459a9 100644
--- a/perl-package/AI-MXNet/lib/AI/MXNet/KVStore.pm
+++ b/perl-package/AI-MXNet/lib/AI/MXNet/KVStore.pm
@@ -37,7 +37,6 @@ use AI::MXNet::Function::Parameters;

 has 'handle'        => (is => 'ro', isa => 'KVStoreHandle', required => 1);
 has '_updater'      => (is => 'rw', isa => 'AI::MXNet::Updater');
-has '_updater_func' => (is => 'rw', isa => 'CodeRef');

 sub DEMOLISH
 {
@@ -53,9 +52,9 @@ sub DEMOLISH

     Parameters
     ----------
-    key : str or an array ref of str
+    $key : Str|ArrayRef[Str]
         The keys.
-    value : NDArray or an array ref of NDArray objects
+    $value : AI::MXNet::NDArray|ArrayRef[AI::MXNet::NDArray]|ArrayRef[ArrayRef[AI::MXNet::NDArray]]
         The values.

     Examples
@@ -100,9 +99,9 @@ method init(

     Parameters
     ----------
-    key : str or array ref of str
-    value : NDArray or array ref of NDArray or array ref of array refs of NDArray
-    priority : int, optional
+    $key : Str|ArrayRef[Str]
+    $value : AI::MXNet::NDArray|ArrayRef[AI::MXNet::NDArray]|ArrayRef[ArrayRef[AI::MXNet::NDArray]]
+    :$priority=0 : Int, optional
         The priority of the push operation.
         The higher the priority, the faster this action is likely
         to be executed before other push actions.
@@ -171,12 +170,12 @@ method push(

     Parameters
     ----------
-    key : str or array ref of str
+    $key : Str|ArrayRef[Str]
         Keys
-    out: NDArray or array ref of NDArray or array ref of array refs of NDArray
+    :$out: AI::MXNet::NDArray|ArrayRef[AI::MXNet::NDArray]|ArrayRef[ArrayRef[AI::MXNet::NDArray]]
         Corresponding values
-    priority : int, optional
+    :$priority=0 : Int, optional
         The priority of the pull operation.
         The higher the priority, the faster this action is likely
         to be executed before other pull actions.
@@ -241,18 +240,18 @@ method pull(

     Parameters
     ----------
-    key : str, int, or sequence of str or int
+    $key : Str|ArrayRef[Str]
         Keys.
-    out: AI::MXNet::NDArray::RowSparse or array ref of AI::MXNet::NDArray::RowSparse or array ref of array ref of AI::MXNet::NDArray::RowSparse
+    :$out: AI::MXNet::NDArray::RowSparse|ArrayRef[AI::MXNet::NDArray::RowSparse]|ArrayRef[ArrayRef[AI::MXNet::NDArray::RowSparse]]
         Values corresponding to the keys. The stype is expected to be row_sparse
-    priority : int, optional
+    :$priority=0 : Int, optional
         The priority of the pull operation. Higher priority pull operations are
         likely to be executed before other pull actions.
-    row_ids : AI::MXNet::NDArray or array ref of AI::MXNet::NDArray
+    :$row_ids : AI::MXNet::NDArray|ArrayRef[AI::MXNet::NDArray]|ArrayRef[ArrayRef[AI::MXNet::NDArray]]
         The row_ids for which to pull for each value. Each row_id is a 1D NDArray
         whose values don't have to be unique or sorted.
@@ -364,7 +363,7 @@ method row_sparse_pull(

     Parameters
     ----------
-    compression_params : HashRef
+    $compression_params : HashRef[Str]
         A dictionary specifying the type and parameters for gradient compression.
         The key `type` in this dictionary is a required string argument and specifies the type of
         gradient compression.
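    For orientation, the init/push/pull flow these signatures describe looks roughly like the
    following sketch; the key name and shapes are invented for the example:

        use AI::MXNet qw(mx);

        my $kv = mx->kv->create('local');            # single-process store
        $kv->init(3, mx->nd->ones([2, 3]));          # initialize key 3 with ones

        # Push a new value for the key, then pull the result back out.
        $kv->push(3, mx->nd->ones([2, 3]) * 8);
        my $out = mx->nd->zeros([2, 3]);
        $kv->pull(3, out => $out);
        print $out->aspdl;                           # every entry is 8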
@@ -401,7 +400,7 @@ method set_gradient_compression(HashRef[Str] $compression_params)

     Parameters
     ----------
-    optimizer : Optimizer
+    $optimizer : AI::MXNet::Optimizer
         the optimizer

 =cut
@@ -426,7 +425,7 @@ method set_optimizer(AI::MXNet::Optimizer $optimizer)

     Returns
     -------
-    type : str
+    $type : Str
         the string type

 =cut
@@ -441,7 +440,7 @@ method type()

     Returns
     -------
-    rank : int
+    $rank : Int
         The rank of this node, which is in [0, get_num_workers())

 =cut
@@ -456,7 +455,7 @@ method rank()

     Returns
     -------
-    size :int
+    $size : Int
         The number of worker nodes

 =cut
@@ -471,9 +470,9 @@ method num_workers()

     Parameters
     ----------
-    fname : str
+    $fname : Str
         Path to output states file.
-    dump_optimizer : bool, default False
+    :$dump_optimizer=0 : Bool, default False
         Whether to also save the optimizer itself. This would also save optimizer
         information such as learning rate and weight decay schedules.
 =cut
@@ -493,7 +492,7 @@ method save_optimizer_states(Str $fname, Bool :$dump_optimizer=0)

     Parameters
     ----------
-    fname : str
+    $fname : Str
         Path to input states file.

 =cut
@@ -517,7 +516,7 @@ method load_optimizer_states(Str $fname)

     Parameters
     ----------
-    updater : function
+    $updater : Updater
         the updater function

     Examples
@@ -540,20 +539,17 @@ method load_optimizer_states(Str $fname)

 method _set_updater(Updater $updater_func)
 {
-    $self->_updater_func(
-        sub {
-            my ($index, $input_handle, $storage_handle) = @_;
-            $updater_func->(
-                $index,
-                AI::MXNet::NDArray->_ndarray_cls($input_handle),
-                AI::MXNet::NDArray->_ndarray_cls($storage_handle)
-            );
-        }
-    );
     check_call(
         AI::MXNetCAPI::KVStoreSetUpdater(
             $self->handle,
-            $self->_updater_func
+            sub {
+                my ($index, $input_handle, $storage_handle) = @_;
+                $updater_func->(
+                    $index,
+                    AI::MXNet::NDArray->_ndarray_cls($input_handle),
+                    AI::MXNet::NDArray->_ndarray_cls($storage_handle)
+                );
+            }
         )
     );
 }
@@ -583,9 +579,9 @@ method _barrier()

     Parameters
     ----------
-    head : int
+    $head : Int
         the head of the command
-    body : str
+    $body : Str
         the body of the command

 =cut
@@ -606,7 +602,7 @@ method _send_command_to_servers(Int $head, Str $body)

     Parameters
     ----------
-    name : {'local'}
+    $name='local' : Str
         The type of KVStore
         - local works for multiple devices on a single machine (single process)
         - dist works for multi-machines (multiple processes)
diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/KVStoreServer.pm b/perl-package/AI-MXNet/lib/AI/MXNet/KVStoreServer.pm
index 4c274b92c71f..39e152a6d641 100644
--- a/perl-package/AI-MXNet/lib/AI/MXNet/KVStoreServer.pm
+++ b/perl-package/AI-MXNet/lib/AI/MXNet/KVStoreServer.pm
@@ -27,7 +27,7 @@ use AI::MXNet::Function::Parameters;

 =head1 NAME

-    AI::MXNet::KVStoreServer - The key-value store server
+    AI::MXNet::KVStoreServer - The key-value store server.
 =cut

 =head2 new
@@ -36,7 +36,7 @@ use AI::MXNet::Function::Parameters;

     Parameters
     ----------
-    kvstore : KVStore
+    kvstore : AI::MXNet::KVStore
 =cut

 has 'kvstore' => (is => 'ro', isa => 'AI::MXNet::KVStore', required => 1);
diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/LRScheduler.pm b/perl-package/AI-MXNet/lib/AI/MXNet/LRScheduler.pm
index 27420f45167d..5575e37f75fe 100644
--- a/perl-package/AI-MXNet/lib/AI/MXNet/LRScheduler.pm
+++ b/perl-package/AI-MXNet/lib/AI/MXNet/LRScheduler.pm
@@ -58,7 +58,7 @@ has 'base_lr' => (is => 'rw', isa => 'Num', default => 0.01);

     Parameters
     ----------
-    num_update: int
+    $num_update: Int
         the maximal number of updates applied to a weight.
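    To make the scheduler contract concrete, here is a small sketch using the FactorScheduler
    documented just below; the step, factor, and base learning rate are arbitrary:

        use AI::MXNet qw(mx);

        # Halve the learning rate every 250 updates.
        my $scheduler = AI::MXNet::FactorScheduler->new(step => 250, factor => 0.5);
        $scheduler->base_lr(0.1);

        # The scheduler maps an update count to a learning rate:
        # two 250-update boundaries have passed, so 0.1 * 0.5**2 = 0.025.
        printf "lr after 600 updates: %s\n", $scheduler->call(600);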
 =cut

@@ -76,9 +76,9 @@ package AI::MXNet::FactorScheduler;

     Parameters
     ----------
-    step: int
+    step: Int
         schedule the learning rate update after n updates
-    factor: float
+    factor: Num
         the factor by which to reduce the learning rate.
 =cut
 use Mouse;
@@ -138,9 +138,9 @@ package AI::MXNet::MultiFactorScheduler;

     Parameters
     ----------
-    step: array ref of int
+    step: ArrayRef[Int]
         schedule learning rate after n updates
-    factor: float
+    factor: Num
         the factor for reducing the learning rate
 =cut
diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/LinAlg.pm b/perl-package/AI-MXNet/lib/AI/MXNet/LinAlg.pm
index 9290e68d4561..be1262fb6a87 100644
--- a/perl-package/AI-MXNet/lib/AI/MXNet/LinAlg.pm
+++ b/perl-package/AI-MXNet/lib/AI/MXNet/LinAlg.pm
@@ -21,6 +21,54 @@ use warnings;
 use AI::MXNet::LinAlg::Symbol;
 use AI::MXNet::LinAlg::NDArray;

+=head1 NAME
+
+    AI::MXNet::LinAlg - Linear Algebra routines for NDArray and Symbol.
+=cut
+
+=head1 DESCRIPTION
+
+    The Linear Algebra API provides imperative and symbolic linear algebra tensor operations on CPU/GPU.
+
+    mx->linalg-><nd|sym>->gemm        Performs general matrix multiplication and accumulation.
+    mx->linalg-><nd|sym>->gemm2       Performs general matrix multiplication.
+    mx->linalg-><nd|sym>->potrf       Performs Cholesky factorization of a symmetric positive-definite matrix.
+    mx->linalg-><nd|sym>->potri       Performs matrix inversion from a Cholesky factorization.
+    mx->linalg-><nd|sym>->trmm        Performs multiplication with a lower triangular matrix.
+    mx->linalg-><nd|sym>->trsm        Solves a matrix equation involving a lower triangular matrix.
+    mx->linalg-><nd|sym>->sumlogdiag  Computes the sum of the logarithms of the diagonal elements of a square matrix.
+    mx->linalg-><nd|sym>->syrk        Multiplication of a matrix with its transpose.
+    mx->linalg-><nd|sym>->gelqf       LQ factorization for a general matrix.
+    mx->linalg-><nd|sym>->syevd       Eigendecomposition for a symmetric matrix.
+
+    Examples:
+
+    ## NDArray
+    my $A = mx->nd->array([[1.0, 1.0], [1.0, 1.0]]);
+    my $B = mx->nd->array([[1.0, 1.0], [1.0, 1.0], [1.0, 1.0]]);
+    ok(almost_equal(
+        mx->nd->linalg->gemm2($A, $B, transpose_b=>1, alpha=>2.0)->aspdl,
+        pdl([[4.0, 4.0, 4.0], [4.0, 4.0, 4.0]])
+    ));
+
+    ## Symbol
+    my $sym_gemm2 = mx->sym->linalg->gemm2(
+        mx->sym->var('A'),
+        mx->sym->var('B'),
+        transpose_b => 1,
+        alpha => 2.0
+    );
+    my $A = mx->nd->array([[1.0, 1.0], [1.0, 1.0]]);
+    my $B = mx->nd->array([[1.0, 1.0], [1.0, 1.0], [1.0, 1.0]]);
+    ok(almost_equal(
+        $sym_gemm2->eval(args => { A => $A, B => $B })->[0]->aspdl,
+        pdl([[4.0, 4.0, 4.0], [4.0, 4.0, 4.0]])
+    ));
+
+=cut
+
 sub sym    { 'AI::MXNet::LinAlg::Symbol' }
 sub symbol { 'AI::MXNet::LinAlg::Symbol' }
 sub nd     { 'AI::MXNet::LinAlg::NDArray' }
diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Metric.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Metric.pm
index 3b9345d8baf9..b6e91aeaf729 100644
--- a/perl-package/AI-MXNet/lib/AI/MXNet/Metric.pm
+++ b/perl-package/AI-MXNet/lib/AI/MXNet/Metric.pm
@@ -24,7 +24,11 @@ use JSON::PP;

 =head1 NAME

-    AI::MXNet::Metric - Online evaluation metric module.
+    AI::MXNet::Metric - Evaluation Metric API.
+=head1 DESCRIPTION
+
+    This module hosts all the evaluation metrics available to evaluate the performance of a learned model.
 =cut

 # Check to see if the two arrays are the same size.
@@ -61,11 +65,6 @@ func check_label_shapes(
     ) unless $pred_shape == $label_shape;
 }

-=head1 DESCRIPTION
-
-    Base class of all evaluation metrics.
-=cut
-
 package AI::MXNet::EvalMetric;
 use Mouse;
 use overload '""' => sub {
@@ -232,11 +231,41 @@ method get()
 # CLASSIFICATION METRICS
 ########################

+=head1 NAME
+
+    AI::MXNet::Accuracy - Computes accuracy classification score.
+=cut
+
+=head1 DESCRIPTION
+
+    The accuracy score is defined as
+
+    accuracy(y, y^) = (1/n) * sum(i=0..n-1) { y^(i)==y(i) }
+
+    Parameters:
+    axis (Int, default=1) – The axis that represents classes.
+    name (Str, default='accuracy') – Name of this metric instance for display.
+
+    pdl> use AI::MXNet qw(mx)
+    pdl> $predicts = [mx->nd->array([[0.3, 0.7], [0, 1.], [0.4, 0.6]])]
+    pdl> $labels = [mx->nd->array([[0, 1, 1]])]
+    pdl> $acc = mx->metric->Accuracy()
+    pdl> $acc->update($labels, $predicts)
+    pdl> use Data::Dumper
+    pdl> print Dumper([$acc->get])
+    $VAR1 = [
+          'accuracy',
+          '0.666666666666667'
+        ];
+
+=cut
+
 package AI::MXNet::Accuracy;
 use Mouse;
 use AI::MXNet::Base;
 extends 'AI::MXNet::EvalMetric';
 has '+name'   => (default => 'accuracy');
+has 'axis'    => (is => 'ro', isa => 'Int', default => 1);

 method update(ArrayRef[AI::MXNet::NDArray] $labels, ArrayRef[AI::MXNet::NDArray] $preds)
 {
@@ -245,22 +274,74 @@ method update(ArrayRef[AI::MXNet::NDArray] $labels, ArrayRef[AI::MXNet::NDArray]
         my ($label, $pred_label) = @$_;
         if(join(',', @{$pred_label->shape}) ne join(',', @{$label->shape}))
         {
-            $pred_label = AI::MXNet::NDArray->argmax_channel($pred_label);
+            $pred_label = AI::MXNet::NDArray->argmax_channel($pred_label, { axis => $self->axis });
         }
-        AI::MXNet::Metric::check_label_shapes($label, $pred_label);
         my $sum = ($pred_label->aspdl->flat == $label->aspdl->flat)->sum;
         $self->sum_metric($self->sum_metric + $sum);
         $self->num_inst($self->num_inst + $pred_label->size);
     }
 }

+=head1 NAME
+
+    AI::MXNet::TopKAccuracy - Computes top k predictions accuracy.
+=cut
+
+=head1 DESCRIPTION
+
+    TopKAccuracy differs from Accuracy in that it considers the prediction
+    to be True as long as the ground truth label is in the top K predicted labels.
+
+    If top_k = 1, then TopKAccuracy is identical to Accuracy.
+
+    Parameters:
+    top_k (Int, default 1) – A prediction is treated as correct if the target is among the top k predictions.
+    name (Str, default 'top_k_accuracy') – Name of this metric instance for display.
+ + use AI::MXNet qw(mx); + $top_k = 3; + $predicts = [mx->nd->array( + [[0.80342804, 0.5275223 , 0.11911147, 0.63968144, 0.09092526, + 0.33222568, 0.42738095, 0.55438581, 0.62812652, 0.69739294], + [0.78994969, 0.13189035, 0.34277045, 0.20155961, 0.70732423, + 0.03339926, 0.90925004, 0.40516066, 0.76043547, 0.47375838], + [0.28671892, 0.75129249, 0.09708994, 0.41235779, 0.28163896, + 0.39027778, 0.87110921, 0.08124512, 0.55793117, 0.54753428], + [0.33220307, 0.97326881, 0.2862761 , 0.5082575 , 0.14795074, + 0.19643398, 0.84082001, 0.0037532 , 0.78262101, 0.83347772], + [0.93790734, 0.97260166, 0.83282304, 0.06581761, 0.40379256, + 0.37479349, 0.50750135, 0.97787696, 0.81899021, 0.18754124], + [0.69804812, 0.68261077, 0.99909815, 0.48263116, 0.73059268, + 0.79518236, 0.26139168, 0.16107376, 0.69850315, 0.89950917], + [0.91515562, 0.31244902, 0.95412616, 0.7242641 , 0.02091039, + 0.72554552, 0.58165923, 0.9545687 , 0.74233195, 0.19750339], + [0.94900651, 0.85836332, 0.44904621, 0.82365038, 0.99726878, + 0.56413064, 0.5890016 , 0.42402702, 0.89548786, 0.44437266], + [0.57723744, 0.66019353, 0.30244304, 0.02295771, 0.83766937, + 0.31953292, 0.37552193, 0.18172362, 0.83135182, 0.18487429], + [0.96968683, 0.69644561, 0.60566253, 0.49600661, 0.70888438, + 0.26044186, 0.65267488, 0.62297362, 0.83609334, 0.3572364 ]] + )]; + $labels = [mx->nd->array([2, 6, 9, 2, 3, 4, 7, 8, 9, 6])]; + $acc = mx->metric->TopKAccuracy(top_k=>$top_k); + $acc->update($labels, $predicts); + use Data::Dumper; + print Dumper([$acc->get]); + $VAR1 = [ + 'top_k_accuracy_3', + '0.3' + ]; + + +=cut + package AI::MXNet::TopKAccuracy; use Mouse; use List::Util qw/min/; use AI::MXNet::Base; extends 'AI::MXNet::EvalMetric'; has '+name' => (default => 'top_k_accuracy'); -has 'top_k' => (is => 'rw', isa => 'int', default => 1); +has 'top_k' => (is => 'rw', isa => 'Int', default => 1); method python_constructor_arguments() { ['top_k'] } sub BUILD @@ -302,71 +383,250 @@ method update(ArrayRef[AI::MXNet::NDArray] $labels, ArrayRef[AI::MXNet::NDArray] } } -# Calculate the F1 score of a binary classification problem. -package AI::MXNet::F1; -use Mouse; -use AI::MXNet::Base; -extends 'AI::MXNet::EvalMetric'; -has '+name' => (default => 'f1'); - -method update(ArrayRef[AI::MXNet::NDArray] $labels, ArrayRef[AI::MXNet::NDArray] $preds) -{ - AI::MXNet::Metric::check_label_shapes($labels, $preds); - for(zip($labels, $preds)) { - my ($label, $pred_label) = @$_; - AI::MXNet::Metric::check_label_shapes($label, $pred_label); - $pred_label = $pred_label->aspdl->maximum_ind; +package _BinaryClassificationMetrics { + use Mouse; + #Private container class for classification metric statistics. True/false positive and + # true/false negative counts are sufficient statistics for various classification metrics. + #This class provides the machinery to track those statistics across mini-batches of + #(label, prediction) pairs. 
+    has [qw/true_positives
+            false_negatives
+            false_positives
+            true_negatives/] => (is => 'rw', isa => 'Int', default => 0);
+
+    method update_binary_stats(AI::MXNet::NDArray $label, AI::MXNet::NDArray $pred)
+    {
+        $pred  = AI::MXNet::NDArray->argmax($pred, { axis => 1 })->aspdl;
         $label = $label->astype('int32')->aspdl;
-        confess("F1 currently only supports binary classification.")
-            if $label->uniq->shape->at(0) > 2;
-        my ($true_positives, $false_positives, $false_negatives) = (0,0,0);
-        for(zip($pred_label->unpdl, $label->unpdl)) {
-            my ($y_pred, $y_true) = @$_;
-            if($y_pred == 1 and $y_true == 1)
-            {
-                $true_positives += 1;
-            }
-            elsif($y_pred == 1 and $y_true == 0)
-            {
-                $false_positives += 1;
-            }
-            elsif($y_pred == 0 and $y_true == 1)
-            {
-                $false_negatives += 1;
-            }
+
+        AI::MXNet::Metric::check_label_shapes($label, $pred);
+        if($label->uniq->len > 2)
+        {
+            confess("Currently only supports binary classification.");
         }
-        my $precision;
-        my $recall;
-        if($true_positives + $false_positives > 0)
+
+        my $pred_true   = ($pred == 1);
+        my $pred_false  = 1 - $pred_true;
+        my $label_true  = ($label == 1);
+        my $label_false = 1 - $label_true;
+
+        $self->true_positives($self->true_positives + ($pred_true * $label_true)->sum);
+        $self->false_positives($self->false_positives + ($pred_true * $label_false)->sum);
+        $self->false_negatives($self->false_negatives + ($pred_false * $label_true)->sum);
+        $self->true_negatives($self->true_negatives + ($pred_false * $label_false)->sum);
+    }
+
+    method precision()
+    {
+        if($self->true_positives + $self->false_positives > 0)
         {
-            $precision = $true_positives / ($true_positives + $false_positives);
+            return $self->true_positives / ($self->true_positives + $self->false_positives);
         }
         else
         {
-            $precision = 0;
+            return 0;
         }
-        if($true_positives + $false_negatives > 0)
+    }
+
+    method recall()
+    {
+        if($self->true_positives + $self->false_negatives > 0)
         {
-            $recall = $true_positives / ($true_positives + $false_negatives);
+            return $self->true_positives / ($self->true_positives + $self->false_negatives);
         }
         else
         {
-            $recall = 0;
+            return 0;
         }
-        my $f1_score;
-        if($precision + $recall > 0)
+    }
+
+    method fscore()
+    {
+        if($self->precision + $self->recall > 0)
         {
-            $f1_score = 2 * $precision * $recall / ($precision + $recall);
+            return 2 * $self->precision * $self->recall / ($self->precision + $self->recall);
         }
         else
         {
-            $f1_score = 0;
+            return 0;
+        }
+    }
+
+    method matthewscc()
+    {
+        if(not $self->total_examples)
+        {
+            return 0;
+        }
+        my @terms = (
+            $self->true_positives + $self->false_positives,
+            $self->true_positives + $self->false_negatives,
+            $self->true_negatives + $self->false_positives,
+            $self->true_negatives + $self->false_negatives
+        );
+        my $denom = 1;
+        for my $t (grep { $_ } @terms)
+        {
+            $denom *= $t;
+        }
+        return (($self->true_positives * $self->true_negatives) - ($self->false_positives * $self->false_negatives)) / sqrt($denom);
+    }
+
+    method total_examples()
+    {
+        return $self->false_negatives + $self->false_positives +
+               $self->true_negatives + $self->true_positives;
+    }
+
+    method reset_stats()
+    {
+        $self->false_positives(0);
+        $self->false_negatives(0);
+        $self->true_positives(0);
+        $self->true_negatives(0);
+    }
+};
+
+=head1 NAME
+
+    AI::MXNet::F1 - Calculate the F1 score of a binary classification problem.
+=cut
+
+=head1 DESCRIPTION
+
+    The F1 score is equivalent to the harmonic mean of the precision and recall,
+    where the best value is 1.0 and the worst value is 0.0.
+    The formula for F1 score is:
+
+    F1 = 2 * (precision * recall) / (precision + recall)
+
+    The formulas for precision and recall are:
+
+    precision = true_positives / (true_positives + false_positives)
+    recall    = true_positives / (true_positives + false_negatives)
+
+    Note:
+
+    This F1 score only supports binary classification.
+
+    Parameters:
+    name (Str, default 'f1') – Name of this metric instance for display.
+    average (Str, default 'macro') –
+        Strategy to be used for aggregating across mini-batches.
+        “macro”: average the F1 scores for each batch. “micro”: compute a single F1 score across all batches.
+
+    $predicts = [mx->nd->array([[0.3, 0.7], [0, 1], [0.4, 0.6]])];
+    $labels = [mx->nd->array([0, 1, 1])];
+    $f1 = mx->metric->F1();
+    $f1->update($labels, $predicts);
+    print $f1->get;
+    f1 0.8
+
+=cut
+
+package AI::MXNet::F1;
+use Mouse;
+use AI::MXNet::Base;
+extends 'AI::MXNet::EvalMetric';
+has '+name'   => (default => 'f1');
+has 'average' => (is => 'ro', isa => 'Str', default => 'macro');
+has 'metrics' => (is => 'rw', init_arg => undef, default => sub { _BinaryClassificationMetrics->new });
+has 'method'  => (is => 'ro', init_arg => undef, default => 'fscore');
+method python_constructor_arguments() { [qw/name average/] }
+
+method update(ArrayRef[AI::MXNet::NDArray] $labels, ArrayRef[AI::MXNet::NDArray] $preds)
+{
+    my $method = $self->method;
+    AI::MXNet::Metric::check_label_shapes($labels, $preds);
+    for(zip($labels, $preds)) {
+        my ($label, $pred) = @$_;
+        $self->metrics->update_binary_stats($label, $pred);
+        if($self->average eq "macro")
+        {
+            $self->sum_metric($self->sum_metric + $self->metrics->$method);
+            $self->num_inst($self->num_inst + 1);
+            $self->metrics->reset_stats();
+        }
+        else
+        {
+            $self->sum_metric($self->metrics->fscore * $self->metrics->total_examples);
+            $self->num_inst($self->metrics->total_examples);
+        }
+    }
+}
+
+method reset()
+{
+    $self->sum_metric(0);
+    $self->num_inst(0);
+    $self->metrics->reset_stats();
+}
+
+=head1 NAME
+
+    AI::MXNet::MCC - Computes the Matthews Correlation Coefficient of a binary classification problem.
+=cut
+
+=head1 DESCRIPTION
+
+    While slower to compute than F1, the MCC can give insight that F1 or Accuracy cannot.
+    For instance, if the network always predicts the same result
+    then the MCC will immediately show this. The MCC is also symmetric with respect
+    to positive and negative categorization; however, there needs to be both
+    positive and negative examples in the labels or it will always return 0.
+    MCC of 0 is uncorrelated, 1 is completely correlated, and -1 is negatively correlated.
+
+    MCC = (TP * TN - FP * FN)/sqrt( (TP + FP)*( TP + FN )*( TN + FP )*( TN + FN ) )
+
+    where 0 terms in the denominator are replaced by 1.
+
+    This version of MCC only supports binary classification.
+
+    Parameters
+    ----------
+    name : Str, default 'mcc'
+        Name of this metric instance for display.
+    average : Str, default 'macro'
+        Strategy to be used for aggregating across mini-batches.
+        "macro": average the MCC for each batch.
+        "micro": compute a single MCC across all batches.
+
+    Examples
+    --------
+    In this example the network almost always predicts positive
+    >>> $false_positives = 1000
+    >>> $false_negatives = 1
+    >>> $true_positives = 10000
+    >>> $true_negatives = 1
+    >>> $predicts = [mx->nd->array(
+        [
+            ([.3, .7])x$false_positives,
+            ([.7, .3])x$true_negatives,
+            ([.7, .3])x$false_negatives,
+            ([.3, .7])x$true_positives
+        ]
+    )];
+    >>> $labels = [mx->nd->array(
+        [
+            (0)x($false_positives + $true_negatives),
+            (1)x($false_negatives + $true_positives)
+        ]
+    )];
+    >>> $f1 = mx->metric->F1();
+    >>> $f1->update($labels, $predicts);
+    >>> $mcc = mx->metric->MCC()
+    >>> $mcc->update($labels, $predicts)
+    >>> print $f1->get();
+    f1 0.95233560306652054
+    >>> print $mcc->get();
+    mcc 0.01917751877733392
+
+=cut
+
+package AI::MXNet::MCC;
+use Mouse;
+extends 'AI::MXNet::F1';
+has '+name'   => (default => 'mcc');
+has '+method' => (default => 'matthewscc');
+
 package AI::MXNet::Perplexity;
 use Mouse;
 use AI::MXNet::Base;
@@ -385,12 +645,13 @@ around BUILDARGS => sub {

 =head1 NAME

-    AI::MXNet::Perplexity
+    AI::MXNet::Perplexity - Calculate perplexity.
 =cut

 =head1 DESCRIPTION

-    Calculate perplexity.
+    Perplexity is a measurement of how well a probability distribution or model predicts a sample.
+    A low perplexity indicates the model is good at predicting the sample.

     Parameters
     ----------
@@ -402,6 +663,14 @@ around BUILDARGS => sub {
         The axis from prediction that was used to
         compute softmax. By default uses the last
         axis.
+
+    $predicts = [mx->nd->array([[0.3, 0.7], [0, 1.], [0.4, 0.6]])];
+    $labels = [mx->nd->array([0, 1, 1])];
+    $perp = mx->metric->Perplexity(ignore_label=>undef);
+    $perp->update($labels, $predicts);
+    print $perp->get()
+    Perplexity 1.77109762851559
+
 =cut

 method update(ArrayRef[AI::MXNet::NDArray] $labels, ArrayRef[AI::MXNet::NDArray] $preds)
@@ -440,7 +709,21 @@ method get()
 # REGRESSION METRICS
 ####################

-# Calculate Mean Absolute Error loss
+=head1 NAME
+
+    AI::MXNet::MAE - Calculate Mean Absolute Error loss
+=head1 DESCRIPTION
+
+    >>> $predicts = [mx->nd->array([3, -0.5, 2, 7])->reshape([4,1])]
+    >>> $labels = [mx->nd->array([2.5, 0.0, 2, 8])->reshape([4,1])]
+    >>> $mean_absolute_error = mx->metric->MAE()
+    >>> $mean_absolute_error->update($labels, $predicts)
+    >>> print $mean_absolute_error->get()
+    ('mae', 0.5)
+
+=cut
+
 package AI::MXNet::MAE;
 use Mouse;
 use AI::MXNet::Base;
@@ -463,7 +746,20 @@ method update(ArrayRef[AI::MXNet::NDArray] $labels, ArrayRef[AI::MXNet::NDArray]
     }
 }

-# Calculate Mean Squared Error loss
+=head1 NAME
+
+    AI::MXNet::MSE - Calculate Mean Squared Error loss
+=head1 DESCRIPTION
+
+    >>> $predicts = [mx->nd->array([3, -0.5, 2, 7])->reshape([4,1])]
+    >>> $labels = [mx->nd->array([2.5, 0.0, 2, 8])->reshape([4,1])]
+    >>> $mean_squared_error = mx->metric->MSE()
+    >>> $mean_squared_error->update($labels, $predicts)
+    >>> print $mean_squared_error->get()
+    ('mse', 0.375)
+
+=cut
+
 package AI::MXNet::MSE;
 use Mouse;
 use AI::MXNet::Base;
@@ -486,7 +782,20 @@ method update(ArrayRef[AI::MXNet::NDArray] $labels, ArrayRef[AI::MXNet::NDArray]
     }
 }

-# Calculate Root Mean Squred Error loss
+=head1 NAME
+
+    AI::MXNet::RMSE - Calculate Root Mean Squared Error loss
+=head1 DESCRIPTION
+
+    >>> $predicts = [mx->nd->array([3, -0.5, 2, 7])->reshape([4,1])]
+    >>> $labels = [mx->nd->array([2.5, 0.0, 2, 8])->reshape([4,1])]
+    >>> $root_mean_squared_error = mx->metric->RMSE()
+    >>> $root_mean_squared_error->update($labels, $predicts)
+    >>> print $root_mean_squared_error->get()
+    ('rmse', 0.612372457981)
+
+=cut
+
 package AI::MXNet::RMSE;
 use Mouse;
 use AI::MXNet::Base;
@@ -509,6 +818,21 @@ method update(ArrayRef[AI::MXNet::NDArray] $labels, ArrayRef[AI::MXNet::NDArray]
     }
 }

+
+=head1 NAME
+
+    AI::MXNet::CrossEntropy - Calculate Cross Entropy loss
+=head1 DESCRIPTION
+
+    >>> $predicts = [mx->nd->array([[0.3, 0.7], [0, 1.], [0.4, 0.6]])]
+    >>> $labels = [mx->nd->array([0, 1, 1])]
+    >>> $ce = mx->metric->CrossEntropy()
+    >>> $ce->update($labels, $predicts)
+    >>> print $ce->get()
+    ('cross-entropy', 0.57159948348999023)
+
+=cut
+
 # Calculate Cross Entropy loss
 package AI::MXNet::CrossEntropy;
 use Mouse;
@@ -537,6 +861,26 @@ method update(ArrayRef[AI::MXNet::NDArray] $labels, ArrayRef[AI::MXNet::NDArray]
     }
 }

+=head1 NAME
+
+    AI::MXNet::NegativeLogLikelihood - Computes the negative log-likelihood loss.
+=head1 DESCRIPTION
+
+    >>> $predicts = [mx->nd->array([[0.3, 0.7], [0, 1.], [0.4, 0.6]])]
+    >>> $labels = [mx->nd->array([0, 1, 1])]
+    >>> $nll_loss = mx->metric->NegativeLogLikelihood
+    >>> $nll_loss->update($labels, $predicts)
+    >>> print $nll_loss->get()
+    ('nll_loss', 0.57159948348999023)
+
+=cut
+
+package AI::MXNet::NegativeLogLikelihood;
+use Mouse;
+use AI::MXNet::Base;
+extends 'AI::MXNet::CrossEntropy';
+has '+name' => (default => 'nll_loss');
+
 package AI::MXNet::PearsonCorrelation;
 use Mouse;
 use AI::MXNet::Base;
@@ -545,7 +889,7 @@ has '+name' => (default => 'pearson-correlation');

 =head1 NAME

-    AI::MXNet::PearsonCorrelation
+    AI::MXNet::PearsonCorrelation - Computes Pearson correlation.
 =cut

 =head1 DESCRIPTION
@@ -594,7 +938,7 @@ has '+name' => (default => 'loss');

 =head1 NAME

-    AI::MXNet::Loss
+    AI::MXNet::Loss - Dummy metric for directly printing loss.
 =cut

 =head1 DESCRIPTION
@@ -621,7 +965,7 @@ use Mouse;

 =head1 NAME

-    AI::MXNet::Confidence
+    AI::MXNet::Confidence - Accuracy by confidence buckets.
 =cut

 =head1 DESCRIPTION
@@ -717,7 +1061,7 @@ sub get

 =head1 NAME

-    AI::MXNet::CustomMetric
+    AI::MXNet::CustomMetric - Custom evaluation metric that takes a sub ref.
 =cut

 =head1 DESCRIPTION
@@ -779,7 +1123,9 @@ my %metrics = qw/
     accuracy            AI::MXNet::Accuracy
     ce                  AI::MXNet::CrossEntropy
     crossentropy        AI::MXNet::CrossEntropy
+    nll_loss            AI::MXNet::NegativeLogLikelihood
     f1                  AI::MXNet::F1
+    mcc                 AI::MXNet::MCC
     mae                 AI::MXNet::MAE
     mse                 AI::MXNet::MSE
     rmse                AI::MXNet::RMSE
diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Module.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Module.pm
index 16c9a92d73ae..38c2ae645969 100644
--- a/perl-package/AI-MXNet/lib/AI/MXNet/Module.pm
+++ b/perl-package/AI-MXNet/lib/AI/MXNet/Module.pm
@@ -268,28 +268,28 @@ method BucketingModule(@args) { return AI::MXNet::Module::Bucketing->new(@args)

     Parameters
     ----------
-    prefix : str
+    $prefix : Str
         path prefix of saved model files. You should have
         "prefix-symbol.json", "prefix-xxxx.params", and
         optionally "prefix-xxxx.states", where xxxx is the epoch number.
-    epoch : int
+    $epoch : Int
         epoch to load.
-    load_optimizer_states : bool
+    $load_optimizer_states=0 : Bool
         whether to load optimizer states. Checkpoint needs
         to have been made with save_optimizer_states=True.
-    data_names : array ref of str
+    :$data_names : array ref of str
         Default is ['data'] for a typical model used in image classification.
-    label_names : array ref of str
+    :$label_names : array ref of str
         Default is ['softmax_label'] for a typical model used in image
        classification.
-    logger : Logger
+    :$logger : Logger
        Default is AI::MXNet::Logging.
-    context : Context or list of Context
+    :$context : Context or list of Context
        Default is cpu(0).
-    work_load_list : array ref of number
+    :$work_load_list : array ref of number
         Default is undef, indicating a uniform workload.
-    fixed_param_names: array ref of str
+    :$fixed_param_names: array ref of str
         Default is undef, indicating no network parameters are fixed.
 =cut

@@ -319,11 +319,11 @@ method load(

     Parameters
     ----------
-    prefix : str
+    $prefix : Str
         The file prefix to checkpoint to
-    epoch : int
+    $epoch : Int
         The current epoch number
-    save_optimizer_states : bool
+    $save_optimizer_states=0 : Bool
         Whether to save optimizer states for later training
 =cut

@@ -348,16 +348,16 @@ method save_checkpoint(Str $prefix, Int $epoch, Bool $save_optimizer_states=0)

     Parameters
     ----------
-    prefix : str
+    $prefix : Str
         Prefix of model name.
-    epoch : int
+    $epoch : Int
         The epoch number of the model.
-    symbol : AI::MXNet::Symbol
+    $symbol : AI::MXNet::Symbol
         The input symbol
-    arg_params : hash ref of str to AI::MXNet::NDArray
-        Model parameter, hash ref of name to AI::MXNet::NDArray of net's weights.
-    aux_params : hash ref of str to NDArray
-        Model parameter, hash ref of name to AI::MXNet::NDArray of net's auxiliary states.
+    $arg_params : HashRef[AI::MXNet::NDArray]
+        Model's parameters, hash ref of name to AI::MXNet::NDArray of net's weights.
+    $aux_params : HashRef[AI::MXNet::NDArray]
+        Model's parameters, hash ref of name to AI::MXNet::NDArray of net's auxiliary states.
     Notes
     -----
     - prefix-symbol.json will be saved for symbol.
diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Monitor.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Monitor.pm
index 0e46c31348a3..76fdfd24e7e8 100644
--- a/perl-package/AI-MXNet/lib/AI/MXNet/Monitor.pm
+++ b/perl-package/AI-MXNet/lib/AI/MXNet/Monitor.pm
@@ -30,13 +30,13 @@ use AI::MXNet::Base;

     Parameters
     ----------
-    interval : int
+    interval : Int
         Number of batches between printing.
-    stat_func : function
+    stat_func : CodeRef
         a function that computes statistics of tensors.
         Takes an NDArray and returns an NDArray. Defaults
         to mean absolute value |x|/size(x).
-    pattern : str
+    pattern : Str
         A regular expression specifying which tensors to monitor.
         Only tensors with names that match name_pattern will be included.
         For example, '.*weight|.*output' will print all weights and outputs;
@@ -94,7 +94,7 @@ has 'stat_helper' => (

     Parameters
     ----------
-    exe : AI::MXNet::Executor
+    $exe : AI::MXNet::Executor
         the Executor (returned by $symbol->bind) to install to.
 =cut
diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/NDArray.pm b/perl-package/AI-MXNet/lib/AI/MXNet/NDArray.pm
index 3177a3705941..873953192933 100644
--- a/perl-package/AI-MXNet/lib/AI/MXNet/NDArray.pm
+++ b/perl-package/AI-MXNet/lib/AI/MXNet/NDArray.pm
@@ -22,6 +22,41 @@ package AI::MXNet::NDArray;

     AI::MXNet::NDArray - Multidimensional tensor object of MXNet.
 =cut

+=head1 DESCRIPTION
+
+    AI::MXNet::NDArray - Imperative tensor operations on CPU/GPU
+    In AI::MXNet, NDArray is the core data structure for all mathematical computations.
+    An NDArray represents a multidimensional, fixed-size homogeneous array.
+    If you’re familiar with the PDL, you might notice some similarities.
+    However, NDArray is row-major, unlike the PDL, which is column-major.
+    Like the PDL, MXNet’s NDArray enables imperative computation.
+
+    Some NDArray advantages compared to PDL:
+    MXNet’s NDArray supports fast execution on a wide range of hardware configurations, including CPU, GPU, and multi-GPU machines.
+    MXNet also scales to distributed systems in the cloud.
+ MXNet’s NDArray executes code lazily, allowing it to automatically parallelize multiple operations across the available hardware. + + An NDArray is a multidimensional array of numbers with the same type. + We could represent the coordinates of a point in 3D space, e.g. [2, 1, 6] as a 1D array with shape (3). + Similarly, we could represent a 2D array. + Below, we present an array with length 2 along the first axis and length 3 along the second axis. + + [[0, 1, 2] + [3, 4, 5]] + Note that here the use of “dimension” is overloaded. When we say a 2D array, we mean an array with 2 axes, not an array with two components. + + Each NDArray supports some important attributes that you’ll often want to query: + + $ndarray->shape: The dimensions of the array. + It is an array ref of integers indicating the length of the array along each axis. + For a matrix with $n rows and $m columns, its shape will be [$n, $m]. + $ndarray->dtype: A string describing the type of its elements. + Dtype (defined in AI::MXNet::Types) is one of (float32 float64 float16 uint8 int8 int32 int64) + $ndarray->size: The total number of components in the array - equal to the product of the components of its shape. + $ndarray->context: The device on which this array is stored, represented by an object of AI::MXNet::Context class, e.g. cpu() or gpu(1). + +=cut + use strict; use warnings; use AI::MXNet::Base; @@ -693,35 +728,6 @@ method onehot_encode(AI::MXNet::NDArray $indices, AI::MXNet::NDArray $out) return __PACKAGE__->_onehot_encode($indices, $out, { out => $out }); } -=head2 _ufunc_helper(lhs, rhs, fn_array, lfn_scalar, rfn_scalar): - - Helper function for element-wise operation - The function will perform numpy-like broadcasting if needed and call different functions - - Parameters - ---------- - lhs : NDArray or numeric value - left hand side operand - - rhs : NDArray or numeric value - right hand side operand - - fn_array : function - function to be called if both lhs and rhs are of NDArray type - - lfn_scalar : function - function to be called if lhs is NDArray while rhs is numeric value - - rfn_scalar : function - function to be called if lhs is numeric value while rhs is NDArray; - if none is provided, then the function is commutative, so rfn_scalar is equal to lfn_scalar - - Returns - ------- - out: NDArray - result array -=cut - sub _ufunc_helper { my ($lhs, $rhs, $fn_array, $lfn_scalar, $rfn_scalar, $reverse) = @_; diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/NDArray/Sparse.pm b/perl-package/AI-MXNet/lib/AI/MXNet/NDArray/Sparse.pm index bb5171c238b6..e0257fd0238b 100644 --- a/perl-package/AI-MXNet/lib/AI/MXNet/NDArray/Sparse.pm +++ b/perl-package/AI-MXNet/lib/AI/MXNet/NDArray/Sparse.pm @@ -356,9 +356,6 @@ extends 'AI::MXNet::NDArray::Sparse'; csr_matrix: Several ways to construct a CSRNDArray =cut -# def __reduce__(self): -# return CSRNDArray, (None,), super(CSRNDArray, self).__getstate__() - use overload '+=' => sub { ($_[0] + $_[1])->copyto($_[0]) }, '-=' => sub { ($_[0] - $_[1])->copyto($_[0]) }, '*=' => sub { ($_[0] * $_[1])->copyto($_[0]) }, diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Optimizer.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Optimizer.pm index fd1316478db1..ad0e45503220 100644 --- a/perl-package/AI-MXNet/lib/AI/MXNet/Optimizer.pm +++ b/perl-package/AI-MXNet/lib/AI/MXNet/Optimizer.pm @@ -63,14 +63,14 @@ method register() Parameters ---------- - name: str + $name: Str Name of required optimizer. Should be the name of a subclass of Optimizer. Case insensitive. 
-    rescale_grad : float
+    :$rescale_grad : Num
         Rescaling factor on gradient. Normally should be 1/batch_size.

-    kwargs: dict
+    %kwargs: Hash
         Parameters for optimizer

     Returns
@@ -290,25 +290,25 @@ method _get_wd(Index $index)

     Parameters
     ----------
-    learning_rate : float, optional
+    learning_rate : Num, optional
         learning_rate of SGD

-    momentum : float, optional
+    momentum : Num, optional
         momentum value

-    wd : float, optional
+    wd : Num, optional
         L2 regularization coefficient added to all the weights

-    rescale_grad : float, optional
+    rescale_grad : Num, optional
         rescaling factor of gradient. Normally should be 1/batch_size.

-    clip_gradient : float, optional
+    clip_gradient : Num, optional
         clip gradient in range [-clip_gradient, clip_gradient]

-    param_idx2name : hash of string/int to float, optional
+    param_idx2name : hash ref of Str/Int to Num, optional
         special treatment of weight decay for parameters whose names end with bias, gamma, and beta

-    multi_precision: bool, optional
+    multi_precision: Bool, optional
         Flag to control the internal precision of the optimizer.
         False results in using the same precision as the weights (default),
         True makes internal 32-bit copy of the weights and applies gradients
@@ -438,18 +438,15 @@ __PACKAGE__->register;

     See the original paper at: https://jeremybernste.in/projects/amazon/signum.pdf

-    For details of the update algorithm see
-    :class:`~mxnet.ndarray.signsgd_update` and :class:`~mxnet.ndarray.signum_update`.
-
     This optimizer accepts the following parameters in addition to those accepted
-    by :class:`.Optimizer`.
+    by AI::MXNet::Optimizer.

     Parameters
     ----------
-    momentum : float, optional
+    momentum : Num, optional
         The momentum value.
-    wd_lh : float, optional
-        The amount of decoupled weight decay regularization, see details in the original paper at:\
+    wd_lh : Num, optional
+        The amount of decoupled weight decay regularization, see details in the original paper at:
         https://arxiv.org/abs/1711.05101
 =cut
@@ -536,11 +533,11 @@ __PACKAGE__->register;

     Parameters
     ----------
-    beta1 : float, optional
+    beta1 : Num, optional
         0 < beta1 < 1. Generally close to 0.5.
-    beta2 : float, optional
+    beta2 : Num, optional
         0 < beta2 < 1. Generally close to 1.
-    epsilon : float, optional
+    epsilon : Num, optional
         Small value to avoid division by 0.
 =cut
@@ -604,12 +601,12 @@ __PACKAGE__->register;

     Parameters
     ----------
-    momentum : float, optional
+    momentum : Num, optional
        The momentum value.
-    multi_precision: bool, optional
+    multi_precision: Bool, optional
        Flag to control the internal precision of the optimizer.
-       ``False`` results in using the same precision as the weights (default),
-       ``True`` makes internal 32-bit copy of the weights and applies gradients
+       0 results in using the same precision as the weights (default),
+       1 makes internal 32-bit copy of the weights and applies gradients
        in 32-bit precision even if actual weights used in the model have lower precision.
        Turning this on can improve convergence and accuracy when training with float16.
    warmup_strategy: string ('linear', 'power2', 'sqrt', 'lars'; default: 'linear')
@@ -896,26 +893,26 @@ extends 'AI::MXNet::Optimizer';

     Parameters
     ----------
-    learning_rate : float, optional
+    learning_rate : Num, optional
         learning_rate of SGD

-    momentum : float, optional
+    momentum : Num, optional
         momentum value

-    lamda : float, optional
+    lamda : Num, optional
         scale DC value

-    wd : float, optional
+    wd : Num, optional
         L2 regularization coefficient added to all the weights

-    rescale_grad : float, optional
+    rescale_grad : Num, optional
         rescaling factor of gradient. Normally should be 1/batch_size.

-    clip_gradient : float, optional
+    clip_gradient : Num, optional
         clip gradient in range [-clip_gradient, clip_gradient]

-    param_idx2name : hash ref of string/int to float, optional
-        special treat weight decay in parameter ends with bias, gamma, and beta
+    param_idx2name : hash ref of Str/Int to Num, optional
+        special treatment of weight decay for parameters that end with bias, gamma, and beta
 =cut
 has 'momentum' => (is => 'ro', isa => 'Num', default => 0);
 has 'lamda'    => (is => 'ro', isa => 'Num', default => 0.04);
@@ -1091,16 +1088,16 @@ __PACKAGE__->register;

     Parameters
     ----------
-    learning_rate : float, optional
+    learning_rate : Num, optional
         learning_rate of SGD

-    wd : float, optional
+    wd : Num, optional
         L2 regularization coefficient added to all the weights

-    rescale_grad : float, optional
+    rescale_grad : Num, optional
         rescaling factor of gradient. Normally should be 1/batch_size.

-    clip_gradient : float, optional
+    clip_gradient : Num, optional
         clip gradient in range [-clip_gradient, clip_gradient]
 =cut
@@ -1158,29 +1155,26 @@ __PACKAGE__->register;

     *Adam: A Method for Stochastic Optimization*,
     http://arxiv.org/abs/1412.6980

-    the code in this class was adapted from
-    https://github.com/mila-udem/blocks/blob/master/blocks/algorithms/__init__.py#L765
-
     Parameters
     ----------
-    learning_rate : float, optional
+    learning_rate : Num, optional
         Step size.
         Default value is set to 0.001.
-    beta1 : float, optional
+    beta1 : Num, optional
         Exponential decay rate for the first moment estimates.
         Default value is set to 0.9.
-    beta2 : float, optional
+    beta2 : Num, optional
         Exponential decay rate for the second moment estimates.
         Default value is set to 0.999.
-    epsilon : float, optional
+    epsilon : Num, optional
         Default value is set to 1e-8.
-    wd : float, optional
+    wd : Num, optional
         L2 regularization coefficient added to all the weights
-    rescale_grad : float, optional
+    rescale_grad : Num, optional
         rescaling factor of gradient. Normally should be 1/batch_size.
-    clip_gradient : float, optional
+    clip_gradient : Num, optional
         clip gradient in range [-clip_gradient, clip_gradient]
 =cut
 package AI::MXNet::Adam;
@@ -1271,21 +1265,21 @@ __PACKAGE__->register;

     Parameters
     ----------
-    learning_rate : float, optional
+    learning_rate : Num, optional
         Step size.
         Default value is set to 0.05.

-    wd : float, optional
+    wd : Num, optional
         L2 regularization coefficient added to all the weights

-    rescale_grad : float, optional
+    rescale_grad : Num, optional
         rescaling factor of gradient. Normally should be 1/batch_size.

-    eps: float, optional
+    eps: Num, optional
         A small float number to make the updating processing stable.
         Default value is set to 1e-7.

-    clip_gradient : float, optional
+    clip_gradient : Num, optional
         clip gradient in range [-clip_gradient, clip_gradient]
 =cut
 package AI::MXNet::AdaGrad;
@@ -1361,27 +1355,27 @@ __PACKAGE__->register;

     Parameters
     ----------
-    learning_rate : float, optional
+    learning_rate : Num, optional
         Step size.
        Default value is set to 0.001.
-    gamma1: float, optional
+    gamma1: Num, optional
        decay factor of moving average for gradient^2.
        Default value is set to 0.9.
-    gamma2: float, optional
+    gamma2: Num, optional
        "momentum" factor. Default value is set to 0.9.
        Only used if centered=True
-    epsilon : float, optional
+    epsilon : Num, optional
        Default value is set to 1e-8.
-    centered : bool, optional
+    centered : Bool, optional
        Use Graves' or Tieleman & Hinton's version of RMSProp
-    wd : float, optional
+    wd : Num, optional
        L2 regularization coefficient added to all the weights
-    rescale_grad : float, optional
+    rescale_grad : Num, optional
        rescaling factor of gradient.
-    clip_gradient : float, optional
+    clip_gradient : Num, optional
        clip gradient in range [-clip_gradient, clip_gradient]
-    clip_weights : float, optional
+    clip_weights : Num, optional
        clip weights in range [-clip_weights, clip_weights]
 =cut
@@ -1508,15 +1502,15 @@ __PACKAGE__->register;

     Parameters
     ----------
-    rho: float
+    rho: Num
        Decay rate for both squared gradients and delta x
-    epsilon : float
+    epsilon : Num
        The constant as described in the thesis
-    wd : float
+    wd : Num
        L2 regularization coefficient added to all the weights
-    rescale_grad : float, optional
+    rescale_grad : Num, optional
        rescaling factor of gradient. Normally should be 1/batch_size.
-    clip_gradient : float, optional
+    clip_gradient : Num, optional
        clip gradient in range [-clip_gradient, clip_gradient]
 =cut
 package AI::MXNet::AdaDelta;
@@ -1614,18 +1608,14 @@ package AI::MXNet::Ftrl;

     Referenced from *Ad Click Prediction: a View from the Trenches*, available at
     http://dl.acm.org/citation.cfm?id=2488200.

-    eta :
-        .. math::
-           \\eta_{t,i} = \\frac{learningrate}{\\beta+\\sqrt{\\sum_{s=1}^tg_{s,i}^2}}
-
-    The optimizer updates the weight by::
+    The optimizer updates the weight by:

        rescaled_grad = clip(grad * rescale_grad, clip_gradient)
        z += rescaled_grad - (sqrt(n + rescaled_grad**2) - sqrt(n)) * weight / learning_rate
        n += rescaled_grad**2
        w = (sign(z) * lamda1 - z) / ((beta + sqrt(n)) / learning_rate + wd) * (abs(z) > lamda1)

-    If the storage types of weight, state and grad are all ``row_sparse``, \
+    If the storage types of weight, state and grad are all row_sparse,
     **sparse updates** are applied by::

        for row in grad.indices:
@@ -1641,18 +1631,16 @@ package AI::MXNet::Ftrl;
     provides slightly different semantics than the original update, and
     may lead to different empirical results.

-    For details of the update algorithm, see :class:`~mxnet.ndarray.ftrl_update`.
-
     This optimizer accepts the following parameters in addition to those accepted
-    by :class:`.Optimizer`.
+    by AI::MXNet::Optimizer.

     Parameters
     ----------
-    lamda1 : float, optional
+    lamda1 : Num, optional
        L1 regularization coefficient.
-    learning_rate : float, optional
+    learning_rate : Num, optional
        The initial learning rate.
-    beta : float, optional
+    beta : Num, optional
        Per-coordinate learning rate correlation parameter.
 =cut
@@ -1720,9 +1708,9 @@ package AI::MXNet::Adamax;

     Parameters
     ----------
-    beta1 : float, optional
+    beta1 : Num, optional
        Exponential decay rate for the first moment estimates.
-    beta2 : float, optional
+    beta2 : Num, optional
        Exponential decay rate for the second moment estimates.
 =cut
@@ -1798,17 +1786,17 @@ package AI::MXNet::Nadam;

     at http://cs229.stanford.edu/proj2015/054_report.pdf.

     This optimizer accepts the following parameters in addition to those accepted
-    AI::MXNet::Optimizer.
+    by AI::MXNet::Optimizer.
     Parameters
     ----------
-    beta1 : float, optional
+    beta1 : Num, optional
        Exponential decay rate for the first moment estimates.
-    beta2 : float, optional
+    beta2 : Num, optional
        Exponential decay rate for the second moment estimates.
-    epsilon : float, optional
+    epsilon : Num, optional
        Small value to avoid division by 0.
-    schedule_decay : float, optional
+    schedule_decay : Num, optional
        Exponential decay rate for the momentum schedule
 =cut
@@ -1879,7 +1867,11 @@ method update(

 __PACKAGE__->register;

-# updater for kvstore
+=head1 NAME
+
+    AI::MXNet::Updater - Updater for kvstore
+=cut
+
 package AI::MXNet::Updater;
 use Mouse;
 use Storable qw(thaw freeze);
diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/RNN/Cell.pm b/perl-package/AI-MXNet/lib/AI/MXNet/RNN/Cell.pm
index f2d8b5369e99..9dd88cbb029e 100644
--- a/perl-package/AI-MXNet/lib/AI/MXNet/RNN/Cell.pm
+++ b/perl-package/AI-MXNet/lib/AI/MXNet/RNN/Cell.pm
@@ -21,7 +21,7 @@ use AI::MXNet::Function::Parameters;

 =head1 NAME

-    AI::MXNet::RNN::Params
+    AI::MXNet::RNN::Params - A container for holding variables.
 =cut

 =head1 DESCRIPTION
diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Symbol.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Symbol.pm
index bccf483d4367..57bfdf1d977c 100644
--- a/perl-package/AI-MXNet/lib/AI/MXNet/Symbol.pm
+++ b/perl-package/AI-MXNet/lib/AI/MXNet/Symbol.pm
@@ -528,7 +528,7 @@ method list_inputs()

 =cut

-method infer_type(Str|Undef @args)
+method infer_type(Maybe[Str] @args)
 {
     my ($positional_arguments, $kwargs, $kwargs_order) = _parse_arguments("Dtype", @args);
     my $sdata = [];
@@ -1370,6 +1370,7 @@ method load(Str $fname)
 }

 =head2 load_json
+
     Load symbol from a JSON string.

     Parameters
@@ -1469,12 +1470,12 @@ sub _parse_arguments
             }
             else
             {
-                confess("Argument need to be of type $type");
+                confess("Argument needs to be of type $type");
             }
         }
         else
         {
-            confess("Argument need to be one type $type");
+            confess("Argument needs to be of one type $type");
         }
     }
     return (\@positional_arguments, \%kwargs, \@kwargs_order);
diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Symbol/Base.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Symbol/Base.pm
index 2cb20b7c5610..d668decc6918 100644
--- a/perl-package/AI-MXNet/lib/AI/MXNet/Symbol/Base.pm
+++ b/perl-package/AI-MXNet/lib/AI/MXNet/Symbol/Base.pm
@@ -32,7 +32,7 @@ use AI::MXNet::Function::Parameters;

 =head1 DESCRIPTION

-    A convenience class that loads all C++m symbol related functions at runtime.
+    A convenience class that loads all C++ symbol-related functions at runtime.
 =cut

 my %function_meta;
diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Symbol/NameManager.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Symbol/NameManager.pm
index 95ea8a6f49ea..0126655186fd 100644
--- a/perl-package/AI-MXNet/lib/AI/MXNet/Symbol/NameManager.pm
+++ b/perl-package/AI-MXNet/lib/AI/MXNet/Symbol/NameManager.pm
@@ -21,7 +21,11 @@ use warnings;
 use Mouse;
 use AI::MXNet::Function::Parameters;

-=head1
+=head1 NAME
+
+    AI::MXNet::Symbol::NameManager - Automated symbol naming.
+
+=head1 DESCRIPTION

     NameManager that does automatic naming.
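    To see the automatic naming in action, here is a short sketch; the generated name is a
    typical default and may differ (for example in its numeric suffix) depending on what was
    created earlier in the process:

        use AI::MXNet qw(mx);

        my $data = mx->sym->Variable('data');
        # No name given: the NameManager derives one from the operator type.
        my $fc1  = mx->sym->FullyConnected(data => $data, num_hidden => 128);
        # An explicit name always wins over the automatic one.
        my $out  = mx->sym->FullyConnected(data => $fc1, num_hidden => 10, name => 'out');
        print $fc1->name, "\n";    # e.g. fullyconnected0
        print $out->name, "\n";    # out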
diff --git a/perl-package/AI-MXNet/t/test_gluon.t b/perl-package/AI-MXNet/t/test_gluon.t index 32127229b95f..545cb7b3f882 100644 --- a/perl-package/AI-MXNet/t/test_gluon.t +++ b/perl-package/AI-MXNet/t/test_gluon.t @@ -1164,7 +1164,7 @@ sub test_zero_grad $net->($data)->backward; }); $net->collect_params->zero_grad; - my $grad = $net->collect_params->{test_zero_grad_weight}->grad; + my $grad = $net->collect_params->params->get('test_zero_grad_weight')->grad; ok(almost_equal($grad->aspdl, $grad->aspdl * 0)); } diff --git a/perl-package/AI-MXNetCAPI/Changes b/perl-package/AI-MXNetCAPI/Changes index 8dad8b455364..938b8e268f1d 100644 --- a/perl-package/AI-MXNetCAPI/Changes +++ b/perl-package/AI-MXNetCAPI/Changes @@ -1,5 +1,8 @@ Revision history for Perl extension AI::MXNetCAPI +1.32 Sun Aug 5 14:25:31 PDT 2018 + - Bugfixes. + 1.3 Tue Jun 26 20:57:40 PDT 2018 - Major update, Gluon interface updated to parity with Python's API diff --git a/perl-package/AI-MXNetCAPI/META.json b/perl-package/AI-MXNetCAPI/META.json index 35271e3edaab..854023559c62 100644 --- a/perl-package/AI-MXNetCAPI/META.json +++ b/perl-package/AI-MXNetCAPI/META.json @@ -37,5 +37,5 @@ } }, "release_status" : "stable", - "version" : "1.3" + "version" : "1.32" } diff --git a/perl-package/AI-MXNetCAPI/META.yml b/perl-package/AI-MXNetCAPI/META.yml index 48760da13629..1db34c501d8c 100644 --- a/perl-package/AI-MXNetCAPI/META.yml +++ b/perl-package/AI-MXNetCAPI/META.yml @@ -19,4 +19,4 @@ no_index: - inc requires: Test::More: '0' -version: '1.3' +version: '1.32' diff --git a/perl-package/AI-MXNetCAPI/README b/perl-package/AI-MXNetCAPI/README index dca8b4a1ee06..f5881ff2db07 100644 --- a/perl-package/AI-MXNetCAPI/README +++ b/perl-package/AI-MXNetCAPI/README @@ -1,4 +1,4 @@ -AI-MXNetCAPI version 1.3 +AI-MXNetCAPI version 1.32 ===================== Swig interface to MXNet c api. diff --git a/perl-package/AI-MXNetCAPI/lib/AI/MXNetCAPI.pm b/perl-package/AI-MXNetCAPI/lib/AI/MXNetCAPI.pm index b578507277d2..e371219b0ae6 100644 --- a/perl-package/AI-MXNetCAPI/lib/AI/MXNetCAPI.pm +++ b/perl-package/AI-MXNetCAPI/lib/AI/MXNetCAPI.pm @@ -18,7 +18,7 @@ package AI::MXNetCAPI; use base qw(DynaLoader); bootstrap AI::MXNetCAPI; -our $VERSION = '1.3'; +our $VERSION = '1.32'; 1; __END__ diff --git a/perl-package/AI-MXNetCAPI/mxnet_typemaps.i b/perl-package/AI-MXNetCAPI/mxnet_typemaps.i index 4d9177a000a0..68e11ca74e1a 100644 --- a/perl-package/AI-MXNetCAPI/mxnet_typemaps.i +++ b/perl-package/AI-MXNetCAPI/mxnet_typemaps.i @@ -1215,5 +1215,5 @@ %typemap(in) (void* callback_handle) { - $1 = (void*)$input; + $1 = (void*)newSVsv($input); } diff --git a/python/mxnet/contrib/__init__.py b/python/mxnet/contrib/__init__.py index fbfd3469678b..606bb0ada54f 100644 --- a/python/mxnet/contrib/__init__.py +++ b/python/mxnet/contrib/__init__.py @@ -32,3 +32,4 @@ from . import io from . import quantization from . import quantization as quant +from . import tensorrt diff --git a/python/mxnet/contrib/tensorrt.py b/python/mxnet/contrib/tensorrt.py new file mode 100644 index 000000000000..4ff39c4b4829 --- /dev/null +++ b/python/mxnet/contrib/tensorrt.py @@ -0,0 +1,110 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+""" Module to enable the use of TensorRT optimized graphs."""
+
+import ctypes
+import logging
+import os
+
+from .. import symbol as sym
+
+from ..base import _LIB, SymbolHandle, MXNetError
+from ..base import check_call
+
+
+def set_use_tensorrt(status):
+    """
+    Set an environment variable which will enable or disable the use of TensorRT in the backend.
+    Note: this is useful for A/B testing purposes.
+    :param status: Boolean, True if TensorRT optimization should be applied, False for legacy
+    behaviour.
+    """
+    os.environ["MXNET_USE_TENSORRT"] = str(int(status))
+
+
+def get_use_tensorrt():
+    """
+    Get an environment variable which describes if TensorRT is currently enabled in the backend.
+    Note: this is useful for A/B testing purposes.
+    :return: Boolean, True if TensorRT optimization should be applied, False for legacy
+    behaviour.
+    """
+    return bool(int(os.environ.get("MXNET_USE_TENSORRT", 0)) == 1)
+
+
+def get_optimized_symbol(executor):
+    """
+    Take an executor's underlying symbol graph and return its generated optimized version.
+
+    Parameters
+    ----------
+    executor : mxnet.Executor
+        An executor for which you want to see an optimized symbol. Getting an optimized symbol
+        is useful to compare and verify the work TensorRT has done against a legacy behaviour.
+
+    Returns
+    -------
+    symbol : nnvm::Symbol
+        The optimized nnvm symbol.
+    """
+    handle = SymbolHandle()
+    try:
+        check_call(_LIB.MXExecutorGetOptimizedSymbol(executor.handle, ctypes.byref(handle)))
+        result = sym.Symbol(handle=handle)
+        return result
+    except MXNetError:
+        logging.error('Error while trying to fetch TRT optimized symbol for graph. Please ensure '
+                      'build was compiled with MXNET_USE_TENSORRT enabled.')
+        raise
+
+
+def tensorrt_bind(symbol, ctx, all_params, type_dict=None, stype_dict=None, group2ctx=None,
+                  **kwargs):
+    """Bind current symbol to get an optimized trt executor.
+
+    Parameters
+    ----------
+    symbol : Symbol
+        The symbol you wish to bind, and optimize with TensorRT.
+
+    ctx : Context
+        The device context for the generated executor to run on.
+
+    all_params : Dict of str->ndarray
+        A dictionary of mappings from parameter names to parameter NDArrays.
+
+    type_dict : Dict of str->numpy.dtype
+        Input type dictionary, name->dtype
+
+    stype_dict : Dict of str->str
+        Input storage type dictionary, name->storage_type
+
+    group2ctx : Dict of string to mx.Context
+        The dict mapping the `ctx_group` attribute to the context assignment.
+
+    kwargs : Dict of str->shape
+        Input shape dictionary, name->shape
+
+    Returns
+    -------
+    executor : mxnet.Executor
+        An optimized TensorRT executor.
+ """ + kwargs['shared_buffer'] = all_params + return symbol.simple_bind(ctx, type_dict=type_dict, stype_dict=stype_dict, + group2ctx=group2ctx, **kwargs) diff --git a/python/mxnet/executor.py b/python/mxnet/executor.py index c0272c5bb433..fcd5406236e9 100644 --- a/python/mxnet/executor.py +++ b/python/mxnet/executor.py @@ -73,6 +73,7 @@ def __init__(self, handle, symbol, ctx, grad_req, group2ctx): self.aux_arrays = [] self.outputs = self._get_outputs() self._symbol = copy.deepcopy(symbol) + self._optimized_symbol = None self._arg_dict = None self._grad_dict = None self._aux_dict = None diff --git a/python/mxnet/gluon/block.py b/python/mxnet/gluon/block.py index f58b00dded2e..d0830dcc8cae 100644 --- a/python/mxnet/gluon/block.py +++ b/python/mxnet/gluon/block.py @@ -208,26 +208,27 @@ def __setattr__(self, name, value): super(Block, self).__setattr__(name, value) def _check_container_with_block(self): - def _find_block_in_container(data): + children = set(self._children.values()) + def _find_unregistered_block_in_container(data): # Find whether a nested container structure contains Blocks if isinstance(data, (list, tuple)): for ele in data: - if _find_block_in_container(ele): + if _find_unregistered_block_in_container(ele): return True return False elif isinstance(data, dict): for _, v in data.items(): - if _find_block_in_container(v): + if _find_unregistered_block_in_container(v): return True return False elif isinstance(data, Block): - return True + return not data in children else: return False for k, v in self.__dict__.items(): if isinstance(v, (list, tuple, dict)) and not (k.startswith('__') or k == '_children'): - if _find_block_in_container(v): - warnings.warn('"{name}" is a container with Blocks. ' + if _find_unregistered_block_in_container(v): + warnings.warn('"{name}" is an unregistered container with Blocks. ' 'Note that Blocks inside the list, tuple or dict will not be ' 'registered automatically. Make sure to register them using ' 'register_child() or switching to ' @@ -656,10 +657,12 @@ def _summary_hook(block, _, outputs): trainable_params += summary[layer]['trainable'] shared_params += summary[layer]['shared'] print('='*80) - print('Total params: ' + str(total_params)) - print('Trainable params: ' + str(trainable_params)) - print('Non-trainable params: ' + str(total_params - trainable_params)) - print('Shared params: ' + str(shared_params)) + print('Parameters in forward computation graph, duplicate included') + print(' Total params: ' + str(total_params)) + print(' Trainable params: ' + str(trainable_params)) + print(' Non-trainable params: ' + str(total_params - trainable_params)) + print('Shared params in forward computation graph: ' + str(shared_params)) + print('Unique parameters in model: ' + str(total_params - shared_params)) print('-'*80) finally: for h in hooks: diff --git a/python/mxnet/gluon/data/dataloader.py b/python/mxnet/gluon/data/dataloader.py index eb1eb419cd02..412d3134476b 100644 --- a/python/mxnet/gluon/data/dataloader.py +++ b/python/mxnet/gluon/data/dataloader.py @@ -16,7 +16,7 @@ # under the License. 
# coding: utf-8 -# pylint: disable= +# pylint: disable=ungrouped-imports """Dataset generator.""" __all__ = ['DataLoader'] @@ -26,6 +26,7 @@ import multiprocessing import multiprocessing.queues from multiprocessing.reduction import ForkingPickler +import threading import numpy as np try: @@ -149,9 +150,18 @@ def default_mp_batchify_fn(data): ctx=context.Context('cpu_shared', 0)) +def _as_in_context(data, ctx): + """Move data into new context.""" + if isinstance(data, nd.NDArray): + return data.as_in_context(ctx) + elif isinstance(data, (list, tuple)): + return [_as_in_context(d, ctx) for d in data] + return data + def worker_loop(dataset, key_queue, data_queue, batchify_fn): """Worker loop for multiprocessing DataLoader.""" - dataset._fork() + if hasattr(dataset, '_fork') and callable(dataset._fork): + dataset._fork() while True: idx, samples = key_queue.get() if idx is None: @@ -159,9 +169,22 @@ def worker_loop(dataset, key_queue, data_queue, batchify_fn): batch = batchify_fn([dataset[i] for i in samples]) data_queue.put((idx, batch)) +def fetcher_loop(data_queue, data_buffer, pin_memory=False): + """Fetcher loop for fetching data from queue and put in reorder dict.""" + while True: + idx, batch = data_queue.get() + if idx is None: + break + if pin_memory: + batch = _as_in_context(batch, context.cpu_pinned()) + else: + batch = _as_in_context(batch, context.cpu()) + data_buffer[idx] = batch + class _MultiWorkerIter(object): """Interal multi-worker iterator for DataLoader.""" - def __init__(self, num_workers, dataset, batchify_fn, batch_sampler): + def __init__(self, num_workers, dataset, batchify_fn, batch_sampler, pin_memory=False, + worker_fn=worker_loop): assert num_workers > 0, "_MultiWorkerIter is not for {} workers".format(num_workers) self._num_workers = num_workers self._dataset = dataset @@ -178,12 +201,18 @@ def __init__(self, num_workers, dataset, batchify_fn, batch_sampler): workers = [] for _ in range(self._num_workers): worker = multiprocessing.Process( - target=worker_loop, + target=worker_fn, args=(self._dataset, self._key_queue, self._data_queue, self._batchify_fn)) worker.daemon = True worker.start() workers.append(worker) + self._fetcher = threading.Thread( + target=fetcher_loop, + args=(self._data_queue, self._data_buffer, pin_memory)) + self._fetcher.daemon = True + self._fetcher.start() + # pre-fetch for _ in range(2 * self._num_workers): self._push_next() @@ -210,13 +239,11 @@ def __next__(self): raise StopIteration while True: - self._push_next() if self._rcvd_idx in self._data_buffer: batch = self._data_buffer.pop(self._rcvd_idx) self._rcvd_idx += 1 + self._push_next() return batch - idx, batch = self._data_queue.get() - self._data_buffer[idx] = batch def next(self): return self.__next__() @@ -229,11 +256,7 @@ def shutdown(self): if not self._shutdown: for _ in range(self._num_workers): self._key_queue.put((None, None)) - try: - while not self._data_queue.empty(): - self._data_queue.get() - except IOError: - pass + self._data_queue.put((None, None)) self._shutdown = True @@ -277,12 +300,16 @@ def default_batchify_fn(data): num_workers : int, default 0 The number of multiprocessing workers to use for data preprocessing. - `num_workers > 0` is not supported on Windows yet. + pin_memory : boolean, default False + If ``True``, the dataloader will copy NDArrays into pinned memory + before returning them. Copying from CPU pinned memory to GPU is faster + than from normal CPU memory. 
""" def __init__(self, dataset, batch_size=None, shuffle=False, sampler=None, last_batch=None, batch_sampler=None, batchify_fn=None, - num_workers=0): + num_workers=0, pin_memory=False): self._dataset = dataset + self._pin_memory = pin_memory if batch_sampler is None: if batch_size is None: @@ -315,13 +342,17 @@ def __init__(self, dataset, batch_size=None, shuffle=False, sampler=None, def __iter__(self): if self._num_workers == 0: - generator = lambda: [(yield self._batchify_fn([self._dataset[idx] for idx in batch])) - for batch in self._batch_sampler] - return generator() + def same_process_iter(): + for batch in self._batch_sampler: + ret = self._batchify_fn([self._dataset[idx] for idx in batch]) + if self._pin_memory: + ret = _as_in_context(ret, context.cpu_pinned()) + yield ret + return same_process_iter() # multi-worker return _MultiWorkerIter(self._num_workers, self._dataset, - self._batchify_fn, self._batch_sampler) + self._batchify_fn, self._batch_sampler, self._pin_memory) def __len__(self): return len(self._batch_sampler) diff --git a/python/mxnet/gluon/nn/activations.py b/python/mxnet/gluon/nn/activations.py index 422301a6a483..fa8eee9d2989 100644 --- a/python/mxnet/gluon/nn/activations.py +++ b/python/mxnet/gluon/nn/activations.py @@ -176,11 +176,9 @@ class SELU(HybridBlock): """ def __init__(self, **kwargs): super(SELU, self).__init__(**kwargs) - self._scale = 1.0507009873554804934193349852946 - self._alpha = 1.6732632423543772848170429916717 def hybrid_forward(self, F, x): - return self._scale * F.where(x > 0, x, self._alpha * (F.exp(x) - 1.0)) + return F.LeakyReLU(x, act_type='selu', name='fwd') class Swish(HybridBlock): diff --git a/python/mxnet/gluon/nn/basic_layers.py b/python/mxnet/gluon/nn/basic_layers.py index ad69d4e9dd90..d26841977ac2 100644 --- a/python/mxnet/gluon/nn/basic_layers.py +++ b/python/mxnet/gluon/nn/basic_layers.py @@ -427,7 +427,7 @@ def __init__(self, **kwargs): super(Flatten, self).__init__(**kwargs) def hybrid_forward(self, F, x): - return x.reshape((0, -1)) + return F.Flatten(x) def __repr__(self): return self.__class__.__name__ diff --git a/python/mxnet/gluon/parameter.py b/python/mxnet/gluon/parameter.py index 0c6aae921352..1f6b86c978c6 100644 --- a/python/mxnet/gluon/parameter.py +++ b/python/mxnet/gluon/parameter.py @@ -319,7 +319,7 @@ def _reduce(self): # fetch all rows for 'row_sparse' param all_row_ids = ndarray.arange(0, self.shape[0], dtype='int64', ctx=ctx) data = ndarray.zeros(self.shape, stype='row_sparse', ctx=ctx) - self._trainer._row_sparse_pull(self, data, all_row_ids) + self._trainer._row_sparse_pull(self, data, all_row_ids, full_idx=True) return data def initialize(self, init=None, ctx=None, default_init=initializer.Uniform(), diff --git a/python/mxnet/gluon/rnn/rnn_layer.py b/python/mxnet/gluon/rnn/rnn_layer.py index 4a7a0be2bc30..d2c6ac9d9f2f 100644 --- a/python/mxnet/gluon/rnn/rnn_layer.py +++ b/python/mxnet/gluon/rnn/rnn_layer.py @@ -21,6 +21,8 @@ # pylint: disable=too-many-lines, arguments-differ """Definition of various recurrent neural network layers.""" from __future__ import print_function +import re + __all__ = ['RNN', 'LSTM', 'GRU'] from ... import ndarray, symbol @@ -92,10 +94,17 @@ def __repr__(self): def _collect_params_with_prefix(self, prefix=''): if prefix: prefix += '.' 
- def convert_key(key): # for compatibility with old parameter format - key = key.split('_') - return '_unfused.{}.{}_cell.{}'.format(key[0][1:], key[0][0], '_'.join(key[1:])) - ret = {prefix + convert_key(key) : val for key, val in self._reg_params.items()} + pattern = re.compile(r'(l|r)(\d)_(i2h|h2h)_(weight|bias)\Z') + def convert_key(m, bidirectional): # for compatibility with old parameter format + d, l, g, t = [m.group(i) for i in range(1, 5)] + if bidirectional: + return '_unfused.{}.{}_cell.{}_{}'.format(l, d, g, t) + else: + return '_unfused.{}.{}_{}'.format(l, g, t) + bidirectional = any(pattern.match(k).group(1) == 'r' for k in self._reg_params) + + ret = {prefix + convert_key(pattern.match(key), bidirectional) : val + for key, val in self._reg_params.items()} for name, child in self._children.items(): ret.update(child._collect_params_with_prefix(prefix + name)) return ret diff --git a/python/mxnet/gluon/trainer.py b/python/mxnet/gluon/trainer.py index 98a6878b94ba..028e66075100 100644 --- a/python/mxnet/gluon/trainer.py +++ b/python/mxnet/gluon/trainer.py @@ -235,14 +235,21 @@ def set_learning_rate(self, lr): else: self._optimizer.set_learning_rate(lr) - def _row_sparse_pull(self, parameter, out, row_id): + def _row_sparse_pull(self, parameter, out, row_id, full_idx=False): + """Internal method to invoke pull operations on KVStore. If `full_idx` is set to True, + `kv.pull` is preferred instead of `kv.row_sparse_pull`. + """ # initialize kv and params if not already if not self._kv_initialized: self._init_kvstore() if self._params_to_init: self._init_params() idx = self._param2idx[parameter.name] - self._kvstore.row_sparse_pull(idx, out=out, row_ids=row_id, priority=-idx) + if full_idx and 'dist' not in self._kvstore.type: + assert row_id.size == out.shape[0] + self._kvstore.pull(idx, out=out, priority=-idx, ignore_sparse=False) + else: + self._kvstore.row_sparse_pull(idx, out=out, row_ids=row_id, priority=-idx) def step(self, batch_size, ignore_stale_grad=False): """Makes one step of parameter update. Should be called after diff --git a/python/mxnet/initializer.py b/python/mxnet/initializer.py index ef9026d45075..8ae729f3ccf9 100755 --- a/python/mxnet/initializer.py +++ b/python/mxnet/initializer.py @@ -697,7 +697,7 @@ class FusedRNN(Initializer): def __init__(self, init, num_hidden, num_layers, mode, bidirectional=False, forget_bias=1.0): if isinstance(init, string_types): klass, kwargs = json.loads(init) - init = _INITIALIZER_REGISTRY[klass.lower()](**kwargs) + init = registry._REGISTRY[klass.lower()](**kwargs) super(FusedRNN, self).__init__(init=init.dumps() if init is not None else None, num_hidden=num_hidden, num_layers=num_layers, mode=mode, bidirectional=bidirectional, forget_bias=forget_bias) diff --git a/python/mxnet/module/base_module.py b/python/mxnet/module/base_module.py index 654e41bf3656..08ab8fa89e49 100644 --- a/python/mxnet/module/base_module.py +++ b/python/mxnet/module/base_module.py @@ -22,6 +22,7 @@ import time import logging import warnings +import numpy as np from .. import metric from .. import ndarray @@ -29,7 +30,7 @@ from ..context import cpu from ..model import BatchEndParam from ..initializer import Uniform -from ..io import DataDesc +from ..io import DataDesc, DataIter, DataBatch from ..base import _as_list @@ -333,7 +334,7 @@ def predict(self, eval_data, num_batch=None, merge_batches=True, reset=True, Parameters ---------- - eval_data : DataIter + eval_data : DataIter or NDArray or numpy array Evaluation data to run prediction on. 
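The regex-based convert_key above replaces the old split-based conversion; run standalone, the mapping it produces for fused-RNN parameter names looks like this:

import re

pattern = re.compile(r'(l|r)(\d)_(i2h|h2h)_(weight|bias)\Z')

def convert_key(m, bidirectional):  # same logic as in rnn_layer.py above
    d, l, g, t = [m.group(i) for i in range(1, 5)]
    if bidirectional:
        return '_unfused.{}.{}_cell.{}_{}'.format(l, d, g, t)
    return '_unfused.{}.{}_{}'.format(l, g, t)

print(convert_key(pattern.match('l0_i2h_weight'), False))  # _unfused.0.i2h_weight
print(convert_key(pattern.match('r1_h2h_bias'), True))     # _unfused.1.r_cell.h2h_bias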
num_batch : int Defaults to ``None``, indicates running all the batches in the data iterator. @@ -363,6 +364,15 @@ def predict(self, eval_data, num_batch=None, merge_batches=True, reset=True, """ assert self.binded and self.params_initialized + if isinstance(eval_data, (ndarray.NDArray, np.ndarray)): + if isinstance(eval_data, np.ndarray): + eval_data = ndarray.array(eval_data) + self.forward(DataBatch([eval_data])) + return self.get_outputs()[0] + + if not isinstance(eval_data, DataIter): + raise ValueError('eval_data must be of type NDArray or DataIter') + if reset: eval_data.reset() diff --git a/python/mxnet/module/executor_group.py b/python/mxnet/module/executor_group.py index 5d8e95077c40..c4050699bd52 100755 --- a/python/mxnet/module/executor_group.py +++ b/python/mxnet/module/executor_group.py @@ -592,8 +592,8 @@ def backward(self, out_grads=None): # pylint: disable=no-member og_my_slice = nd.slice_axis(grad, axis=axis, begin=islice.start, end=islice.stop) - # pylint: enable=no-member out_grads_slice.append(og_my_slice.as_in_context(self.contexts[i])) + # pylint: enable=no-member else: out_grads_slice.append(grad.copyto(self.contexts[i])) exec_.backward(out_grads=out_grads_slice) diff --git a/python/mxnet/module/module.py b/python/mxnet/module/module.py index 4d77e0e4d8c0..a7d3336e8439 100644 --- a/python/mxnet/module/module.py +++ b/python/mxnet/module/module.py @@ -398,7 +398,6 @@ def bind(self, data_shapes, label_shapes=None, for_training=True, self.for_training = for_training self.inputs_need_grad = inputs_need_grad - self.binded = True self._grad_req = grad_req if not for_training: @@ -454,6 +453,8 @@ def bind(self, data_shapes, label_shapes=None, for_training=True, if shared_module is not None and shared_module.optimizer_initialized: self.borrow_optimizer(shared_module) + self.binded = True + def reshape(self, data_shapes, label_shapes=None): """Reshapes the module for new input shapes. diff --git a/python/mxnet/ndarray_doc.py b/python/mxnet/ndarray_doc.py index 0c51036d8208..9d6258a89a3d 100644 --- a/python/mxnet/ndarray_doc.py +++ b/python/mxnet/ndarray_doc.py @@ -105,6 +105,21 @@ class BroadcastToDoc(NDArrayDoc): (2L, 2L, 2L, 3L) """ +class StackDoc(NDArrayDoc): + """ + Example + -------- + Join a sequence of arrays along a new axis. + >>> x = mx.nd.array([1, 2]) + >>> y = mx.nd.array([3, 4]) + >>> stack(x, y) + [[1, 2], + [3, 4]] + >>> stack(x, y, axis=1) + [[1, 3], + [2, 4]] + """ + class CustomDoc(NDArrayDoc): """ Example diff --git a/python/mxnet/optimizer.py b/python/mxnet/optimizer.py index ab7dadb17a54..b69d0c9af0dc 100644 --- a/python/mxnet/optimizer.py +++ b/python/mxnet/optimizer.py @@ -1,3 +1,4 @@ +# coding: utf-8 # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information @@ -15,7 +16,6 @@ # specific language governing permissions and limitations # under the License. -# coding: utf-8 # pylint: disable=too-many-lines """Weight updating functions.""" import logging @@ -548,7 +548,7 @@ def update_multi_precision(self, index, weight, grad, state): @register class Signum(Optimizer): - """The Signum optimizer that takes the sign of gradient or momentum. + r"""The Signum optimizer that takes the sign of gradient or momentum. 
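With the predict change above, a bare NDArray or numpy array can now be fed directly instead of a DataIter; a minimal sketch (the toy symbol and shapes are illustrative):

import numpy as np
import mxnet as mx

data = mx.sym.Variable('data')
net = mx.sym.FullyConnected(data, num_hidden=2, name='fc')

mod = mx.mod.Module(net, data_names=['data'], label_names=None)
mod.bind(data_shapes=[('data', (4, 8))], for_training=False)
mod.init_params()

# predict now wraps the array in a DataBatch and returns the first output.
out = mod.predict(np.ones((4, 8), dtype=np.float32))
print(out.shape)  # (4, 2)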
The optimizer updates the weight by:: @@ -556,7 +556,11 @@ class Signum(Optimizer): state = momentum * state + (1-momentum)*rescaled_grad weight = (1 - lr * wd_lh) * weight - lr * sign(state) - See the original paper at: https://jeremybernste.in/projects/amazon/signum.pdf + Reference: + Jeremy Bernstein, Yu-Xiang Wang, Kamyar Azizzadenesheli & Anima Anandkumar. (2018). + signSGD: Compressed Optimisation for Non-Convex Problems. In ICML'18. + + See: https://arxiv.org/abs/1802.04434 For details of the update algorithm see :class:`~mxnet.ndarray.signsgd_update` and :class:`~mxnet.ndarray.signum_update`. diff --git a/python/mxnet/symbol/contrib.py b/python/mxnet/symbol/contrib.py index 1d42cf7c18f8..38195bd62ffa 100644 --- a/python/mxnet/symbol/contrib.py +++ b/python/mxnet/symbol/contrib.py @@ -127,7 +127,7 @@ def _cut_subgraph(subg): # This construct a subgraph for given output nodes. # If an output node is one of the input nodes, we call identity to make sure # that outputs nodes are different from input nodes. -def _construct_subgraph(sym_out, sym_states): +def _construct_subgraph(sym_out, sym_states, name): sym_out = _as_list(sym_out) sym_states = _as_list(sym_states) all_outputs = [] @@ -137,18 +137,16 @@ def _construct_subgraph(sym_out, sym_states): flat_out = [] all_input_names = g.list_inputs() - output_names = [o.name for o in sym_out] + output_names = {o.name for o in sym_out} for o in sym_out: - if o.name in all_input_names: + if o.name in all_input_names or o.list_attr().get("__subgraph_name__", "") != name: flat_out.append(symbol.op.identity(o)) else: flat_out.append(o) for s in sym_states: - if s.name in all_input_names or s.name in output_names: - # There is a problem if the outputs are the same as the inputs - # or the first output. By calling identity, we can make sure that - # all symbols will refer to different NDArrays. 
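For readers who want to sanity-check the update rule quoted above, here is a minimal NumPy sketch of one Signum step (gradient rescaling and clipping details of the real operator are simplified away):

import numpy as np

def signum_step(weight, grad, state, lr, momentum, wd_lh, rescale_grad=1.0):
    rescaled_grad = rescale_grad * grad
    state[:] = momentum * state + (1.0 - momentum) * rescaled_grad
    weight[:] = (1.0 - lr * wd_lh) * weight - lr * np.sign(state)

w = np.ones(3)
g = np.array([0.5, -2.0, 0.0])
s = np.zeros(3)
signum_step(w, g, s, lr=0.01, momentum=0.9, wd_lh=1e-4)
print(w)  # each coordinate moves by -lr * sign(state), regardless of |grad|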
+ if s.name in all_input_names or s.name in output_names or \ + s.list_attr().get("__subgraph_name__", "") != name: flat_out.append(symbol.op.identity(s)) else: flat_out.append(s) @@ -256,7 +254,7 @@ def check_data(inputs, in_type, msg): num_out_data = len(sym_out) num_states = len(sym_states) num_outputs = num_out_data + num_states - g = _construct_subgraph(sym_out, sym_states) + g = _construct_subgraph(sym_out, sym_states, name) input_syms = _get_graph_inputs(g) cut_syms = _cut_subgraph(g) @@ -469,9 +467,12 @@ def _create_subgraph(graph_vars, graph_func, subgraph_name): num_outputs = len(outputs) + len(final_state) # nnvm cut-graph does not allow inputs and outputs overlap # so we calculate the name of inputs, and copy outputs once it overlaps with inputs - all_input_names = symbol.Group(outputs + final_state).list_inputs() - make_identity = lambda x: symbol.op.identity(x) if x.name in all_input_names else x # group all outputs of graph_func + all_input_names = symbol.Group(outputs + final_state).list_inputs() + in_input = lambda x: x.name in all_input_names + in_graph = lambda x: x.list_attr().get("__subgraph_name__", "") == subgraph_name + make_identity = lambda x: symbol.op.identity(x) if in_input(x) or not in_graph(x) \ + else x graph = symbol.Group(list(map(make_identity, outputs + final_state))) return graph, num_out_data, num_outputs @@ -627,9 +628,12 @@ def _create_subgraph(graph_vars, graph_func, subgraph_name): num_outputs = len(outputs) # nnvm cut-graph does not allow inputs and outputs overlap # so we calculate the name of inputs, and copy outputs once it overlaps with inputs - all_input_names = symbol.Group(outputs).list_inputs() - make_identity = lambda x: symbol.op.identity(x) if x.name in all_input_names else x # group all outputs of graph_func + all_input_names = symbol.Group(outputs).list_inputs() + in_input = lambda x: x.name in all_input_names + in_graph = lambda x: x.list_attr().get("__subgraph_name__", "") == subgraph_name + make_identity = lambda x: symbol.op.identity(x) if in_input(x) or not in_graph(x) \ + else x graph = symbol.Group(list(map(make_identity, outputs))) return graph, num_outputs diff --git a/python/mxnet/symbol/image.py b/python/mxnet/symbol/image.py index 7624bcced217..db03ca49a300 100644 --- a/python/mxnet/symbol/image.py +++ b/python/mxnet/symbol/image.py @@ -19,7 +19,7 @@ # pylint: disable=wildcard-import, unused-wildcard-import """Image Symbol API of MXNet.""" try: - from .gen_iamge import * + from .gen_image import * except ImportError: pass diff --git a/python/mxnet/test_utils.py b/python/mxnet/test_utils.py index e963d158446d..69d916ef85e3 100644 --- a/python/mxnet/test_utils.py +++ b/python/mxnet/test_utils.py @@ -479,10 +479,8 @@ def assert_almost_equal(a, b, rtol=None, atol=None, names=('a', 'b'), equal_nan= """ rtol = get_rtol(rtol) atol = get_atol(atol) - if almost_equal(a, b, rtol, atol, equal_nan=equal_nan): return - index, rel = find_max_violation(a, b, rtol, atol) np.set_printoptions(threshold=4, suppress=True) msg = npt.build_err_msg([a, b], @@ -1203,10 +1201,10 @@ def check_speed(sym, location=None, ctx=None, N=20, grad_req=None, typ="whole", else: raise ValueError('typ can only be "whole" or "forward".') - def check_consistency(sym, ctx_list, scale=1.0, grad_req='write', arg_params=None, aux_params=None, tol=None, - raise_on_err=True, ground_truth=None, equal_nan=False, use_uniform=False): + raise_on_err=True, ground_truth=None, equal_nan=False, + use_uniform=False, rand_type=np.float64): """Check symbol gives the same output 
for different running context Parameters @@ -1223,6 +1221,11 @@ def check_consistency(sym, ctx_list, scale=1.0, grad_req='write', Optional, When flag set to true, random input data generated follows uniform distribution, not normal distribution + rand_type: np.dtype + casts the randomly generated data to this type + Optional, when input data is passed via arg_params, + defaults to np.float64 (numpy float default) + Examples -------- >>> # create the symbol @@ -1283,9 +1286,11 @@ def check_consistency(sym, ctx_list, scale=1.0, grad_req='write', for n, arr in exe_list[0].arg_dict.items(): if n not in arg_params: if use_uniform: - arg_params[n] = np.random.uniform(low=-0.92, high=0.92, size=arr.shape) + arg_params[n] = np.random.uniform(low=-0.92, high=0.92, + size=arr.shape).astype(rand_type) else: - arg_params[n] = np.random.normal(size=arr.shape, scale=scale) + arg_params[n] = np.random.normal(size=arr.shape, + scale=scale).astype(rand_type) for n, arr in exe_list[0].aux_dict.items(): if n not in aux_params: aux_params[n] = 0 diff --git a/python/setup.py b/python/setup.py index ec8414c85218..add5e6681fe6 100644 --- a/python/setup.py +++ b/python/setup.py @@ -89,7 +89,7 @@ def config_cython(): ret.append(Extension( "mxnet/%s/.%s" % (subdir, fn[:-4]), ["mxnet/cython/%s" % fn], - include_dirs=["../include/", "../3rdparty/nnvm/include"], + include_dirs=["../include/", "../3rdparty/tvm/nnvm/include"], library_dirs=library_dirs, libraries=libraries, language="c++")) diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index ed513c0d7785..1ef3f0fca9f3 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -122,6 +122,12 @@ int MXGetGPUCount(int* out) { API_END(); } +int MXGetGPUMemoryInformation(int dev, int *free_mem, int *total_mem) { + API_BEGIN(); + Context::GetGPUMemoryInformation(dev, free_mem, total_mem); + API_END(); +} + int MXGetVersion(int *out) { API_BEGIN(); *out = static_cast(MXNET_VERSION); diff --git a/src/c_api/c_api_executor.cc b/src/c_api/c_api_executor.cc index 09bc23934e5a..b99350525bfa 100644 --- a/src/c_api/c_api_executor.cc +++ b/src/c_api/c_api_executor.cc @@ -26,6 +26,10 @@ #include #include #include "./c_api_common.h" +#include "../executor/graph_executor.h" +#if MXNET_USE_TENSORRT +#include "../executor/trt_graph_executor.h" +#endif // MXNET_USE_TENSORRT int MXExecutorPrint(ExecutorHandle handle, const char **out_str) { Executor *exec = static_cast(handle); @@ -439,13 +443,38 @@ int MXExecutorSimpleBind(SymbolHandle symbol_handle, std::vector in_arg_vec; std::vector arg_grad_vec; std::vector aux_state_vec; - - *out = Executor::SimpleBind(*sym, ctx, ctx_map, in_arg_ctx_vec, arg_grad_ctx_vec, - aux_state_ctx_vec, arg_shape_map, arg_dtype_map, arg_stype_map, - grad_req_type_vec, shared_arg_name_set, &in_arg_vec, - &arg_grad_vec, &aux_state_vec, - use_shared_buffer ? &shared_buffer_map : nullptr, - reinterpret_cast(shared_exec_handle)); +#if MXNET_USE_TENSORRT + // If we've built with TensorRT support we by default return an TRTExecutor. + // Users can override this behaviour via env var, which is useful for example for A/B + // performance testing. + if (dmlc::GetEnv("MXNET_USE_TENSORRT", false)) { + *out = exec::TrtGraphExecutor::TensorRTBind(*sym, ctx, ctx_map, &in_arg_ctx_vec, + &arg_grad_ctx_vec, &aux_state_ctx_vec, + &arg_shape_map, &arg_dtype_map, &arg_stype_map, + &grad_req_type_vec, shared_arg_name_set, + &in_arg_vec, &arg_grad_vec, &aux_state_vec, + use_shared_buffer ? 
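A sketch of how the new rand_type argument is meant to be used: when one of the contexts runs in reduced precision, generating the shared random inputs in that precision avoids spurious mismatches. The convolution symbol and shapes here are illustrative.

import numpy as np
import mxnet as mx
from mxnet.test_utils import check_consistency

sym = mx.sym.Convolution(num_filter=3, kernel=(3, 3), name='conv')
ctx_list = [{'ctx': mx.gpu(0), 'conv_data': (2, 3, 8, 8),
             'type_dict': {'conv_data': np.float16}},
            {'ctx': mx.cpu(0), 'conv_data': (2, 3, 8, 8),
             'type_dict': {'conv_data': np.float32}}]
# Cast the randomly generated inputs to float16 so both contexts start from
# values that are exactly representable in the lower-precision run.
check_consistency(sym, ctx_list, rand_type=np.float16)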
&shared_buffer_map : nullptr, + reinterpret_cast(shared_exec_handle)); + } else { + // Checks to see if this env var has been set to true or false by the user. + // If the user is using a TensorRT build, but has not enabled TRT at inference time, warn + // them and describe further steps. + const int unset_indicator = std::numeric_limits::quiet_NaN(); + if (dmlc::GetEnv("MXNET_USE_TENSORRT", unset_indicator) == unset_indicator) { + LOG(INFO) << "TensorRT not enabled by default. Please set the MXNET_USE_TENSORRT " + "environment variable to 1 or call mx.contrib.tensorrt.set_use_tensorrt(True) " + "to enable."; + } +#endif // MXNET_USE_TENSORRT + *out = Executor::SimpleBind(*sym, ctx, ctx_map, in_arg_ctx_vec, arg_grad_ctx_vec, + aux_state_ctx_vec, arg_shape_map, arg_dtype_map, arg_stype_map, + grad_req_type_vec, shared_arg_name_set, &in_arg_vec, + &arg_grad_vec, &aux_state_vec, + use_shared_buffer ? &shared_buffer_map : nullptr, + reinterpret_cast(shared_exec_handle)); +#if MXNET_USE_TENSORRT + } +#endif // MXNET_USE_TENSORRT // copy ndarray ptrs to ret->handles so that front end // can access them @@ -597,6 +626,25 @@ int MXExecutorReshape(int partial_shaping, API_END_HANDLE_ERROR(delete out); } +int MXExecutorGetOptimizedSymbol(ExecutorHandle handle, + SymbolHandle *out) { + auto s = new nnvm::Symbol(); + API_BEGIN(); + +#if MXNET_USE_TENSORRT + auto exec = static_cast(handle); + *s = exec->GetOptimizedSymbol(); + *out = s; +#else + LOG(FATAL) << "GetOptimizedSymbol may only be used when MXNet is compiled with " + "MXNET_USE_TENSORRT enabled. Please re-compile MXNet with TensorRT support."; +#endif // MXNET_USE_TENSORRT + + API_END_HANDLE_ERROR(delete s); +} + + + int MXExecutorSetMonitorCallback(ExecutorHandle handle, ExecutorMonitorCallback callback, void* callback_handle) { diff --git a/src/c_api/c_api_symbolic.cc b/src/c_api/c_api_symbolic.cc index c27a59a67c6e..35ecec7e11f6 100644 --- a/src/c_api/c_api_symbolic.cc +++ b/src/c_api/c_api_symbolic.cc @@ -372,13 +372,13 @@ int MXSymbolCutSubgraph(SymbolHandle sym, SymbolHandle **input_symbols, // a subgraph. API_BEGIN(); nnvm::Symbol *s = static_cast(sym); - std::string subg_attr = "__subgraph_name__"; + const std::string subg_attr = "__subgraph_name__"; auto out_node = s->outputs[0].node; auto it = out_node->attrs.dict.find(subg_attr); if (it != out_node->attrs.dict.end()) { - std::string subg_name = it->second; + const std::string &subg_name = it->second; std::vector input_entries; - DFSVisit(s->outputs, [subg_attr, subg_name, &input_entries] + DFSVisit(s->outputs, [&subg_attr, &subg_name, &input_entries] (nnvm::NodePtr n) { // If the node itself isn't in the subgraph, we ignore it. 
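The MXGetGPUMemoryInformation entry point added to c_api.cc above has no Python wrapper in this diff, but it can be reached through the loaded library handle; a hedged ctypes sketch, assuming a CUDA build and device 0:

import ctypes
from mxnet.base import _LIB, check_call

free_mem = ctypes.c_int()
total_mem = ctypes.c_int()
# Matches the signature introduced above:
# int MXGetGPUMemoryInformation(int dev, int* free_mem, int* total_mem)
check_call(_LIB.MXGetGPUMemoryInformation(0, ctypes.byref(free_mem),
                                          ctypes.byref(total_mem)))
print(free_mem.value, total_mem.value)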
auto it = n->attrs.dict.find(subg_attr); diff --git a/src/c_api/c_predict_api.cc b/src/c_api/c_predict_api.cc index becb0cb364f6..d84a89ab2133 100644 --- a/src/c_api/c_predict_api.cc +++ b/src/c_api/c_predict_api.cc @@ -140,6 +140,7 @@ int MXPredCreatePartialOut(const char* symbol_json_str, } sym = nnvm::Symbol::CreateGroup(out_syms); } + ret->sym = sym; // load the parameters std::unordered_map arg_params, aux_params; @@ -214,6 +215,7 @@ int MXPredCreatePartialOut(const char* symbol_json_str, } Context ctx = Context::Create(static_cast(dev_type), dev_id); + ret->ctx = ctx; std::vector arg_arrays, aux_arrays; for (size_t i = 0; i < arg_shapes.size(); ++i) { @@ -231,6 +233,7 @@ int MXPredCreatePartialOut(const char* symbol_json_str, aux_arrays.push_back(nd); } ret->arg_arrays = arg_arrays; + ret->aux_arrays = aux_arrays; // bind { std::map ctx_map; @@ -309,7 +312,6 @@ int MXPredReshape(mx_uint num_input_nodes, << " shape has been changed, only allow to change the shape of input data."; } } - p->arg_arrays.clear(); for (size_t i=0; i < aux_names.size(); ++i) { TShape newShape = aux_shapes[i]; @@ -319,7 +321,6 @@ int MXPredReshape(mx_uint num_input_nodes, << " shape has been changed, only allow to change the shape of input data."; } ret->aux_arrays = p->aux_arrays; - p->aux_arrays.clear(); // bind { diff --git a/src/common/exec_utils.h b/src/common/exec_utils.h index 816599b955c1..fbe544221a35 100644 --- a/src/common/exec_utils.h +++ b/src/common/exec_utils.h @@ -24,10 +24,14 @@ #ifndef MXNET_COMMON_EXEC_UTILS_H_ #define MXNET_COMMON_EXEC_UTILS_H_ +#include +#include +#include #include #include #include #include "../common/utils.h" +#include "../executor/exec_pass.h" namespace mxnet { namespace common { @@ -366,6 +370,257 @@ inline void LogInferStorage(const nnvm::Graph& g) { } } +// prints a helpful message after shape inference errors in executor. +inline void HandleInferShapeError(const size_t num_forward_inputs, + const nnvm::IndexedGraph& idx, + const nnvm::ShapeVector& inferred_shapes) { + int cnt = 10; + std::ostringstream oss; + for (size_t i = 0; i < num_forward_inputs; ++i) { + const uint32_t nid = idx.input_nodes().at(i); + const uint32_t eid = idx.entry_id(nid, 0); + const TShape& inferred_shape = inferred_shapes[eid]; + if (inferred_shape.ndim() == 0 || inferred_shape.Size() == 0U) { + const std::string& arg_name = idx[nid].source->attrs.name; + oss << arg_name << ": " << inferred_shape << ", "; + if (--cnt == 0) { + oss << "..."; + break; + } + } + } + LOG(FATAL) << "InferShape pass cannot decide shapes for the following arguments " + "(0s means unknown dimensions). Please consider providing them as inputs:\n" + << oss.str(); +} + +// prints a helpful message after type inference errors in executor. +inline void HandleInferTypeError(const size_t num_forward_inputs, + const nnvm::IndexedGraph& idx, + const nnvm::DTypeVector& inferred_dtypes) { + int cnt = 10; + std::ostringstream oss; + for (size_t i = 0; i < num_forward_inputs; ++i) { + const uint32_t nid = idx.input_nodes().at(i); + const uint32_t eid = idx.entry_id(nid, 0); + const int inferred_dtype = inferred_dtypes[eid]; + if (inferred_dtype == -1) { + const std::string& arg_name = idx[nid].source->attrs.name; + oss << arg_name << ": " << inferred_dtype << ", "; + if (--cnt == 0) { + oss << "..."; + break; + } + } + } + LOG(FATAL) << "InferType pass cannot decide dtypes for the following arguments " + "(-1 means unknown dtype). 
Please consider providing them as inputs:\n" + << oss.str(); +} + +// prints a helpful message after storage type checking errors in executor. +inline void HandleInferStorageTypeError(const size_t num_forward_inputs, + const nnvm::IndexedGraph& idx, + const StorageTypeVector& inferred_stypes) { + int cnt = 10; + std::ostringstream oss; + for (size_t i = 0; i < num_forward_inputs; ++i) { + const uint32_t nid = idx.input_nodes().at(i); + const uint32_t eid = idx.entry_id(nid, 0); + const int inferred_stype = inferred_stypes[eid]; + if (inferred_stype == -1) { + const std::string& arg_name = idx[nid].source->attrs.name; + oss << arg_name << ": " << common::stype_string(inferred_stype) << ", "; + if (--cnt == 0) { + oss << "..."; + break; + } + } + } + LOG(FATAL) << "InferStorageType pass cannot decide storage type for the following arguments " + "(-1 means unknown stype). Please consider providing them as inputs:\n" + << oss.str(); +} + +/*! + * \brief If the requested ndarray's shape size is less than + * the corresponding shared_data_array's shape size and the + * storage type is shareable, reuse the memory allocation + * in shared_buffer; otherwise, create a zero ndarray. + * Shareable storages include both default storage and row_sparse storage + * if enable_row_sparse_sharing is `True`, otherwise default storage only. + */ +inline NDArray ReshapeOrCreate(const std::string& name, + const TShape& dest_arg_shape, + const int dest_arg_dtype, + const NDArrayStorageType dest_arg_stype, + const Context& ctx, + std::unordered_map* shared_buffer, + bool enable_row_sparse_sharing) { + bool stype_shareable = dest_arg_stype == kDefaultStorage; + if (enable_row_sparse_sharing) { + stype_shareable = stype_shareable || dest_arg_stype == kRowSparseStorage; + } + auto it = shared_buffer->find(name); + if (it != shared_buffer->end()) { + // check if size is large enough for sharing + bool size_shareable = it->second.shape().Size() >= dest_arg_shape.Size(); + if (size_shareable && stype_shareable) { // memory can be reused + CHECK_EQ(it->second.dtype(), dest_arg_dtype) + << "Requested arg array's dtype does not match that of the reusable ndarray"; + CHECK_EQ(it->second.storage_type(), dest_arg_stype) + << "Requested arg array's stype does not match that of the reusable ndarray"; + return it->second.Reshape(dest_arg_shape); + } else if (stype_shareable) { + LOG(WARNING) << "Bucketing: data " << name << " has a shape " << dest_arg_shape + << ", which is larger than already allocated shape " << it->second.shape() + << ". Need to re-allocate. Consider putting default bucket key to be " + << "the bucket taking the largest input for better memory sharing."; + // size is not large enough, creating a larger one for sharing + // the NDArrays in shared_buffer are guaranteed to be of shareable storages + it->second = InitZeros(dest_arg_stype, dest_arg_shape, ctx, dest_arg_dtype); + return it->second; + } else { + // not shareable storage + return InitZeros(dest_arg_stype, dest_arg_shape, ctx, dest_arg_dtype); + } + } else { + auto ret = InitZeros(dest_arg_stype, dest_arg_shape, ctx, dest_arg_dtype); + if (stype_shareable) { + shared_buffer->emplace(name, ret); + } + return ret; + } // if (it != shared_buffer->end()) +} + +/*! + * \brief Assign context to the graph. + * This is triggered by both simple_bind and bind flows. 
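Before the AssignContext helper continues below, the reuse policy of ReshapeOrCreate can be summarized in a few lines of Python; this is a conceptual rendering only (NumPy stands in for NDArray, and sparse storage types are ignored):

import numpy as np

def reshape_or_create(name, shape, shared_buffer):
    needed = int(np.prod(shape))
    arr = shared_buffer.get(name)
    if arr is not None and arr.size >= needed:
        # large enough: reuse the existing allocation via a reshape/view
        return arr.ravel()[:needed].reshape(shape)
    # absent or too small: allocate zeros and remember them for future sharing
    arr = np.zeros(shape)
    shared_buffer[name] = arr
    return arr

buf = {}
a = reshape_or_create('w', (4, 4), buf)   # allocates a 16-element buffer
b = reshape_or_create('w', (2, 2), buf)   # reuses that buffer as a view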
+ */ +inline nnvm::Graph AssignContext(nnvm::Graph g, + const Context& default_ctx, + const std::map& ctx_map, + const std::vector& in_arg_ctxes, + const std::vector& arg_grad_ctxes, + const std::vector& aux_state_ctxes, + const std::vector& grad_req_types, + size_t num_forward_inputs, + size_t num_forward_outputs) { + const auto& idx = g.indexed_graph(); + const auto& mutable_nodes = idx.mutable_input_nodes(); + // default use default context. + if (ctx_map.size() == 0) { + g.attrs["context"] = std::make_shared( + exec::ContextVector(idx.num_nodes(), default_ctx)); + for (const auto& x : in_arg_ctxes) { + CHECK(x == default_ctx) + << "Input array is in " << x << " while binding with ctx=" << default_ctx + << ". All arguments must be in global context (" << default_ctx + << ") unless group2ctx is specified for cross-device graph."; + } + for (const auto& x : arg_grad_ctxes) { + CHECK(x == default_ctx) + << "Gradient array is in " << x << " while binding with ctx=" + << default_ctx << ". All gradients must be in global context (" << default_ctx + << ") unless group2ctx is specified for cross-device graph."; + } + return g; + } + + // otherwise, use context assignment. + std::map ctx2id; // map ctx to device id + std::vector ctx_list; // index is device id + nnvm::DeviceVector device(idx.num_nodes(), -1); // index is node id + nnvm::DeviceAssignMap device_map; // map arg name to device id + + // loop through the user input ctx_map and + // populate maps and lists + for (auto &kv : ctx_map) { + if (ctx2id.count(kv.second) == 0) { // if context has no device id, create one + ctx2id[kv.second] = static_cast(ctx_list.size()); // assign device id to ctx + ctx_list.push_back(kv.second); // save ctx to the list + } + // assign device id to to the arg name with the corresponding ctx + device_map[kv.first] = ctx2id.at(kv.second); + } + + // loop through all the rest of input nodes not specified + // in the ctx_map and populate maps and lists + size_t arg_top = 0, aux_top = 0; + for (size_t i = 0; i < num_forward_inputs; ++i) { + const uint32_t nid = idx.input_nodes().at(i); + Context ctx; + if (mutable_nodes.count(nid)) { // aux node is mutable + CHECK_LT(aux_top, aux_state_ctxes.size()); + ctx = aux_state_ctxes[aux_top]; + ++aux_top; + } else { // regular input node is immutable + CHECK_LT(arg_top, in_arg_ctxes.size()); + ctx = in_arg_ctxes[arg_top]; + ++arg_top; + } + if (ctx2id.count(ctx) == 0) { // if the current ctx is not in the map of ctx and device id + ctx2id[ctx] = static_cast(ctx_list.size()); // assign the current ctx with device id + ctx_list.push_back(ctx); // save the current ctx in the list + } + device[nid] = ctx2id.at(ctx); // assign device id to the current node + } + + // loop through backward input nodes and populate maps and lists + // the backward input nodes is the gradient of the loss wrt the output + size_t arg_grad_offset = 0; + // keep an offset into the arg_grad_ctxes vector, + // since g.outputs exclude arg_grad whose req == null + CHECK_GE(grad_req_types.size(), g.outputs.size() - num_forward_outputs) + << "insufficient number of grad_reqs"; + for (size_t i = num_forward_outputs; i < g.outputs.size(); ++i, ++arg_grad_offset) { + while (grad_req_types[arg_grad_offset] == kNullOp) ++arg_grad_offset; + const uint32_t nid = idx.outputs()[i].node_id; + Context ctx = arg_grad_ctxes[arg_grad_offset]; + if (ctx2id.count(ctx) == 0) { + ctx2id[ctx] = static_cast(ctx_list.size()); + ctx_list.push_back(ctx); + } + int devid = ctx2id.at(ctx); + if (device[nid] != -1) { + 
CHECK_EQ(device[nid], devid) << "device of same output not equal to each other"; + } else { + device[nid] = devid; + } + } + + g.attrs["device"] = std::make_shared(std::move(device)); + g = nnvm::pass::PlaceDevice(g, "__ctx_group__", device_map, "_CrossDeviceCopy"); + const auto& assigned_device = g.GetAttr("device"); + + exec::ContextVector vcontext; + for (size_t i = 0; i < assigned_device.size(); ++i) { + if (assigned_device[i] == -1) { + vcontext.push_back(default_ctx); + } else { + vcontext.push_back(ctx_list[assigned_device[i]]); + } + } + + // after device planning, we should check again + // if the assigned device of gradient node + // corresponds to storage of grads + auto &new_idx = g.indexed_graph(); + arg_grad_offset = 0; + for (size_t i = num_forward_outputs; i < g.outputs.size(); ++i, ++arg_grad_offset) { + while (grad_req_types[arg_grad_offset] == kNullOp) ++arg_grad_offset; + const uint32_t nid = new_idx.outputs()[i].node_id; + Context ctx = arg_grad_ctxes[arg_grad_offset]; + CHECK(ctx == vcontext[nid]) + << "Trying to save gradient to " << ctx + << " while its source node \"" << new_idx[nid].source->attrs.name + << "\" computes it on " << vcontext[nid] + << ". Check your ctx in NDArray allocation."; + } + + g.attrs["context"] = std::make_shared(std::move(vcontext)); + return g; +} } // namespace common } // namespace mxnet diff --git a/src/common/serialization.h b/src/common/serialization.h new file mode 100644 index 000000000000..8a1bcc6e6ed2 --- /dev/null +++ b/src/common/serialization.h @@ -0,0 +1,319 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * Copyright (c) 2015 by Contributors + * \file serialization.h + * \brief Serialization of some STL and nnvm data-structures + * \author Clement Fuji Tsang + */ + +#ifndef MXNET_COMMON_SERIALIZATION_H_ +#define MXNET_COMMON_SERIALIZATION_H_ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace mxnet { +namespace common { + +template +inline size_t SerializedSize(const T &obj); + +template +inline size_t SerializedSize(const nnvm::Tuple &obj); + +template +inline size_t SerializedSize(const std::map &obj); + +template<> +inline size_t SerializedSize(const std::string &obj); + +template +inline size_t SerializedSize(const std::tuple &obj); + +template +inline void Serialize(const T &obj, char **buffer); + +template +inline void Serialize(const nnvm::Tuple &obj, char **buffer); + +template +inline void Serialize(const std::map &obj, char **buffer); + +template<> +inline void Serialize(const std::string &obj, char **buffer); + +template +inline void Serialize(const std::tuple &obj, char **buffer); + +template +inline void Deserialize(T *obj, const std::string &buffer, size_t *curr_pos); + +template +inline void Deserialize(nnvm::Tuple *obj, const std::string &buffer, size_t *curr_pos); + +template +inline void Deserialize(std::map *obj, const std::string &buffer, size_t *curr_pos); + +template<> +inline void Deserialize(std::string *obj, const std::string &buffer, size_t *curr_pos); + +template +inline void Deserialize(std::tuple *obj, const std::string &buffer, size_t *curr_pos); + + +template +struct is_container { + static const bool value = !std::is_pod::value; +}; + +template +inline size_t SerializedSize(const T &obj) { + return sizeof(T); +} + +template +inline size_t SerializedSize(const nnvm::Tuple &obj) { + if (is_container::value) { + size_t sum_val = 4; + for (const auto& el : obj) { + sum_val += SerializedSize(el); + } + return sum_val; + } else { + return 4 + (obj.ndim() * sizeof(T)); + } +} + +template +inline size_t SerializedSize(const std::map &obj) { + size_t sum_val = 4; + if (is_container::value && is_container::value) { + for (const auto& p : obj) { + sum_val += SerializedSize(p.first) + SerializedSize(p.second); + } + } else if (is_container::value) { + for (const auto& p : obj) { + sum_val += SerializedSize(p.first); + } + sum_val += sizeof(V) * obj.size(); + } else if (is_container::value) { + for (const auto& p : obj) { + sum_val += SerializedSize(p.second); + } + sum_val += sizeof(K) * obj.size(); + } else { + sum_val += (sizeof(K) + sizeof(V)) * obj.size(); + } + return sum_val; +} + +template<> +inline size_t SerializedSize(const std::string &obj) { + return obj.size() + 4; +} + +template +struct serialized_size_tuple { + template + static inline size_t Compute(const std::tuple &obj) { + return SerializedSize(std::get(obj)) + serialized_size_tuple::Compute(obj); + } +}; + +template<> +struct serialized_size_tuple<0> { + template + static inline size_t Compute(const std::tuple &obj) { + return SerializedSize(std::get<0>(obj)); + } +}; + +template +inline size_t SerializedSize(const std::tuple &obj) { + return serialized_size_tuple::Compute(obj); +} + +// Serializer + +template +inline size_t SerializedContainerSize(const T &obj, char **buffer) { + uint32_t size = obj.size(); + std::memcpy(*buffer, &size, 4); + *buffer += 4; + return (size_t) size; +} + +template +inline void Serialize(const T &obj, char **buffer) { + std::memcpy(*buffer, &obj, sizeof(T)); + *buffer += 
sizeof(T); +} + +template +inline void Serialize(const nnvm::Tuple &obj, char **buffer) { + uint32_t size = obj.ndim(); + std::memcpy(*buffer, &size, 4); + *buffer += 4; + for (auto& el : obj) { + Serialize(el, buffer); + } +} + +template +inline void Serialize(const std::map &obj, char **buffer) { + SerializedContainerSize(obj, buffer); + for (auto& p : obj) { + Serialize(p.first, buffer); + Serialize(p.second, buffer); + } +} + +template<> +inline void Serialize(const std::string &obj, char **buffer) { + auto size = SerializedContainerSize(obj, buffer); + std::memcpy(*buffer, &obj[0], size); + *buffer += size; +} + +template +struct serialize_tuple { + template + static inline void Compute(const std::tuple &obj, char **buffer) { + serialize_tuple::Compute(obj, buffer); + Serialize(std::get(obj), buffer); + } +}; + +template<> +struct serialize_tuple<0> { + template + static inline void Compute(const std::tuple &obj, char **buffer) { + Serialize(std::get<0>(obj), buffer); + } +}; + +template +inline void Serialize(const std::tuple &obj, char **buffer) { + serialize_tuple::Compute(obj, buffer); +} + +// Deserializer + +template +inline size_t DeserializedContainerSize(T *obj, const std::string &buffer, size_t *curr_pos) { + uint32_t size = obj->size(); + std::memcpy(&size, &buffer[*curr_pos], 4); + *curr_pos += 4; + return (size_t) size; +} + +template +inline void Deserialize(T *obj, const std::string &buffer, size_t *curr_pos) { + std::memcpy(obj, &buffer[*curr_pos], sizeof(T)); + *curr_pos += sizeof(T); +} + +template +inline void Deserialize(nnvm::Tuple *obj, const std::string &buffer, size_t *curr_pos) { + uint32_t size = obj->ndim(); + std::memcpy(&size, &buffer[*curr_pos], 4); + *curr_pos += 4; + obj->SetDim(size); + for (size_t i = 0; i < size; ++i) { + Deserialize((*obj)[i], buffer, curr_pos); + } +} + +template +inline void Deserialize(std::map *obj, const std::string &buffer, size_t *curr_pos) { + auto size = DeserializedContainerSize(obj, buffer, curr_pos); + K first; + for (size_t i = 0; i < size; ++i) { + Deserialize(&first, buffer, curr_pos); + Deserialize(&(*obj)[first], buffer, curr_pos); + } +} + +template<> +inline void Deserialize(std::string *obj, const std::string &buffer, size_t *curr_pos) { + auto size = DeserializedContainerSize(obj, buffer, curr_pos); + obj->resize(size); + std::memcpy(&(obj->front()), &buffer[*curr_pos], size); + *curr_pos += size; +} + +template +struct deserialize_tuple { + template + static inline void Compute(std::tuple *obj, + const std::string &buffer, size_t *curr_pos) { + deserialize_tuple::Compute(obj, buffer, curr_pos); + Deserialize(&std::get(*obj), buffer, curr_pos); + } +}; + +template<> +struct deserialize_tuple<0> { + template + static inline void Compute(std::tuple *obj, + const std::string &buffer, size_t *curr_pos) { + Deserialize(&std::get<0>(*obj), buffer, curr_pos); + } +}; + +template +inline void Deserialize(std::tuple *obj, const std::string &buffer, size_t *curr_pos) { + deserialize_tuple::Compute(obj, buffer, curr_pos); +} + + +template +inline void Serialize(const T& obj, std::string* serialized_data) { + serialized_data->resize(SerializedSize(obj)); + char* curr_pos = &(serialized_data->front()); + Serialize(obj, &curr_pos); + CHECK_EQ((int64_t)curr_pos - (int64_t)&(serialized_data->front()), + serialized_data->size()); +} + +template +inline void Deserialize(T* obj, const std::string& serialized_data) { + size_t curr_pos = 0; + Deserialize(obj, serialized_data, &curr_pos); + CHECK_EQ(curr_pos, serialized_data.size()); 
+} + +} // namespace common +} // namespace mxnet +#endif // MXNET_COMMON_SERIALIZATION_H_ diff --git a/src/common/utils.h b/src/common/utils.h index 96949a047fba..fcc3da82b051 100644 --- a/src/common/utils.h +++ b/src/common/utils.h @@ -675,6 +675,37 @@ MSHADOW_XINLINE int ilog2ui(unsigned int a) { return k; } +/*! + * \brief Return an NDArray of all zeros. + */ +inline NDArray InitZeros(const NDArrayStorageType stype, const TShape &shape, + const Context &ctx, const int dtype) { + // NDArray with default storage + if (stype == kDefaultStorage) { + NDArray ret(shape, ctx, false, dtype); + ret = 0; + return ret; + } + // NDArray with non-default storage. Storage allocation is always delayed. + return NDArray(stype, shape, ctx, true, dtype); +} + +/*! + * \brief Helper to add a NDArray of zeros to a std::vector. + */ +inline void EmplaceBackZeros(const NDArrayStorageType stype, const TShape &shape, + const Context &ctx, const int dtype, + std::vector *vec) { + // NDArray with default storage + if (stype == kDefaultStorage) { + vec->emplace_back(shape, ctx, false, dtype); + vec->back() = 0; + } else { + // NDArray with non-default storage. Storage allocation is always delayed. + vec->emplace_back(stype, shape, ctx, true, dtype); + } +} + } // namespace common } // namespace mxnet #endif // MXNET_COMMON_UTILS_H_ diff --git a/src/executor/exec_pass.h b/src/executor/exec_pass.h index 26a249118940..8c483e9b2b8e 100644 --- a/src/executor/exec_pass.h +++ b/src/executor/exec_pass.h @@ -198,6 +198,18 @@ Graph InferStorageType(Graph&& graph, StorageTypeVector&& storage_type_inputs = StorageTypeVector(), const std::string& storage_type_attr_key = ""); +#if MXNET_USE_TENSORRT +/*! + * \brief Replace subgraphs by TRT (forward only) + */ +Graph ReplaceSubgraph(Graph&& g, + const std::unordered_set& set_subgraph, + std::unordered_map* const params_map); + +std::vector> GetTrtCompatibleSubsets(const Graph& g, + std::unordered_map* const params_map); +#endif + } // namespace exec } // namespace mxnet diff --git a/src/executor/graph_executor.cc b/src/executor/graph_executor.cc index 33c6f574a044..0e8070670904 100644 --- a/src/executor/graph_executor.cc +++ b/src/executor/graph_executor.cc @@ -37,6 +37,8 @@ namespace mxnet { namespace exec { +using namespace mxnet::common; + GraphExecutor::GraphExecutor() { log_verbose_ = dmlc::GetEnv("MXNET_EXEC_VERBOSE_LOGGING", false); need_grad_ = false; @@ -56,30 +58,6 @@ GraphExecutor::~GraphExecutor() { } } -inline NDArray InitZeros(const NDArrayStorageType stype, const TShape &shape, - const Context &ctx, const int dtype) { - // NDArray with default storage - if (stype == kDefaultStorage) { - NDArray ret(shape, ctx, false, dtype); - ret = 0; - return ret; - } - // NDArray with non-default storage. Storage allocation is always delayed. - return NDArray(stype, shape, ctx, true, dtype); -} - -inline void EmplaceBackZeros(const NDArrayStorageType stype, const TShape &shape, - const Context &ctx, const int dtype, - std::vector *vec) { - // NDArray with default storage - if (stype == kDefaultStorage) { - vec->emplace_back(shape, ctx, false, dtype); - vec->back() = 0; - } else { - // NDArray with non-default storage. Storage allocation is always delayed. - vec->emplace_back(stype, shape, ctx, true, dtype); - } -} void GraphExecutor::Forward(bool is_train) { RunOps(is_train, 0, num_forward_nodes_); } @@ -308,204 +286,6 @@ nnvm::Graph GraphExecutor::InitFullGraph(nnvm::Symbol symbol, return g; } -/*! - * \brief Assign context to the graph. 
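The serialization helpers above all follow one convention: each container is preceded by a 4-byte element count (native byte order), strings are written as raw bytes, and POD values are memcpy'd. A hedged Python sketch of the string/map case, for illustration only:

import struct

def serialize_str(s, out):
    out += struct.pack('=I', len(s))   # 4-byte size prefix, as in SerializedContainerSize
    return out + s.encode()

def serialize_str_map(d):
    out = struct.pack('=I', len(d))    # 4-byte entry count for the map itself
    for k, v in d.items():
        out = serialize_str(k, out)
        out = serialize_str(v, out)
    return out

def deserialize_str(buf, pos):
    (n,) = struct.unpack_from('=I', buf, pos)
    return buf[pos + 4:pos + 4 + n].decode(), pos + 4 + n

blob = serialize_str_map({'op': 'conv', 'layout': 'NCHW'})
(count,) = struct.unpack_from('=I', blob, 0)
pos = 4
for _ in range(count):
    k, pos = deserialize_str(blob, pos)
    v, pos = deserialize_str(blob, pos)
    print(k, '->', v)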
- * This is triggered by both simple_bind and bind flows. - */ -static Graph AssignContext(Graph g, - const Context& default_ctx, - const std::map& ctx_map, - const std::vector& in_arg_ctxes, - const std::vector& arg_grad_ctxes, - const std::vector& aux_state_ctxes, - const std::vector& grad_req_types, - size_t num_forward_inputs, - size_t num_forward_outputs) { - const auto& idx = g.indexed_graph(); - const auto& mutable_nodes = idx.mutable_input_nodes(); - // default use default context. - if (ctx_map.size() == 0) { - g.attrs["context"] = std::make_shared( - ContextVector(idx.num_nodes(), default_ctx)); - for (const auto& x : in_arg_ctxes) { - CHECK(x == default_ctx) - << "Input array is in " << x << " while binding with ctx=" << default_ctx - << ". All arguments must be in global context (" << default_ctx - << ") unless group2ctx is specified for cross-device graph."; - } - for (const auto& x : arg_grad_ctxes) { - CHECK(x == default_ctx) - << "Gradient array is in " << x << " while binding with ctx=" - << default_ctx << ". All gradients must be in global context (" << default_ctx - << ") unless group2ctx is specified for cross-device graph."; - } - return g; - } - - // otherwise, use context assignment. - std::map ctx2id; // map ctx to device id - std::vector ctx_list; // index is device id - nnvm::DeviceVector device(idx.num_nodes(), -1); // index is node id - nnvm::DeviceAssignMap device_map; // map arg name to device id - - // loop through the user input ctx_map and - // populate maps and lists - for (auto &kv : ctx_map) { - if (ctx2id.count(kv.second) == 0) { // if context has no device id, create one - ctx2id[kv.second] = static_cast(ctx_list.size()); // assign device id to ctx - ctx_list.push_back(kv.second); // save ctx to the list - } - // assign device id to to the arg name with the corresponding ctx - device_map[kv.first] = ctx2id.at(kv.second); - } - - // loop through all the rest of input nodes not specified - // in the ctx_map and populate maps and lists - size_t arg_top = 0, aux_top = 0; - for (size_t i = 0; i < num_forward_inputs; ++i) { - const uint32_t nid = idx.input_nodes().at(i); - Context ctx; - if (mutable_nodes.count(nid)) { // aux node is mutable - CHECK_LT(aux_top, aux_state_ctxes.size()); - ctx = aux_state_ctxes[aux_top]; - ++aux_top; - } else { // regular input node is immutable - CHECK_LT(arg_top, in_arg_ctxes.size()); - ctx = in_arg_ctxes[arg_top]; - ++arg_top; - } - if (ctx2id.count(ctx) == 0) { // if the current ctx is not in the map of ctx and device id - ctx2id[ctx] = static_cast(ctx_list.size()); // assign the current ctx with device id - ctx_list.push_back(ctx); // save the current ctx in the list - } - device[nid] = ctx2id.at(ctx); // assign device id to the current node - } - - // loop through backward input nodes and populate maps and lists - // the backward input nodes is the gradient of the loss wrt the output - size_t arg_grad_offset = 0; - // keep an offset into the arg_grad_ctxes vector, - // since g.outputs exclude arg_grad whose req == null - CHECK_GE(grad_req_types.size(), g.outputs.size() - num_forward_outputs) - << "insufficient number of grad_reqs"; - for (size_t i = num_forward_outputs; i < g.outputs.size(); ++i, ++arg_grad_offset) { - while (grad_req_types[arg_grad_offset] == kNullOp) ++arg_grad_offset; - const uint32_t nid = idx.outputs()[i].node_id; - Context ctx = arg_grad_ctxes[arg_grad_offset]; - if (ctx2id.count(ctx) == 0) { - ctx2id[ctx] = static_cast(ctx_list.size()); - ctx_list.push_back(ctx); - } - int devid = ctx2id.at(ctx); 
- if (device[nid] != -1) { - CHECK_EQ(device[nid], devid) << "device of same output not equal to each other"; - } else { - device[nid] = devid; - } - } - - g.attrs["device"] = std::make_shared(std::move(device)); - g = nnvm::pass::PlaceDevice(g, "__ctx_group__", device_map, "_CrossDeviceCopy"); - const auto& assigned_device = g.GetAttr("device"); - - ContextVector vcontext; - for (size_t i = 0; i < assigned_device.size(); ++i) { - if (assigned_device[i] == -1) { - vcontext.push_back(default_ctx); - } else { - vcontext.push_back(ctx_list[assigned_device[i]]); - } - } - - // after device planning, we should check again - // if the assigned device of gradient node - // corresponds to storage of grads - auto &new_idx = g.indexed_graph(); - arg_grad_offset = 0; - for (size_t i = num_forward_outputs; i < g.outputs.size(); ++i, ++arg_grad_offset) { - while (grad_req_types[arg_grad_offset] == kNullOp) ++arg_grad_offset; - const uint32_t nid = new_idx.outputs()[i].node_id; - Context ctx = arg_grad_ctxes[arg_grad_offset]; - CHECK(ctx == vcontext[nid]) - << "Trying to save gradient to " << ctx - << " while its source node \"" << new_idx[nid].source->attrs.name - << "\" computes it on " << vcontext[nid] - << ". Check your ctx in NDArray allocation."; - } - - g.attrs["context"] = std::make_shared(std::move(vcontext)); - return g; -} - -static void HandleInferShapeError(const size_t num_forward_inputs, - const nnvm::IndexedGraph& idx, - const nnvm::ShapeVector& inferred_shapes) { - int cnt = 10; - std::ostringstream oss; - for (size_t i = 0; i < num_forward_inputs; ++i) { - const uint32_t nid = idx.input_nodes().at(i); - const uint32_t eid = idx.entry_id(nid, 0); - const TShape& inferred_shape = inferred_shapes[eid]; - if (inferred_shape.ndim() == 0 || inferred_shape.Size() == 0U) { - const std::string& arg_name = idx[nid].source->attrs.name; - oss << arg_name << ": " << inferred_shape << ", "; - if (--cnt == 0) { - oss << "..."; - break; - } - } - } - LOG(FATAL) << "InferShape pass cannot decide shapes for the following arguments " - "(0s means unknown dimensions). Please consider providing them as inputs:\n" - << oss.str(); -} - -static void HandleInferTypeError(const size_t num_forward_inputs, - const nnvm::IndexedGraph& idx, - const nnvm::DTypeVector& inferred_dtypes) { - int cnt = 10; - std::ostringstream oss; - for (size_t i = 0; i < num_forward_inputs; ++i) { - const uint32_t nid = idx.input_nodes().at(i); - const uint32_t eid = idx.entry_id(nid, 0); - const int inferred_dtype = inferred_dtypes[eid]; - if (inferred_dtype == -1) { - const std::string& arg_name = idx[nid].source->attrs.name; - oss << arg_name << ": " << inferred_dtype << ", "; - if (--cnt == 0) { - oss << "..."; - break; - } - } - } - LOG(FATAL) << "InferType pass cannot decide dtypes for the following arguments " - "(-1 means unknown dtype). 
Please consider providing them as inputs:\n" - << oss.str(); -} - -static void HandleInferStorageTypeError(const size_t num_forward_inputs, - const nnvm::IndexedGraph& idx, - const StorageTypeVector& inferred_stypes) { - int cnt = 10; - std::ostringstream oss; - for (size_t i = 0; i < num_forward_inputs; ++i) { - const uint32_t nid = idx.input_nodes().at(i); - const uint32_t eid = idx.entry_id(nid, 0); - const int inferred_stype = inferred_stypes[eid]; - if (inferred_stype == -1) { - const std::string& arg_name = idx[nid].source->attrs.name; - oss << arg_name << ": " << common::stype_string(inferred_stype) << ", "; - if (--cnt == 0) { - oss << "..."; - break; - } - } - } - LOG(FATAL) << "InferStorageType pass cannot decide storage type for the following arguments " - "(-1 means unknown stype). Please consider providing them as inputs:\n" - << oss.str(); -} - /*! * \brief GraphExecutor initializer for regular bind flow in which * input arguments and gradients are provided by users. This initializer @@ -680,57 +460,6 @@ void GraphExecutor::InitArguments(const nnvm::IndexedGraph& idx, } } -/*! - * \brief If the requested ndarray's shape size is less than - * the corresponding shared_data_array's shape size and the - * storage type is shareable, reuse the memory allocation - * in shared_buffer; otherwise, create a zero ndarray. - * Shareable storages include both default storage and row_sparse storage - * if enable_row_sparse_sharing is `True`, otherwise default storage only. - */ -static NDArray ReshapeOrCreate(const std::string& name, - const TShape& dest_arg_shape, - const int dest_arg_dtype, - const NDArrayStorageType dest_arg_stype, - const Context& ctx, - std::unordered_map* shared_buffer, - bool enable_row_sparse_sharing) { - bool stype_shareable = dest_arg_stype == kDefaultStorage; - if (enable_row_sparse_sharing) { - stype_shareable = stype_shareable || dest_arg_stype == kRowSparseStorage; - } - auto it = shared_buffer->find(name); - if (it != shared_buffer->end()) { - // check if size is large enough for sharing - bool size_shareable = it->second.shape().Size() >= dest_arg_shape.Size(); - if (size_shareable && stype_shareable) { // memory can be reused - CHECK_EQ(it->second.dtype(), dest_arg_dtype) - << "Requested arg array's dtype does not match that of the reusable ndarray"; - CHECK_EQ(it->second.storage_type(), dest_arg_stype) - << "Requested arg array's stype does not match that of the reusable ndarray"; - return it->second.Reshape(dest_arg_shape); - } else if (stype_shareable) { - LOG(WARNING) << "Bucketing: data " << name << " has a shape " << dest_arg_shape - << ", which is larger than already allocated shape " << it->second.shape() - << ". Need to re-allocate. Consider putting default bucket key to be " - << "the bucket taking the largest input for better memory sharing."; - // size is not large enough, creating a larger one for sharing - // the NDArrays in shared_buffer are guaranteed to be of shareable storages - it->second = InitZeros(dest_arg_stype, dest_arg_shape, ctx, dest_arg_dtype); - return it->second; - } else { - // not shareable storage - return InitZeros(dest_arg_stype, dest_arg_shape, ctx, dest_arg_dtype); - } - } else { - auto ret = InitZeros(dest_arg_stype, dest_arg_shape, ctx, dest_arg_dtype); - if (stype_shareable) { - shared_buffer->emplace(name, ret); - } - return ret; - } // if (it != shared_buffer->end()) -} - /*! 
* \brief Initialize in_args, arg_grads, and aux_states * and their data_entry_ of the executor using diff --git a/src/executor/graph_executor.h b/src/executor/graph_executor.h index bfc415b4526a..7b936c300254 100644 --- a/src/executor/graph_executor.h +++ b/src/executor/graph_executor.h @@ -163,20 +163,21 @@ class GraphExecutor : public Executor { std::vector* aux_state_vec); // Initialize in_args, arg_grads and aux_states with // shared_buffer and shared_exec - void InitArguments(const nnvm::IndexedGraph& idx, - const nnvm::ShapeVector& inferred_shapes, - const nnvm::DTypeVector& inferred_dtypes, - const StorageTypeVector& inferred_stypes, - const std::vector& in_arg_ctxes, - const std::vector& arg_grad_ctxes, - const std::vector& aux_state_ctxes, - const std::vector& grad_req_types, - const std::unordered_set& shared_arg_names, - const Executor* shared_exec, - std::unordered_map* shared_buffer, - std::vector* in_arg_vec, - std::vector* arg_grad_vec, - std::vector* aux_state_vec); + virtual void InitArguments(const nnvm::IndexedGraph& idx, + const nnvm::ShapeVector& inferred_shapes, + const nnvm::DTypeVector& inferred_dtypes, + const StorageTypeVector& inferred_stypes, + const std::vector& in_arg_ctxes, + const std::vector& arg_grad_ctxes, + const std::vector& aux_state_ctxes, + const std::vector& grad_req_types, + const std::unordered_set& shared_arg_names, + const Executor* shared_exec, + std::unordered_map* shared_buffer, + std::vector* in_arg_vec, + std::vector* arg_grad_vec, + std::vector* aux_state_vec); + // internal initialization of the graph for simple bind Graph InitGraph(nnvm::Symbol symbol, const Context& default_ctx, @@ -212,7 +213,6 @@ class GraphExecutor : public Executor { void BulkInferenceOpSegs(); // perform bulking and segmentation on a training graph void BulkTrainingOpSegs(size_t total_num_nodes); - // indicate whether there is a backward graph for gradients. bool need_grad_; // internal graph diff --git a/src/executor/onnx_to_tensorrt.cc b/src/executor/onnx_to_tensorrt.cc new file mode 100644 index 000000000000..0b4d91be7009 --- /dev/null +++ b/src/executor/onnx_to_tensorrt.cc @@ -0,0 +1,148 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * Copyright (c) 2018 by Contributors + * \file onnx_to_tensorrt.cc + * \brief TensorRT integration with the MXNet executor + * \author Marek Kolodziej, Clement Fuji Tsang + */ + +#if MXNET_USE_TENSORRT + +#include "./onnx_to_tensorrt.h" + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +using std::cout; +using std::cerr; +using std::endl; + +namespace onnx_to_tensorrt { + +struct InferDeleter { + template + void operator()(T* obj) const { + if ( obj ) { + obj->destroy(); + } + } +}; + +template +inline std::shared_ptr InferObject(T* obj) { + if ( !obj ) { + throw std::runtime_error("Failed to create object"); + } + return std::shared_ptr(obj, InferDeleter()); +} + +std::string onnx_ir_version_string(int64_t ir_version = onnx::IR_VERSION) { + int onnx_ir_major = ir_version / 1000000; + int onnx_ir_minor = ir_version % 1000000 / 10000; + int onnx_ir_patch = ir_version % 10000; + return (std::to_string(onnx_ir_major) + "." + + std::to_string(onnx_ir_minor) + "." + + std::to_string(onnx_ir_patch)); +} + +void PrintVersion() { + cout << "Parser built against:" << endl; + cout << " ONNX IR version: " << onnx_ir_version_string(onnx::IR_VERSION) << endl; + cout << " TensorRT version: " + << NV_TENSORRT_MAJOR << "." + << NV_TENSORRT_MINOR << "." + << NV_TENSORRT_PATCH << endl; +} + +nvinfer1::ICudaEngine* onnxToTrtCtx( + const std::string& onnx_model, + int32_t max_batch_size, + size_t max_workspace_size, + nvinfer1::ILogger::Severity verbosity, + bool debug_builder) { + GOOGLE_PROTOBUF_VERIFY_VERSION; + + TRT_Logger trt_logger(verbosity); + auto trt_builder = InferObject(nvinfer1::createInferBuilder(trt_logger)); + auto trt_network = InferObject(trt_builder->createNetwork()); + auto trt_parser = InferObject(nvonnxparser::createParser( + *trt_network, trt_logger)); + ::ONNX_NAMESPACE::ModelProto parsed_model; + // We check for a valid parse, but the main effect is the side effect + // of populating parsed_model + if (!parsed_model.ParseFromString(onnx_model)) { + throw dmlc::Error("Could not parse ONNX from string"); + } + + if ( !trt_parser->parse(onnx_model.c_str(), onnx_model.size()) ) { + int nerror = trt_parser->getNbErrors(); + for ( int i=0; i < nerror; ++i ) { + nvonnxparser::IParserError const* error = trt_parser->getError(i); + if ( error->node() != -1 ) { + ::ONNX_NAMESPACE::NodeProto const& node = + parsed_model.graph().node(error->node()); + cerr << "While parsing node number " << error->node() + << " [" << node.op_type(); + if ( !node.output().empty() ) { + cerr << " -> \"" << node.output(0) << "\""; + } + cerr << "]:" << endl; + if ( static_cast(verbosity) >= \ + static_cast(nvinfer1::ILogger::Severity::kINFO) ) { + cerr << "--- Begin node ---" << endl; + cerr << node.DebugString() << endl; + cerr << "--- End node ---" << endl; + } + } + cerr << "ERROR: " + << error->file() << ":" << error->line() + << " In function " << error->func() << ":\n" + << "[" << static_cast(error->code()) << "] " << error->desc() + << endl; + } + throw dmlc::Error("Cannot parse ONNX into TensorRT Engine"); + } + + bool fp16 = trt_builder->platformHasFastFp16(); + + trt_builder->setMaxBatchSize(max_batch_size); + trt_builder->setMaxWorkspaceSize(max_workspace_size); + if ( fp16 && dmlc::GetEnv("MXNET_TENSORRT_USE_FP16_FOR_FP32", false) ) { + LOG(INFO) << "WARNING: TensorRT using fp16 given original MXNet graph in fp32 !!!"; + trt_builder->setHalf2Mode(true); + } + + trt_builder->setDebugSync(debug_builder); + nvinfer1::ICudaEngine* trt_engine = 
trt_builder->buildCudaEngine(*trt_network.get()); + return trt_engine; +} + +} // namespace onnx_to_tensorrt + +#endif // MXNET_USE_TENSORRT diff --git a/src/executor/onnx_to_tensorrt.h b/src/executor/onnx_to_tensorrt.h new file mode 100644 index 000000000000..259cfce7c332 --- /dev/null +++ b/src/executor/onnx_to_tensorrt.h @@ -0,0 +1,77 @@ +#ifndef MXNET_EXECUTOR_ONNX_TO_TENSORRT_H_ +#define MXNET_EXECUTOR_ONNX_TO_TENSORRT_H_ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2018 by Contributors + * \file onnx_to_tensorrt.h + * \brief TensorRT integration with the MXNet executor + * \author Marek Kolodziej, Clement Fuji Tsang + */ + +#if MXNET_USE_TENSORRT + +#include +#include +#include +#include +#include + +#include "../operator/contrib/tensorrt-inl.h" + +namespace onnx_to_tensorrt { + +class TRT_Logger : public nvinfer1::ILogger { + nvinfer1::ILogger::Severity _verbosity; + std::ostream* _ostream; + public: + TRT_Logger(Severity verbosity = Severity::kWARNING, + std::ostream& ostream = std::cout) + : _verbosity(verbosity), _ostream(&ostream) {} + void log(Severity severity, const char* msg) override { + if ( severity <= _verbosity ) { + time_t rawtime = std::time(0); + char buf[256]; + strftime(&buf[0], 256, + "%Y-%m-%d %H:%M:%S", + std::gmtime(&rawtime)); + const char* sevstr = (severity == Severity::kINTERNAL_ERROR ? " BUG" : + severity == Severity::kERROR ? " ERROR" : + severity == Severity::kWARNING ? "WARNING" : + severity == Severity::kINFO ? " INFO" : + "UNKNOWN"); + (*_ostream) << "[" << buf << " " << sevstr << "] " + << msg + << std::endl; + } + } +}; + +nvinfer1::ICudaEngine* onnxToTrtCtx( + const std::string& onnx_model, + int32_t max_batch_size = 32, + size_t max_workspace_size = 1L << 30, + nvinfer1::ILogger::Severity verbosity = nvinfer1::ILogger::Severity::kWARNING, + bool debug_builder = false); +} // namespace onnx_to_tensorrt + +#endif // MXNET_USE_TENSORRT + +#endif // MXNET_EXECUTOR_ONNX_TO_TENSORRT_H_ diff --git a/src/executor/tensorrt_pass.cc b/src/executor/tensorrt_pass.cc new file mode 100644 index 000000000000..b5fc8d15f7ac --- /dev/null +++ b/src/executor/tensorrt_pass.cc @@ -0,0 +1,596 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2018 by Contributors + * \file tensorrt_pass.cc + * \brief Replace TRT compatible subgraphs by TRT engines + * \author Clement Fuji Tsang + */ + +#if MXNET_USE_TENSORRT + +#include +#include +#include +#include +#include +#include + +#include "../operator/contrib/nnvm_to_onnx-inl.h" +#include "./exec_pass.h" +#include "./onnx_to_tensorrt.h" + +namespace mxnet { +namespace exec { + +using NodePtr = nnvm::NodePtr; + +/*! + * \brief Custom graph class, which will contain bi-directional nodes + * we need to compute DFS and reverse DFS for graph partitioning + */ +class BidirectionalGraph { + public: + struct Node { + nnvm::Node* nnvmptr; + std::vector inputs; + std::vector outputs; + }; + std::vector nodes; + std::unordered_map nnvm2nid; + std::vector outputs; + static const std::unordered_set unconditionalTRTop; + + explicit BidirectionalGraph(const Graph &g) { + auto& idx = g.indexed_graph(); + auto num_nodes = idx.num_nodes(); + nodes.reserve(num_nodes); + nnvm2nid.reserve(num_nodes); + outputs.reserve(idx.outputs().size()); + DFSVisit(g.outputs, [this](const nnvm::NodePtr& n) { + BidirectionalGraph::Node new_node; + new_node.nnvmptr = n.get(); + nnvm2nid[n.get()] = static_cast(nodes.size()); + nodes.emplace_back(std::move(new_node)); + }); + for (const auto& it : nnvm2nid) { + nnvm::Node* nnvmnode = it.first; + uint32_t nid = it.second; + for (auto& n : nnvmnode->inputs) { + uint32_t input_nid = nnvm2nid[n.node.get()]; + nodes[input_nid].outputs.emplace_back(&nodes[nid]); + nodes[nid].inputs.emplace_back(&nodes[input_nid]); + } + } + for (auto& e : g.outputs) { + uint32_t nid = nnvm2nid[e.node.get()]; + outputs.emplace_back(&nodes[nid]); + } + } + + template + void DFS(const std::vector& heads, bool reverse, FVisit fvisit) { + std::unordered_set visited; + std::vector vec(heads.begin(), heads.end()); + visited.reserve(heads.size()); + while (!vec.empty()) { + Node* vertex = vec.back(); + vec.pop_back(); + if (visited.count(vertex) == 0) { + visited.insert(vertex); + fvisit(vertex); + std::vector nexts = reverse ? 
vertex->inputs : vertex->outputs; + for (Node* node : nexts) { + if (visited.count(node) == 0) { + vec.emplace_back(node); + } + } + } + } + } + + using t_pairset = std::pair, std::unordered_set>; + using t_pairvec = std::pair, std::vector>; + using t_uncomp_map = std::unordered_map>; + + std::unordered_set naive_grow_subgraph(Node* head, + std::unordered_set* set_unused, + t_uncomp_map* uncomp_map) { + std::unordered_set subgraph; + std::unordered_set uncomp_set; + std::deque stack; + stack.emplace_back(head); + while (!stack.empty()) { + Node* vertex = stack.back(); + stack.pop_back(); + if (set_unused->count(vertex) && !uncomp_set.count(vertex)) { + set_unused->erase(vertex); + subgraph.insert(vertex); + uncomp_set.insert((*uncomp_map)[vertex].begin(), (*uncomp_map)[vertex].end()); + for (Node* input : vertex->inputs) { + if (set_unused->count(input) && !uncomp_set.count(input)) { + stack.emplace_back(input); + } + } + for (Node* output : vertex->outputs) { + if (set_unused->count(output) && !uncomp_set.count(output)) { + stack.emplace_back(output); + } + } + } + } + return subgraph; + } + + std::vector> get_subsets( + std::unordered_map* const params_map) { + std::vector> subgraphs; + std::unordered_set set_nonTRTnodes; + std::unordered_set set_allnodes(nodes.size()); + std::vector separation_sets; + for (Node& node : nodes) { + if (!IsTRTCompatible(node.nnvmptr)) { + set_nonTRTnodes.insert(&node); + std::unordered_set in_graph; + std::unordered_set out_graph; + std::vector dummy_head; + dummy_head.emplace_back(&node); + DFS(dummy_head, false, [&out_graph](Node* node) { + out_graph.insert(node); + }); + DFS(dummy_head, true, [&in_graph](Node* node) { + in_graph.insert(node); + }); + separation_sets.emplace_back(std::make_pair(in_graph, out_graph)); + } + set_allnodes.emplace(&node); + } + t_uncomp_map uncomp_map; + std::unordered_set set_TRTnodes; + set_TRTnodes.insert(set_allnodes.begin(), set_allnodes.end()); + for (Node* n : set_nonTRTnodes) { + set_TRTnodes.erase(n); + } + for (Node* n : set_TRTnodes) { + for (t_pairset p : separation_sets) { + if (p.first.count(n)) { + uncomp_map[n].insert(p.second.begin(), p.second.end()); + } else if (p.second.count(n)) { + uncomp_map[n].insert(p.first.begin(), p.first.end()); + } + } + for (Node* nonTRTn : set_nonTRTnodes) { + uncomp_map[n].erase(nonTRTn); + } + } + std::unordered_set set_unused; + set_unused.reserve(set_TRTnodes.size()); + + for (auto& n : set_TRTnodes) { + if (n->nnvmptr->attrs.op != nullptr || params_map->count(n->nnvmptr->attrs.name)) { + set_unused.insert(n); + } + } + std::unordered_set visited; + std::deque stack(outputs.begin(), outputs.end()); + while (!stack.empty()) { + Node* vertex = stack.front(); + stack.pop_front(); + if (!visited.count(vertex)) { + visited.insert(vertex); + if (set_unused.count(vertex)) { + subgraphs.emplace_back(naive_grow_subgraph(vertex, &set_unused, &uncomp_map)); + } + for (Node* input : vertex->inputs) { + stack.emplace_back(input); + } + } + } + + return subgraphs; + } + + + private: + friend class Graph; + + bool IsTRTCompatible(nnvm::Node* nodeptr) { + if (nodeptr->op() == nullptr) { + return true; + } + + const std::string op_name = nodeptr->op()->name; + if (op_name == "Pooling") { + return (nodeptr->attrs.dict.at("pool_type") == "avg" || + nodeptr->attrs.dict.at("pool_type") == "max"); + } + + if (unconditionalTRTop.count(op_name)) { + return true; + } + + if (op_name == "Activation") { + return nodeptr->attrs.dict.at("act_type") == "relu" || + nodeptr->attrs.dict.at("act_type") == 
"tanh" || + nodeptr->attrs.dict.at("act_type") == "sigmoid"; + } + + return false; + } +}; // class BidirectionalGraph + +/*! + * \brief function which transform std::vector back to Attrs (dmlc::any) + */ +const std::unordered_set BidirectionalGraph::unconditionalTRTop = { + "Convolution", + "BatchNorm", + "elemwise_add", + "elemwise_sub", + "elemwise_mul", + "rsqrt", + "pad", + "Pad", + "mean", + "FullyConnected", + "Flatten", + "SoftmaxOutput", +}; + + +using NodeEntrySet = std::unordered_set; + +/*! + * \brief get the output nodes of the subgraph in the main graph + * \return a vector of the output nodes +*/ +std::vector GetSubgraphNodeEntries(Graph g, + std::unordered_set set_subgraph) { + std::vector outputs; + NodeEntrySet _outputs; + for (auto& e : g.outputs) { + if (set_subgraph.count(e.node.get())) { + _outputs.insert(e); + } + } + DFSVisit(g.outputs, [&set_subgraph, &_outputs](const nnvm::NodePtr &node){ + if (!set_subgraph.count(node.get())) { + for (auto& e : node->inputs) { + if (set_subgraph.count(e.node.get())) { + _outputs.insert(e); + } + } + } + }); + outputs.insert(outputs.begin(), _outputs.begin(), _outputs.end()); + return outputs; +} + + +/*! + * \brief get the nodes outside of the subgraph for which outputs are used in the subgraph + * \return a vector the nodes +*/ +std::vector GetSubgraphInterfaceNodes(Graph g, + std::unordered_set set_subgraph) { + std::vector inputs; + NodeEntrySet _inputs; + DFSVisit(g.outputs, [&set_subgraph, &_inputs](const nnvm::NodePtr &node){ + if (set_subgraph.count(node.get())) { + for (auto& e : node->inputs) { + if (!set_subgraph.count(e.node.get())) { + _inputs.insert(e); + } + } + } + }); + inputs.insert(inputs.begin(), _inputs.begin(), _inputs.end()); + return inputs; +} + +std::unordered_map GetGraphInputsMap(const Graph& g) { + std::unordered_map outputs; + auto& idx = g.indexed_graph(); + outputs.reserve(idx.num_nodes()); + std::vector input_nodes = idx.input_nodes(); + for (size_t i = 0; i < input_nodes.size(); ++i) { + outputs[input_nodes[i]] = static_cast(i); + } + return outputs; +} + +/*! + * \brief Dummy function which creates a fake TensorRT Node + */ +nnvm::NodePtr ConvertNnvmGraphToOnnx(const nnvm::Graph &g, + std::unordered_map* const params_map) { + auto p = nnvm::Node::Create(); + p->attrs.op = nnvm::Op::Get("_trt_op"); + op::TRTParam trt_param = op::nnvm_to_onnx::ConvertNnvmGraphToOnnx(g, params_map); + p->attrs.dict["serialized_output_map"] = trt_param.serialized_output_map; + p->attrs.dict["serialized_input_map"] = trt_param.serialized_input_map; + p->attrs.dict["serialized_onnx_graph"] = trt_param.serialized_onnx_graph; + if (p->op()->attr_parser != nullptr) { + p->op()->attr_parser(&(p->attrs)); + } + return p; +} + +/*! 
+ * \brief Update attributes of the graph (such as some inputs properties) + */ +Graph UpdateSubgraphAttrs(Graph&& subgraph, const Graph& g, + const std::unordered_map& old2new, + const nnvm::NodeEntryMap& main_input_entry_to_sub) { + const auto& idx = g.indexed_graph(); + const auto& sub_idx = subgraph.indexed_graph(); + + const auto& shape = g.GetAttr("shape"); + const auto& dtype = g.GetAttr("dtype"); + const auto& storage_type = g.GetAttr("storage_type"); + const auto& shape_inputs = g.GetAttr("shape_inputs"); + const auto& dtype_inputs = g.GetAttr("dtype_inputs"); + const auto& storage_type_inputs = g.GetAttr("storage_type_inputs"); + + nnvm::ShapeVector sub_shape(sub_idx.num_node_entries()); + nnvm::DTypeVector sub_dtype(sub_idx.num_node_entries()); + StorageTypeVector sub_storage_type(sub_idx.num_node_entries()); + nnvm::ShapeVector sub_shape_inputs(sub_idx.input_nodes().size()); + nnvm::DTypeVector sub_dtype_inputs(sub_idx.input_nodes().size()); + StorageTypeVector sub_storage_type_inputs(sub_idx.input_nodes().size()); + + const std::unordered_map inputsindex2pos = GetGraphInputsMap(g); + const std::unordered_map sub_inputsindex2pos = GetGraphInputsMap(subgraph); + // map attributes from graph to subgraph + for (auto& p : old2new) { + const uint32_t nid = idx.node_id(p.first); + const uint32_t sub_nid = sub_idx.node_id(p.second.get()); + const nnvm::Op* op = sub_idx[sub_nid].source->op(); + if (op == nullptr) { // if it's an input node, there is only one output node entry + const uint32_t sub_i = sub_idx.entry_id(sub_nid, 0); + const uint32_t sub_input_i = sub_inputsindex2pos.at(sub_nid); + const uint32_t i = idx.entry_id(nid, 0); + + sub_shape[sub_i] = shape[i]; + sub_dtype[sub_i] = dtype[i]; + sub_storage_type[sub_i] = storage_type[i]; + sub_shape_inputs[sub_input_i] = shape_inputs[inputsindex2pos.at(nid)]; + sub_dtype_inputs[sub_input_i] = dtype_inputs[inputsindex2pos.at(nid)]; + sub_storage_type_inputs[sub_input_i] = storage_type_inputs[inputsindex2pos.at(nid)]; + + } else { + for (size_t oi = 0; oi < op->num_outputs; ++oi) { + const uint32_t sub_i = sub_idx.entry_id(sub_nid, oi); + const uint32_t i = idx.entry_id(nid, oi); + sub_shape[sub_i] = shape[i]; + sub_dtype[sub_i] = dtype[i]; + sub_storage_type[sub_i] = storage_type[i]; + } + } + } + // old2new doesn't contain placeholder / interfaces + for (auto& p : main_input_entry_to_sub) { + nnvm::NodeEntry main_entry = p.first; + nnvm::NodeEntry sub_entry = p.second; + const uint32_t sub_nid = sub_idx.node_id(sub_entry.node.get()); + const uint32_t sub_i = sub_idx.entry_id(sub_entry); + const uint32_t i = idx.entry_id(main_entry); + const uint32_t sub_input_i = sub_inputsindex2pos.at(sub_nid); + sub_shape[sub_i] = shape[i]; + sub_dtype[sub_i] = dtype[i]; + sub_storage_type[sub_i] = storage_type[i]; + sub_shape_inputs[sub_input_i] = sub_shape[sub_i]; + sub_dtype_inputs[sub_input_i] = sub_dtype[sub_i]; + sub_storage_type_inputs[sub_input_i] = sub_storage_type[sub_i]; + } + subgraph.attrs["shape"] = + std::make_shared(std::move(sub_shape)); + subgraph.attrs["dtype"] = + std::make_shared(std::move(sub_dtype)); + subgraph.attrs["storage_type"] = + std::make_shared(std::move(sub_storage_type)); + subgraph.attrs["shape_inputs"] = + std::make_shared(std::move(sub_shape_inputs)); + subgraph.attrs["dtype_inputs"] = + std::make_shared(std::move(sub_dtype_inputs)); + subgraph.attrs["storage_type_inputs"] = + std::make_shared(std::move(sub_storage_type_inputs)); + + return subgraph; +} + +/*! 
+ * \brief Generate a name for a new TRT node, avoiding collisions if some TRT_nodes are already defined
+ */
+const std::string GetNewTrtName(const Graph& g, const Graph& subgraph) {
+  const std::string name_prefix("TRT_node");
+  std::unordered_set name_set;
+  DFSVisit(g.outputs, [&name_set, &name_prefix](const nnvm::NodePtr& node) {
+    if (node->attrs.name.compare(0, name_prefix.size(), name_prefix) == 0) {
+      name_set.insert(node->attrs.name);
+    }
+  });
+  // names inside the subgraph will become available again, as those nodes will be removed
+  DFSVisit(subgraph.outputs, [&name_set, &name_prefix](const nnvm::NodePtr& node) {
+    if (node->attrs.name.compare(0, name_prefix.size(), name_prefix) == 0) {
+      name_set.erase(node->attrs.name);
+    }
+  });
+  uint32_t name_suffix = 0;
+  std::string full_name = name_prefix + std::to_string(name_suffix);
+  while (name_set.count(full_name)) {
+    full_name = name_prefix + std::to_string(++name_suffix);
+  }
+  return full_name;
+}
+
+/*!
+ * \brief helper function to display which nodes are in a specific subset
+ */
+void dispNodesSet(Graph g, std::unordered_set s) {
+  DFSVisit(g.outputs, [&s](const nnvm::NodePtr n){
+    if (s.count(n.get())) {
+      std::cout << "  Y " << n->attrs.name << std::endl;
+    } else {
+      std::cout << "  N " << n->attrs.name << std::endl;
+    }
+  });
+}
+
+/*!
+ * \brief Replace a set of nodes by a TensorRT node
+ */
+Graph ReplaceSubgraph(Graph&& g,
+                      const std::unordered_set& set_subgraph,
+                      std::unordered_map* const params_map) {
+  // Create MXNet subgraph
+  Graph subgraph;
+
+  const auto sub_outputs_in_main = GetSubgraphNodeEntries(g, set_subgraph);
+  subgraph.outputs = sub_outputs_in_main;
+  // old2new will link the raw pointers of the nodes in the main graph to
+  // the corresponding shared_ptrs of the nodes in the generated subgraph
+  std::unordered_map old2new;
+  std::deque stack;
+  std::unordered_set visited;
+  int32_t reservation = set_subgraph.size();
+  old2new.reserve(reservation);
+  visited.reserve(reservation);
+
+  // Creating the shared_ptrs from the same raw pointers doesn't really matter
+  for (auto& n : set_subgraph) {
+    old2new[n] = std::make_shared(*n);
+  }
+
+  // To generate a subgraph, an input has to be replaced by a data node (no op),
+  // and it has to be agnostic to the node of which it is an output
+  // (for example, even if two inputs are two different outputs of the same node)
+  nnvm::NodeEntryMap main_input_entry_to_sub;
+  for (auto& e : GetSubgraphInterfaceNodes(g, set_subgraph)) {
+    auto node = nnvm::Node::Create();
+    node->attrs.name = e.node->attrs.name + "_" + std::to_string(e.index);
+    auto new_e = nnvm::NodeEntry{node, 0, 0};
+    main_input_entry_to_sub[e] = new_e;
+  }
+
+  for (nnvm::NodeEntry& e : subgraph.outputs) {
+    e.node = old2new[e.node.get()];
+    stack.emplace_back(e.node.get());
+  }
+  // link all nodes in the subgraph to nodes in the subgraph instead of the main graph
+  while (!stack.empty()) {
+    auto vertex = stack.front();
+    stack.pop_front();
+    if (!visited.count(vertex)) {
+      visited.insert(vertex);
+      for (auto& e : vertex->inputs) {
+        auto it = main_input_entry_to_sub.find(e);
+        if (it != main_input_entry_to_sub.end()) {
+          e = it->second;
+        } else {
+          e.node = old2new[e.node.get()];
+        }
+        stack.emplace_back(e.node.get());
+      }
+    }
+  }
+  // Remove the control dependencies of the subgraph on nodes that are not in the subgraph
+  DFSVisit(subgraph.outputs, [&set_subgraph, &old2new](const nnvm::NodePtr& node) {
+    std::remove_if(node->control_deps.begin(),
+                   node->control_deps.end(),
+                   [&set_subgraph](nnvm::NodePtr n_ptr) {
+                     return
!set_subgraph.count(n_ptr.get()); + }); + for (nnvm::NodePtr& n_ptr : node->control_deps) { + n_ptr = old2new[n_ptr.get()]; + } + }); + + subgraph = UpdateSubgraphAttrs(std::move(subgraph), g, old2new, main_input_entry_to_sub); + auto& sub_idx = subgraph.indexed_graph(); + + auto trtnodeptr = ConvertNnvmGraphToOnnx(subgraph, params_map); + trtnodeptr->attrs.name = GetNewTrtName(g, subgraph); + + // Insert new trt node and unplug replaced nodes + std::unordered_map sub_input_entryid_to_main; + for (auto& p : main_input_entry_to_sub) { + sub_input_entryid_to_main[sub_idx.entry_id(p.second)] = p.first; + } + + // Plug the nodes from the main graph as inputs of the trt node + trtnodeptr->inputs.resize(main_input_entry_to_sub.size()); + { + uint32_t counter = 0; + for (uint32_t i : sub_idx.input_nodes()) { + auto it = sub_input_entryid_to_main.find(sub_idx.entry_id(i, 0)); + if (it != sub_input_entryid_to_main.end()) { + trtnodeptr->inputs[counter++] = it->second; + } + } + } + nnvm::NodeEntryMap sub_outputs_in_main_to_pos; + for (uint32_t i = 0; i < sub_outputs_in_main.size(); ++i) { + sub_outputs_in_main_to_pos[sub_outputs_in_main[i]] = i; + } + // Plug the trt node as inputs to the main graph nodes + DFSVisit(g.outputs, [&sub_outputs_in_main_to_pos, &trtnodeptr](const nnvm::NodePtr& n) { + for (auto& e : n->inputs) { + auto it = sub_outputs_in_main_to_pos.find(e); + if (it != sub_outputs_in_main_to_pos.end()) { + e.index = it->second; + e.node = trtnodeptr; + } + } + }); + + for (auto& output : g.outputs) { + auto it = sub_outputs_in_main_to_pos.find(output); + if (it != sub_outputs_in_main_to_pos.end()) { + output.index = it->second; + output.node = trtnodeptr; + } + } + + Graph new_graph; + new_graph.outputs = g.outputs; + return new_graph; +} + +std::vector> GetTrtCompatibleSubsets(const Graph& g, + std::unordered_map* const params_map) { + BidirectionalGraph biG = BidirectionalGraph(g); + std::vector> subsets = biG.get_subsets(params_map); + std::vector> nnvm_subsets(subsets.size(), + std::unordered_set()); + for (size_t i = 0; i < subsets.size(); ++i) { + nnvm_subsets[i].reserve(subsets[i].size()); + for (auto& n : subsets[i]) { + nnvm_subsets[i].insert(n->nnvmptr); + } + } + return nnvm_subsets; +} + +} // namespace exec +} // namespace mxnet + +#endif // MXNET_USE_TENSORRT diff --git a/src/executor/trt_graph_executor.cc b/src/executor/trt_graph_executor.cc new file mode 100644 index 000000000000..65dbb29792e0 --- /dev/null +++ b/src/executor/trt_graph_executor.cc @@ -0,0 +1,450 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */
+
+#if MXNET_USE_TENSORRT
+
+#include "trt_graph_executor.h"
+
+#include
+#include
+#include "./onnx_to_tensorrt.h"
+#include "../operator/contrib/tensorrt-inl.h"
+#include "../common/utils.h"
+#include "../common/exec_utils.h"
+
+
+namespace mxnet {
+namespace exec {
+
+using namespace mxnet::common;
+
+  /*!
+   * \brief TrtGraphExecutor initializer for the simple bind flow in
+   * which only certain input shapes and dtypes are provided by users.
+   * The initializer uses these shapes and dtypes to perform
+   * shape and dtype inferences, and then creates NDArrays
+   * to populate data entries of the graph. The created NDArrays
+   * for in_args, arg_grads and aux_states are passed to the
+   * front end to attach to the created executor.
+   * In the front end, if the simple_bind flow is triggered by
+   * _bind_ith_exec, the shared data arrays of DataParallelExecutorGroup
+   * and the shared executor will be taken into account in creating
+   * NDArrays for in_args, arg_grads, and aux_states, so that
+   * already allocated memory can be reused.
+   *
+   * This version of the executor exports the computation graph to TensorRT to make use of fused
+   * kernels and other runtime enhancements. TRT will compile the sub-graphs into executable fused
+   * operators without intervention from the user. Operators in the original graph that are not
+   * supported by TRT will continue to be executed normally by MXNet.
+   *
+   */
+void TrtGraphExecutor::Init(nnvm::Symbol symbol,
+                            const Context& default_ctx,
+                            const std::map& ctx_map,
+                            std::vector *in_arg_ctxes,
+                            std::vector *arg_grad_ctxes,
+                            std::vector *aux_state_ctxes,
+                            std::unordered_map *arg_shape_map,
+                            std::unordered_map *arg_dtype_map,
+                            std::unordered_map *arg_stype_map,
+                            std::vector *grad_req_types,
+                            const std::unordered_set& shared_arg_names,
+                            std::vector* in_arg_vec,
+                            std::vector* arg_grad_vec,
+                            std::vector* aux_state_vec,
+                            std::unordered_map* shared_buffer,
+                            Executor* shared_exec,
+                            const nnvm::NodeEntryMap& feed_dict) {
+  symbol = symbol.Copy();
+  nnvm::Graph g = InitGraph(symbol, default_ctx, ctx_map, *in_arg_ctxes, *arg_grad_ctxes,
+                            *aux_state_ctxes, *grad_req_types);
+
+  if (need_grad_) {
+    LOG(FATAL) << "You may be attempting to use TensorRT for training. TensorRT is an inference "
+                  "only library. To re-enable legacy MXNet graph execution, which will support "
+                  "training, set the MXNET_USE_TENSORRT environment variable to 0, or call "
+                  "mx.contrib.tensorrt.set_use_tensorrt(False)";
+  }
+
+  if (shared_buffer == nullptr || shared_buffer->empty()) {
+    LOG(FATAL) << "MXNET_USE_TENSORRT = 1 but shared_buffer is empty. "
+               << "Please provide weights and other parameters, such as "
+               << "BatchNorm moments, via the shared_buffer, during the simple bind call.";
+  }
+
+  // The following code of shape and dtype inferences and argument
+  // initialization is for simple_bind only. Regular bind operation
+  // should do this differently.
+
+  // Initialize arg_shapes and arg_dtypes for shape and type inferences.
+  // It contains all in_args and aux_states' shapes and types in a certain order.
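+  // Note (illustrative): the TensorRT path is toggled per process through the
+  // MXNET_USE_TENSORRT environment variable referenced in the errors above.
+  // With dmlc-core such a switch is typically read as
+  //   const bool use_trt = dmlc::GetEnv("MXNET_USE_TENSORRT", false);
+  // though the actual call site for this check lives outside this function.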
+ const nnvm::IndexedGraph& idx = g.indexed_graph(); + nnvm::ShapeVector arg_shapes(idx.input_nodes().size(), TShape()); + nnvm::DTypeVector arg_dtypes(idx.input_nodes().size(), -1); + StorageTypeVector arg_stypes(idx.input_nodes().size(), kUndefinedStorage); + for (size_t i = 0; i < num_forward_inputs_; ++i) { + const uint32_t nid = idx.input_nodes().at(i); + const std::string& name = idx[nid].source->attrs.name; + auto it1 = arg_shape_map->find(name); + if (arg_shape_map->end() != it1) { + arg_shapes[i] = it1->second; + } + auto it2 = arg_dtype_map->find(name); + if (arg_dtype_map->end() != it2) { + arg_dtypes[i] = it2->second; + } + auto it3 = arg_stype_map->find(name); + if (arg_stype_map->end() != it3) { + arg_stypes[i] = it3->second; + } + } + g = InferShape(std::move(g), std::move(arg_shapes), "__shape__"); + if (g.GetAttr("shape_num_unknown_nodes") != 0U) { + HandleInferShapeError(num_forward_inputs_, g.indexed_graph(), + g.GetAttr("shape")); + } + + g = InferType(std::move(g), std::move(arg_dtypes), "__dtype__"); + if (g.GetAttr("dtype_num_unknown_nodes") != 0U) { + HandleInferTypeError(num_forward_inputs_, g.indexed_graph(), + g.GetAttr("dtype")); + } + + g = InferStorageType(std::move(g), std::move(arg_stypes), "__storage_type__"); + if (g.GetAttr("storage_type_num_unknown_nodes") != 0U) { + HandleInferStorageTypeError(num_forward_inputs_, g.indexed_graph(), + g.GetAttr("storage_type")); + } + + auto trt_groups = GetTrtCompatibleSubsets(g, shared_buffer); + for (auto trt_group : trt_groups) { + if (trt_group.size() > 1) { + g = ReplaceSubgraph(std::move(g), trt_group, shared_buffer); + g = ReinitGraph(std::move(g), default_ctx, ctx_map, in_arg_ctxes, arg_grad_ctxes, + aux_state_ctxes, grad_req_types, arg_shape_map, arg_dtype_map, + arg_stype_map, shared_buffer); + } + } + + + InitArguments(g.indexed_graph(), g.GetAttr("shape"), + g.GetAttr("dtype"), + g.GetAttr("storage_type"), + *in_arg_ctxes, *arg_grad_ctxes, *aux_state_ctxes, + *grad_req_types, shared_arg_names, shared_exec, + shared_buffer, in_arg_vec, arg_grad_vec, aux_state_vec); + + // The above code of shape and dtype inferences and argument + // initialization is for simple_bind only. Regular bind operation + // should do this differently. + + // Initialize the rest attributes of the graph. + // This function can be called by regular bind + // operation flow as well. + FinishInitGraph(symbol, g, shared_exec, feed_dict); +} +/*! + * \brief Initialize in_args, arg_grads, and aux_states + * and their data_entry_ of the executor using + * shared_buffer from DataParallelExecutorGroup + * and shared_exec if available. 
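+ * Sharing policy, in brief: aux states and shared model parameters first try
+ * to reuse shared_exec's arrays (their inferred shape/dtype/stype must match
+ * exactly); otherwise arrays are copied out of shared_buffer when present,
+ * or freshly zero-initialized.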
+ */
+void TrtGraphExecutor::InitArguments(const nnvm::IndexedGraph& idx,
+                                     const nnvm::ShapeVector& inferred_shapes,
+                                     const nnvm::DTypeVector& inferred_dtypes,
+                                     const StorageTypeVector& inferred_stypes,
+                                     const std::vector& in_arg_ctxes,
+                                     const std::vector& arg_grad_ctxes,
+                                     const std::vector& aux_state_ctxes,
+                                     const std::vector& grad_req_types,
+                                     const std::unordered_set& shared_arg_names,
+                                     const Executor* shared_exec,
+                                     std::unordered_map* shared_buffer,
+                                     std::vector* in_arg_vec,
+                                     std::vector* arg_grad_vec,
+                                     std::vector* aux_state_vec) {
+  // initialize in_args, arg_grads, and aux_states and populate grad_store_
+  data_entry_.resize(idx.num_node_entries());
+  size_t arg_top = 0, aux_top = 0;
+  const auto& mutable_nodes = idx.mutable_input_nodes();
+  for (size_t i = 0; i < num_forward_inputs_; ++i) {
+    const uint32_t nid = idx.input_nodes().at(i);
+    const uint32_t eid = idx.entry_id(nid, 0);
+    const TShape& inferred_shape = inferred_shapes[eid];
+    const int inferred_dtype = inferred_dtypes[eid];
+    const NDArrayStorageType inferred_stype = (NDArrayStorageType) inferred_stypes[eid];
+    const std::string& arg_name = idx[nid].source->attrs.name;
+    // aux_states
+    if (mutable_nodes.count(nid)) {
+      if (nullptr != shared_exec) {
+        const NDArray& aux_nd = shared_exec->aux_state_map().at(arg_name);
+        CHECK(inferred_stype == kDefaultStorage && aux_nd.storage_type() == kDefaultStorage)
+          << "Non-default storage type detected when creating auxiliary NDArray. The allocated "
+          << "memory of shared_exec.aux_array cannot be reused for argument: "
+          << arg_name << " for the current executor";
+        CHECK_EQ(inferred_shape, aux_nd.shape())
+          << "Inferred shape does not match shared_exec.aux_array's shape."
+             " Therefore, the allocated memory for shared_exec.aux_array cannot"
+             " be reused for creating the auxiliary NDArray of the argument: "
+          << arg_name << " for the current executor";
+        CHECK_EQ(inferred_dtype, aux_nd.dtype())
+          << "Inferred dtype does not match shared_exec.aux_array's dtype."
+ " Therefore, the allocated memory for shared_exec.aux_array cannot" + " be resued for creating auxilliary NDArray of the argument: " + << arg_name << " for the current executor"; + aux_state_vec->emplace_back(aux_nd); + } else { + auto it = shared_buffer->find(arg_name); + if (it != shared_buffer->end()) { + aux_state_vec->push_back(std::move(it->second.Copy(aux_state_ctxes[aux_top]))); + } else { + aux_state_vec->push_back(std::move(InitZeros(inferred_stype, inferred_shape, + aux_state_ctxes[aux_top], inferred_dtype))); + } + } // if (has_shared_exec) + data_entry_[eid] = aux_state_vec->back(); + aux_state_map_.emplace(arg_name, aux_state_vec->back()); + ++aux_top; + } else { // in_args and grad for in_args + if (shared_arg_names.count(arg_name)) { // model parameter + // model parameter + if (nullptr != shared_exec) { + const NDArray& in_arg_nd = shared_exec->in_arg_map().at(arg_name); + auto arg_nd_stype = in_arg_nd.storage_type(); + // for model parameter, both default storage and row_sparse storage can be shared + bool shareable_arg_stype = inferred_stype == kDefaultStorage || + inferred_stype == kRowSparseStorage; + // try to reuse memory from shared_exec + CHECK(shareable_arg_stype) << "Inferred storage type " + << common::stype_string(inferred_stype) + << " does not support memory sharing with shared_exec.arg_array"; + CHECK_EQ(inferred_stype, arg_nd_stype) + << "Inferred stype does not match shared_exec.arg_array's stype" + " Therefore, the allocated memory for shared_exec.arg_array cannot" + " be resued for creating NDArray of the argument " + << arg_name << " for the current executor"; + CHECK_EQ(inferred_shape, in_arg_nd.shape()) + << "Inferred shape does not match shared_exec.arg_array's shape" + " Therefore, the allocated memory for shared_exec.arg_array cannot" + " be resued for creating NDArray of the argument " + << arg_name << " for the current executor"; + CHECK_EQ(inferred_dtype, in_arg_nd.dtype()) + << "Inferred dtype does not match shared_exec.arg_array's dtype" + " Therefore, the allocated memory for shared_exec.arg_array cannot" + " be resued for creating NDArray of the argument " + << arg_name << " for the current executor"; + in_arg_vec->emplace_back(in_arg_nd); + } else { + // doesn't have shared_exec, or non-default storage + EmplaceBackZeros(inferred_stype, inferred_shape, in_arg_ctxes[arg_top], + inferred_dtype, in_arg_vec); + } + // gradient for model parameter + if (kNullOp == grad_req_types[arg_top]) { + arg_grad_vec->emplace_back(); + } else { + auto grad_oid = grad_store_.size() + num_forward_outputs_; + auto grad_eid = idx.entry_id(idx.outputs()[grad_oid]); + auto grad_stype = (NDArrayStorageType) inferred_stypes[grad_eid]; + if (nullptr != shared_exec && grad_stype == kDefaultStorage && + shared_exec->arg_grad_map().at(arg_name).storage_type() == kDefaultStorage) { + // try to reuse memory from shared_exec + arg_grad_vec->emplace_back(shared_exec->arg_grad_map().at(arg_name)); + } else { + // no need to reuse memory from shared_exec for gradient of non-default storage + EmplaceBackZeros(grad_stype, inferred_shape, arg_grad_ctxes[arg_top], + inferred_dtype, arg_grad_vec); + } + grad_store_.emplace_back(grad_req_types[arg_top], arg_grad_vec->back()); + } + } else { // !shared_arg_names.count(arg_name) + // model parameter, row_sparse ndarray sharing enabled + auto it = shared_buffer->find(arg_name); + if (it != shared_buffer->end()) { + in_arg_vec->push_back(std::move(it->second.Copy(in_arg_ctxes[arg_top]))); + } else { + 
in_arg_vec->push_back(std::move(InitZeros(inferred_stype, inferred_shape, + in_arg_ctxes[arg_top], inferred_dtype))); + } + // gradient for model parameter, row_sparse ndarray sharing disabled + if (kNullOp == grad_req_types[arg_top]) { + arg_grad_vec->emplace_back(); + } else { + auto grad_oid = grad_store_.size() + num_forward_outputs_; + auto grad_eid = idx.entry_id(idx.outputs()[grad_oid]); + auto grad_stype = (NDArrayStorageType) inferred_stypes[grad_eid]; + bool enable_row_sparse_sharing = false; + arg_grad_vec->emplace_back(ReshapeOrCreate("grad of " + arg_name, inferred_shape, + inferred_dtype, grad_stype, + arg_grad_ctxes[arg_top], shared_buffer, + enable_row_sparse_sharing)); + grad_store_.emplace_back(grad_req_types[arg_top], arg_grad_vec->back()); + } // if (kNullOp == grad_req_types[arg_top]) + } // if (shared_arg_names.count(arg_name)) + in_arg_map_.emplace(arg_name, in_arg_vec->back()); + if (!arg_grad_vec->back().is_none()) { + arg_grad_map_.emplace(arg_name, arg_grad_vec->back()); + } + data_entry_[eid] = in_arg_vec->back(); + ++arg_top; + } + } +} + + + /*! + * \brief This function is triggered after each tensorrt subgraph replacement pass. + * Reset arguments of GraphExecutor::Init(...) as some variables (weights and biases) + * are absorbed into the TRT engine it also it reruns attributes inferences accordingly + * to the new topology. + */ +Graph TrtGraphExecutor::ReinitGraph(Graph&& g, const Context &default_ctx, + const std::map &ctx_map, + std::vector *in_arg_ctxes, + std::vector *arg_grad_ctxes, + std::vector *aux_state_ctxes, + std::vector *grad_req_types, + std::unordered_map *arg_shape_map, + std::unordered_map *arg_dtype_map, + std::unordered_map *arg_stype_map, + std::unordered_map *params_map) { + std::unordered_set to_remove_params; + for (auto& el : *params_map) { + to_remove_params.insert(el.first); + } + + DFSVisit(g.outputs, [&to_remove_params](const nnvm::NodePtr n) { + to_remove_params.erase(n->attrs.name); + }); + + for (auto& el : to_remove_params) { + params_map->erase(el); + arg_shape_map->erase(el); + arg_dtype_map->erase(el); + arg_stype_map->erase(el); + } + const auto &idx = g.indexed_graph(); + num_forward_inputs_ = idx.input_nodes().size(); + in_arg_ctxes->resize(num_forward_inputs_ - idx.mutable_input_nodes().size()); + arg_grad_ctxes->resize(num_forward_inputs_ - idx.mutable_input_nodes().size()); + grad_req_types->resize(num_forward_inputs_ - idx.mutable_input_nodes().size()); + aux_state_ctxes->resize(idx.mutable_input_nodes().size()); + + // create "device" and "context" attrs for the graph + g = AssignContext(g, default_ctx, ctx_map, *in_arg_ctxes, *arg_grad_ctxes, + *aux_state_ctxes, *grad_req_types, num_forward_inputs_, + num_forward_outputs_); + + // get number of nodes used in forward pass + num_forward_nodes_ = 0; + for (size_t i = 0; i < num_forward_outputs_; ++i) { + num_forward_nodes_ = std::max( + num_forward_nodes_, static_cast(idx.outputs()[i].node_id + 1)); + } + nnvm::ShapeVector arg_shapes(idx.input_nodes().size(), TShape()); + nnvm::DTypeVector arg_dtypes(idx.input_nodes().size(), -1); + StorageTypeVector arg_stypes(idx.input_nodes().size(), kUndefinedStorage); + for (size_t i = 0; i < num_forward_inputs_; ++i) { + const uint32_t nid = idx.input_nodes().at(i); + const std::string &name = idx[nid].source->attrs.name; + auto it1 = arg_shape_map->find(name); + if (arg_shape_map->end() != it1) { + arg_shapes[i] = it1->second; + } + auto it2 = arg_dtype_map->find(name); + if (arg_dtype_map->end() != it2) { + arg_dtypes[i] = 
it2->second; + } + auto it3 = arg_stype_map->find(name); + if (arg_stype_map->end() != it3) { + arg_stypes[i] = it3->second; + } + } + g = InferShape(std::move(g), std::move(arg_shapes), "__shape__"); + if (g.GetAttr("shape_num_unknown_nodes") != 0U) { + HandleInferShapeError(num_forward_inputs_, g.indexed_graph(), + g.GetAttr("shape")); + } + + g = InferType(std::move(g), std::move(arg_dtypes), "__dtype__"); + if (g.GetAttr("dtype_num_unknown_nodes") != 0U) { + HandleInferTypeError(num_forward_inputs_, g.indexed_graph(), + g.GetAttr("dtype")); + } + + g = InferStorageType(std::move(g), std::move(arg_stypes), "__storage_type__"); + + if (g.GetAttr("storage_type_num_unknown_nodes") != 0U) { + HandleInferStorageTypeError(num_forward_inputs_, g.indexed_graph(), + g.GetAttr("storage_type")); + } + + return g; +} + + +/*! + * \brief Return the "optimized" symbol contained in the graph. + * For optimization pass such as TensorRT pass + */ +nnvm::Symbol TrtGraphExecutor::GetOptimizedSymbol() { + Symbol ret; + ret.outputs = std::vector(graph_.outputs.begin(), + graph_.outputs.begin() + num_forward_outputs_); + ret = ret.Copy(); + static const Op* trt_op = Op::Get("_trt_op"); + DFSVisit(ret.outputs, [](const nnvm::NodePtr n) { + if (n->op() == trt_op) { + n->attrs.dict.clear(); + } + }); + return ret; +} + +Executor *TrtGraphExecutor::TensorRTBind(nnvm::Symbol symbol, + const Context &default_ctx, + const std::map &group2ctx, + std::vector *in_arg_ctxes, + std::vector *arg_grad_ctxes, + std::vector *aux_state_ctxes, + std::unordered_map *arg_shape_map, + std::unordered_map *arg_dtype_map, + std::unordered_map *arg_stype_map, + std::vector *grad_req_types, + const std::unordered_set ¶m_names, + std::vector *in_args, + std::vector *arg_grads, + std::vector *aux_states, + std::unordered_map *shared_buffer, + Executor *shared_exec) { + auto exec = new exec::TrtGraphExecutor(); + exec->Init(symbol, default_ctx, group2ctx, + in_arg_ctxes, arg_grad_ctxes, aux_state_ctxes, + arg_shape_map, arg_dtype_map, arg_stype_map, + grad_req_types, param_names, + in_args, arg_grads, aux_states, + shared_buffer, shared_exec); + return exec; +} + +} // namespace exec + +} // namespace mxnet + +#endif // MXNET_USE_TENSORRT diff --git a/src/executor/trt_graph_executor.h b/src/executor/trt_graph_executor.h new file mode 100644 index 000000000000..96ac4426270a --- /dev/null +++ b/src/executor/trt_graph_executor.h @@ -0,0 +1,111 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#ifndef MXNET_EXECUTOR_TRT_GRAPH_EXECUTOR_H_ +#define MXNET_EXECUTOR_TRT_GRAPH_EXECUTOR_H_ + +#if MXNET_USE_TENSORRT + +#include +#include +#include + +#include "./graph_executor.h" + +namespace mxnet { + +namespace exec { + +class TrtGraphExecutor : public GraphExecutor { + public: + static Executor* TensorRTBind(nnvm::Symbol symbol, + const Context& default_ctx, + const std::map& group2ctx, + std::vector *in_arg_ctxes, + std::vector* arg_grad_ctxes, + std::vector* aux_state_ctxes, + std::unordered_map* arg_shape_map, + std::unordered_map* arg_dtype_map, + std::unordered_map* arg_stype_map, + std::vector* grad_req_types, + const std::unordered_set& param_names, + std::vector* in_args, + std::vector* arg_grads, + std::vector* aux_states, + std::unordered_map* + shared_data_arrays = nullptr, + Executor* shared_exec = nullptr); + + virtual void Init(nnvm::Symbol symbol, + const Context& default_ctx, + const std::map& ctx_map, + std::vector *in_arg_ctxes, + std::vector *arg_grad_ctxes, + std::vector *aux_state_ctxes, + std::unordered_map *arg_shape_map, + std::unordered_map *arg_dtype_map, + std::unordered_map *arg_stype_map, + std::vector *grad_req_types, + const std::unordered_set& shared_arg_names, + std::vector* in_arg_vec, + std::vector* arg_grad_vec, + std::vector* aux_state_vec, + std::unordered_map* shared_buffer = nullptr, + Executor* shared_exec = nullptr, + const nnvm::NodeEntryMap& feed_dict + = nnvm::NodeEntryMap()); + + // Returns symbol representing the TRT optimized graph for comparison purposes. + nnvm::Symbol GetOptimizedSymbol(); + + protected: + Graph ReinitGraph(Graph&& g, const Context &default_ctx, + const std::map &ctx_map, + std::vector *in_arg_ctxes, + std::vector *arg_grad_ctxes, + std::vector *aux_state_ctxes, + std::vector *grad_req_types, + std::unordered_map *arg_shape_map, + std::unordered_map *arg_dtype_map, + std::unordered_map *arg_stype_map, + std::unordered_map *params_map); + + void InitArguments(const nnvm::IndexedGraph& idx, + const nnvm::ShapeVector& inferred_shapes, + const nnvm::DTypeVector& inferred_dtypes, + const StorageTypeVector& inferred_stypes, + const std::vector& in_arg_ctxes, + const std::vector& arg_grad_ctxes, + const std::vector& aux_state_ctxes, + const std::vector& grad_req_types, + const std::unordered_set& shared_arg_names, + const Executor* shared_exec, + std::unordered_map* shared_buffer, + std::vector* in_arg_vec, + std::vector* arg_grad_vec, + std::vector* aux_state_vec) override; +}; + +} // namespace exec + +} // namespace mxnet + +#endif // MXNET_USE_TENSORRT + +#endif // MXNET_EXECUTOR_TRT_GRAPH_EXECUTOR_H_ diff --git a/src/operator/contrib/adaptive_avg_pooling-inl.h b/src/operator/contrib/adaptive_avg_pooling-inl.h index 7331c7bd47a1..12284d9d85d2 100644 --- a/src/operator/contrib/adaptive_avg_pooling-inl.h +++ b/src/operator/contrib/adaptive_avg_pooling-inl.h @@ -144,41 +144,6 @@ static bool AdaptiveAvgPoolOpInferShape(const nnvm::NodeAttrs& attrs, return true; } -static bool AdaptiveAvgPoolOpInferType(const nnvm::NodeAttrs& attrs, - std::vector *in_type, - std::vector *out_type) { - using namespace mshadow; - CHECK_EQ(in_type->size(), 1U); - int dtype = (*in_type)[0]; - CHECK_NE(dtype, -1) << "First input must have specified type"; - // For float16 input type beta, gamma, mean, and average are stored in float32. - // For other input types, these parameters have the same type as input - // NOTE: This requirement is from cuDNN (v. 
4 and 5) - int dtype_param = 0; - MSHADOW_REAL_TYPE_SWITCH_EX(dtype, DTypeX, AccRealX, { - dtype_param = mshadow::DataType::kFlag; }); - out_type->clear(); - out_type->push_back(dtype_param); - return true; -} - -static inline bool AdaptiveAvgPoolOpStorageType(const nnvm::NodeAttrs &attrs, - const int dev_mask, - DispatchMode *dispatch_mode, - std::vector *in_attrs, - std::vector *out_attrs) { - CHECK_EQ(in_attrs->size(), 1); - CHECK_EQ(out_attrs->size(), 1); - *dispatch_mode = DispatchMode::kFCompute; - for (int& v : *in_attrs) { - if (v == - 1) v = kDefaultStorage; - } - for (size_t i = 0; i < out_attrs->size(); i++) { - (*out_attrs)[i] = kDefaultStorage; - } - return true; -} - using namespace mshadow; template MSHADOW_XINLINE int get_stride(Tensor tensor, int idx) { diff --git a/src/operator/contrib/adaptive_avg_pooling.cc b/src/operator/contrib/adaptive_avg_pooling.cc index 079571177cbf..00ab36605bf4 100644 --- a/src/operator/contrib/adaptive_avg_pooling.cc +++ b/src/operator/contrib/adaptive_avg_pooling.cc @@ -216,8 +216,6 @@ The pooling kernel and stride sizes are automatically chosen for desired output .set_num_inputs(1) .set_num_outputs(1) .set_attr("FInferShape", AdaptiveAvgPoolOpInferShape) -.set_attr("FInferType", AdaptiveAvgPoolOpInferType) -.set_attr("FInferStorageType", AdaptiveAvgPoolOpStorageType) .set_attr("FCompute", AdaptiveAvgPoolOpForward) .set_attr("FGradient", ElemwiseGradUseNone{"_backward_contrib_AdaptiveAvgPooling2D"}) @@ -229,7 +227,6 @@ NNVM_REGISTER_OP(_backward_contrib_AdaptiveAvgPooling2D) .set_num_inputs(1) .set_num_outputs(1) .set_attr("TIsBackward", true) -.set_attr("FInferStorageType", AdaptiveAvgPoolOpStorageType) .set_attr("FCompute", AdaptiveAvgPoolOpBackward); diff --git a/src/operator/contrib/bilinear_resize-inl.h b/src/operator/contrib/bilinear_resize-inl.h index c096f0149751..ff3f794d167d 100644 --- a/src/operator/contrib/bilinear_resize-inl.h +++ b/src/operator/contrib/bilinear_resize-inl.h @@ -136,42 +136,6 @@ static bool BilinearSampleOpInferShape(const nnvm::NodeAttrs& attrs, return true; } -static bool BilinearSampleOpInferType(const nnvm::NodeAttrs& attrs, - std::vector *in_type, - std::vector *out_type) { - using namespace mshadow; - CHECK_EQ(in_type->size(), 1U); - int dtype = (*in_type)[0]; - CHECK_NE(dtype, -1) << "First input must have specified type"; - // For float16 input type beta, gamma, mean, and average are stored in float32. - // For other input types, these parameters have the same type as input - // NOTE: This requirement is from cuDNN (v. 4 and 5) - int dtype_param = 0; - MSHADOW_REAL_TYPE_SWITCH_EX(dtype, DTypeX, AccRealX, { - dtype_param = mshadow::DataType::kFlag; }); - out_type->clear(); - out_type->push_back(dtype_param); - return true; -} - -static inline bool BilinearSampleOpStorageType(const nnvm::NodeAttrs &attrs, - const int dev_mask, - DispatchMode *dispatch_mode, - std::vector *in_attrs, - std::vector *out_attrs) { - CHECK_EQ(in_attrs->size(), 1); - CHECK_EQ(out_attrs->size(), 1); - *dispatch_mode = DispatchMode::kFCompute; - for (int& v : *in_attrs) { - if (v == - 1) v = kDefaultStorage; - } - for (size_t i = 0; i < out_attrs->size(); i++) { - (*out_attrs)[i] = kDefaultStorage; - } - return true; -} - - } // namespace op } // namespace mxnet diff --git a/src/operator/contrib/bilinear_resize.cc b/src/operator/contrib/bilinear_resize.cc index e1248ce97bbf..074f74aefcc9 100644 --- a/src/operator/contrib/bilinear_resize.cc +++ b/src/operator/contrib/bilinear_resize.cc @@ -177,8 +177,6 @@ for more details. 
.set_num_inputs(1) .set_num_outputs(1) .set_attr("FInferShape", BilinearSampleOpInferShape) -.set_attr("FInferType", BilinearSampleOpInferType) -.set_attr("FInferStorageType", BilinearSampleOpStorageType) .set_attr("FCompute", BilinearSampleOpForward) .set_attr("FGradient", ElemwiseGradUseNone{"_backward_contrib_BilinearResize2D"}) @@ -190,7 +188,6 @@ NNVM_REGISTER_OP(_backward_contrib_BilinearResize2D) .set_num_inputs(1) .set_num_outputs(1) .set_attr("TIsBackward", true) -.set_attr("FInferStorageType", BilinearSampleOpStorageType) .set_attr("FCompute", BilinearSampleOpBackward); diff --git a/src/operator/contrib/nn/deformable_im2col.cuh b/src/operator/contrib/nn/deformable_im2col.cuh index d8742769b654..5914184d5bbe 100644 --- a/src/operator/contrib/nn/deformable_im2col.cuh +++ b/src/operator/contrib/nn/deformable_im2col.cuh @@ -510,7 +510,7 @@ inline void deformable_col2im_coord(mshadow::Stream* s, num_kernels, data_col, data_im, data_offset, im_shape[1], im_shape[2], im_shape[3], kernel_shape[0], kernel_shape[1], pad[0], pad[1], stride[0], stride[1], dilation[0], dilation[1], channel_per_deformable_group, col_shape[1], col_shape[2], grad_offset, req); - MSHADOW_CUDA_POST_KERNEL_CHECK(deformable_col2im_gpu_kernel); + MSHADOW_CUDA_POST_KERNEL_CHECK(deformable_col2im_coord_gpu_kernel); break; default: LOG(FATAL) << "col2im_nd_gpu does not support computation with " diff --git a/src/operator/contrib/nnvm_to_onnx-inl.h b/src/operator/contrib/nnvm_to_onnx-inl.h new file mode 100644 index 000000000000..58f88b051433 --- /dev/null +++ b/src/operator/contrib/nnvm_to_onnx-inl.h @@ -0,0 +1,156 @@ +#ifndef MXNET_OPERATOR_CONTRIB_NNVM_TO_ONNX_INL_H_ +#define MXNET_OPERATOR_CONTRIB_NNVM_TO_ONNX_INL_H_ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * Copyright (c) 2018 by Contributors + * \file tensorrt-inl.h + * \brief TensorRT Operator + * \author Marek Kolodziej, Clement Fuji Tsang +*/ + +#if MXNET_USE_TENSORRT + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "./tensorrt-inl.h" +#include "../operator_common.h" +#include "../../common/utils.h" +#include "../../common/serialization.h" + +namespace mxnet { +namespace op { +namespace nnvm_to_onnx { + +using namespace nnvm; +using namespace ::onnx; +using int64 = ::google::protobuf::int64; + +std::unordered_map GetPlaceholderShapes(const ShapeVector& shape_inputs, + const nnvm::IndexedGraph& ig); + +std::unordered_map GetOutputLookup(const nnvm::IndexedGraph& ig); + +void ConvertPlaceholder( + const std::string& node_name, + const std::unordered_map& placeholder_shapes, + GraphProto* const graph_proto); + +void ConvertConstant(GraphProto* const graph_proto, + const std::string& node_name, + std::unordered_map* const shared_buffer); + +void ConvertOutput(op::tensorrt::InferenceMap_t* const trt_output_map, + GraphProto* const graph_proto, + const std::unordered_map::iterator& out_iter, + const std::string& node_name, + const nnvm::Graph& g, + const StorageTypeVector& storage_types, + const DTypeVector& dtypes); + +typedef void (*ConverterFunction)(NodeProto *node_proto, + const NodeAttrs &attrs, + const nnvm::IndexedGraph &ig, + const array_view &inputs); + + +// Forward declarations +void ConvertConvolution( + NodeProto *node_proto, + const NodeAttrs &attrs, + const nnvm::IndexedGraph &ig, + const array_view &inputs); + + +void ConvertPooling(NodeProto *node_proto, + const NodeAttrs &attrs, + const nnvm::IndexedGraph &ig, + const array_view &inputs); + +void ConvertActivation(NodeProto *node_proto, + const NodeAttrs &attrs, + const nnvm::IndexedGraph &ig, + const array_view &inputs); + +void ConvertFullyConnected(NodeProto *node_proto, + const NodeAttrs &attrs, + const nnvm::IndexedGraph &ig, + const array_view &inputs); + +void ConvertSoftmaxOutput(NodeProto *node_proto, + const NodeAttrs &attrs, + const nnvm::IndexedGraph &ig, + const array_view &inputs); + +void ConvertFlatten(NodeProto *node_proto, + const NodeAttrs &attrs, + const nnvm::IndexedGraph &ig, + const array_view &inputs); + +void ConvertBatchNorm(NodeProto *node_proto, + const NodeAttrs &attrs, + const nnvm::IndexedGraph &ig, + const array_view &inputs); + +void ConvertElementwiseAdd(NodeProto *node_proto, + const NodeAttrs &attrs, + const nnvm::IndexedGraph &ig, + const array_view &inputs); + +TRTParam ConvertNnvmGraphToOnnx( + const nnvm::Graph &g, + std::unordered_map *const shared_buffer); + +static const std::unordered_map converter_map = { + {"Convolution", ConvertConvolution}, + {"Pooling", ConvertPooling}, + {"Activation", ConvertActivation}, + {"FullyConnected", ConvertFullyConnected}, + {"SoftmaxOutput", ConvertSoftmaxOutput}, + {"Flatten", ConvertFlatten}, + {"BatchNorm", ConvertBatchNorm}, + {"elemwise_add", ConvertElementwiseAdd}}; + +} // namespace nnvm_to_onnx +} // namespace op +} // namespace mxnet + +#endif // MXNET_USE_TENSORRT + +#endif // MXNET_OPERATOR_CONTRIB_NNVM_TO_ONNX_INL_H_ diff --git a/src/operator/contrib/nnvm_to_onnx.cc b/src/operator/contrib/nnvm_to_onnx.cc new file mode 100644 index 000000000000..902466614c7c --- /dev/null +++ b/src/operator/contrib/nnvm_to_onnx.cc @@ -0,0 +1,527 @@ +/* + * Licensed to the Apache Software Foundation (ASF) 
under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2018 by Contributors + * \file trt.cc + * \brief TensorRT operation registration + * \author Marek Kolodziej, Clement Fuji Tsang +*/ + +#if MXNET_USE_TENSORRT + +#include "./nnvm_to_onnx-inl.h" + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "../../common/serialization.h" +#include "../../common/utils.h" +#include "../../ndarray/ndarray_function.h" +#include "../../operator/nn/activation-inl.h" +#include "../../operator/nn/batch_norm-inl.h" +#include "../../operator/nn/convolution-inl.h" +#include "../../operator/nn/fully_connected-inl.h" +#include "../../operator/nn/pooling-inl.h" +#include "../../operator/softmax_output-inl.h" +#include "./tensorrt-inl.h" + +#if MXNET_USE_TENSORRT_ONNX_CHECKER +#include +#endif // MXNET_USE_TENSORRT_ONNX_CHECKER + +namespace mxnet { +namespace op { +namespace nnvm_to_onnx { + +op::TRTParam ConvertNnvmGraphToOnnx( + const nnvm::Graph& g, + std::unordered_map* const shared_buffer) { + op::TRTParam trt_param; + op::tensorrt::NameToIdx_t trt_input_map; + op::tensorrt::InferenceMap_t trt_output_map; + + const nnvm::IndexedGraph& ig = g.indexed_graph(); + const auto& storage_types = g.GetAttr("storage_type"); + const auto& dtypes = g.GetAttr("dtype"); + const auto& shape_inputs = g.GetAttr("shape_inputs"); + + for (auto& e : storage_types) { + if (e != mshadow::kFloat32) { + LOG(FATAL) << "ONNX converter does not support types other than float32 " + "right now."; + } + } + + ModelProto model_proto; + // Need to determine IR versions and features to support + model_proto.set_ir_version(static_cast(2)); + GraphProto* graph_proto = model_proto.mutable_graph(); + + std::unordered_map placeholder_shapes = + GetPlaceholderShapes(shape_inputs, ig); + std::unordered_map output_lookup = GetOutputLookup(ig); + uint32_t current_input = 0; + + // Can't do a foreach over IndexedGraph since it doesn't implement begin(), etc. + for (uint32_t node_idx = 0; node_idx < ig.num_nodes(); ++node_idx) { + const IndexedGraph::Node& node = ig[node_idx]; + const nnvm::Node* source = node.source; + const NodeAttrs& attrs = source->attrs; + const Op* op = source->op(); + + std::string node_name = attrs.name; + // Here, "variable" actually means anything that's not an op i.e. a constant (weights) or a + // placeholder + if (source->is_variable()) { + // Is this a placeholder? + if (shared_buffer->count(node_name) == 0) { + // This fixes the problem with a SoftmaxOutput node during inference, but it's hacky. + // Need to figure out how to properly fix it. 
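+ // Note on the name check below: inputs whose names contain "label" (e.g. the
+ // "softmax_label" input that SoftmaxOutput adds) only feed training-time losses and
+ // have no ONNX counterpart at inference, so they are skipped here; current_input is
+ // still advanced so that the remaining input indices stay aligned.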
+ if (node_name.find("label") != std::string::npos) { + current_input++; + continue; + } + trt_input_map.emplace(node_name, current_input++); + ConvertPlaceholder(node_name, placeholder_shapes, graph_proto); + } else { + // If it's not a placeholder, then by exclusion it's a constant. + ConvertConstant(graph_proto, node_name, shared_buffer); + } // is_placeholder + } else { + // It's an op, rather than a "variable" (constant or placeholder) + NodeProto* node_proto = graph_proto->add_node(); + node_proto->set_name(node_name); + if (converter_map.count(op->name) == 0) { + LOG(FATAL) << "Conversion for node of type " << op->name << " (node " + << node_name << ") " + << " is not supported yet."; + } + // Find function ptr to a converter based on the op name, and invoke the converter. This + // looks unsafe because find may not succeed, but it does because we're in the operator + // logic after testing that this node name does not represent a variable. + converter_map.find(op->name)->second(node_proto, attrs, ig, node.inputs); + // Add all inputs to the current node (i.e. add graph edges) + for (const nnvm::IndexedGraph::NodeEntry& entry : node.inputs) { + std::string in_node_name = ig[entry.node_id].source->attrs.name; + // As before, we're not adding labels e.g. for SoftmaxOutput, but I wish there was a less + // hacky way to do it than name matching. + if (in_node_name.find("label") != std::string::npos) { + continue; + } + node_proto->add_input(in_node_name); + } + // The node's output will have the same name as the node name. + node_proto->add_output(node_name); + // See if the current node is an output node + auto out_iter = output_lookup.find(node_name); + // We found an output + if (out_iter != output_lookup.end()) { + ConvertOutput(&trt_output_map, graph_proto, out_iter, node_name, g, + storage_types, dtypes); + } // output found + } // conversion function exists + } // loop over i from 0 to num_nodes + + model_proto.SerializeToString(&trt_param.serialized_onnx_graph); + common::Serialize(trt_input_map, + &trt_param.serialized_input_map); + common::Serialize(trt_output_map, + &trt_param.serialized_output_map); + +#if MXNET_USE_TENSORRT_ONNX_CHECKER + onnx::checker::check_model(model_proto); +#endif // MXNET_USE_TENSORRT_ONNX_CHECKER + + return trt_param; +} + +void ConvertConvolution(NodeProto* node_proto, const NodeAttrs& attrs, + const nnvm::IndexedGraph& /*ig*/, + const array_view& /*inputs*/) { + const auto& conv_param = nnvm::get(attrs.parsed); + + node_proto->set_op_type("Conv"); + + const TShape kernel = conv_param.kernel; + const TShape stride = conv_param.stride; + const TShape dilate = conv_param.dilate; + const TShape pad = conv_param.pad; + const uint32_t num_group = conv_param.num_group; + // const bool no_bias = conv_param.no_bias; + const dmlc::optional layout = conv_param.layout; + + // kernel shape + AttributeProto* const kernel_shape = node_proto->add_attribute(); + kernel_shape->set_name("kernel_shape"); + kernel_shape->set_type(AttributeProto::INTS); + + for (const dim_t kval : kernel) { + kernel_shape->add_ints(static_cast(kval)); + } + + // pads + AttributeProto* const pads = node_proto->add_attribute(); + pads->set_name("pads"); + pads->set_type(AttributeProto::INTS); + + for (const dim_t kval : pad) { + pads->add_ints(static_cast(kval)); + pads->add_ints(static_cast(kval)); + } + + // dilations + AttributeProto* const dilations = node_proto->add_attribute(); + dilations->set_name("dilations"); + dilations->set_type(AttributeProto::INTS); + for (const dim_t kval 
: dilate) { + dilations->add_ints(static_cast(kval)); + } + + // strides + AttributeProto* const strides = node_proto->add_attribute(); + strides->set_name("strides"); + strides->set_type(AttributeProto::INTS); + for (const dim_t kval : stride) { + strides->add_ints(static_cast(kval)); + } + + // group + AttributeProto* const group = node_proto->add_attribute(); + group->set_name("group"); + group->set_type(AttributeProto::INT); + group->set_i(static_cast(num_group)); +} // end ConvertConvolution + +void ConvertPooling(NodeProto* node_proto, const NodeAttrs& attrs, + const nnvm::IndexedGraph& /*ig*/, + const array_view& /*inputs*/) { + const auto& pooling_param = nnvm::get(attrs.parsed); + + const TShape kernel = pooling_param.kernel; + const TShape stride = pooling_param.stride; + const TShape pad = pooling_param.pad; + const int pool_type = pooling_param.pool_type; + const bool global_pool = pooling_param.global_pool; + + if (global_pool) { + if (pool_type == 0) { + node_proto->set_op_type("GlobalMaxPool"); + } else { + node_proto->set_op_type("GlobalAveragePool"); + } + return; + } + + // kernel_shape + AttributeProto* const kernel_shape = node_proto->add_attribute(); + kernel_shape->set_name("kernel_shape"); + kernel_shape->set_type(AttributeProto::INTS); + for (int kval : kernel) { + kernel_shape->add_ints(static_cast(kval)); + } + + // pads + AttributeProto* const pads = node_proto->add_attribute(); + pads->set_name("pads"); + pads->set_type(AttributeProto::INTS); + for (int kval : pad) { + pads->add_ints(static_cast(kval)); + } + + // strides + AttributeProto* const strides = node_proto->add_attribute(); + strides->set_name("strides"); + strides->set_type(AttributeProto::INTS); + for (int kval : stride) { + strides->add_ints(static_cast(kval)); + } + + if (pool_type == 0) { + node_proto->set_op_type("MaxPool"); + } else { + node_proto->set_op_type("AveragePool"); + } // average pooling + // not global pooling +} // end ConvertPooling + +void ConvertActivation(NodeProto* node_proto, const NodeAttrs& attrs, + const nnvm::IndexedGraph& /*ig*/, + const array_view& /*inputs*/) { + const auto& act_param = nnvm::get(attrs.parsed); + std::string act_type; + switch (act_param.act_type) { + case op::activation::kReLU: + act_type = "Relu"; + break; + case op::activation::kSigmoid: + act_type = "Sigmoid"; + break; + case op::activation::kTanh: + act_type = "Tanh"; + break; + case op::activation::kSoftReLU: + // act_type = "SoftReLU"; + throw dmlc::Error("SoftReLU is not supported in ONNX"); + break; + default: + throw dmlc::Error("Activation of such type doesn't exist"); + } + + node_proto->set_op_type(act_type); +} + +void ConvertFullyConnected(NodeProto* node_proto, const NodeAttrs& attrs, + const nnvm::IndexedGraph& /*ig*/, + const array_view& /*inputs*/) { + const auto& act_param = nnvm::get(attrs.parsed); + if (act_param.no_bias) { + node_proto->set_op_type("MatMul"); + } else { + node_proto->set_op_type("Gemm"); + + AttributeProto* const alpha = node_proto->add_attribute(); + alpha->set_name("alpha"); + alpha->set_type(AttributeProto::FLOAT); + alpha->set_f(1.0f); + + AttributeProto* const beta = node_proto->add_attribute(); + beta->set_name("beta"); + beta->set_type(AttributeProto::FLOAT); + beta->set_f(1.0f); + + AttributeProto* const broadcast = node_proto->add_attribute(); + broadcast->set_name("broadcast"); + broadcast->set_type(AttributeProto::INT); + broadcast->set_i(1); + + AttributeProto* const transA = node_proto->add_attribute(); + transA->set_name("transA"); + 
transA->set_type(AttributeProto::INT); + transA->set_i(0); + + AttributeProto* const transB = node_proto->add_attribute(); + transB->set_name("transB"); + transB->set_type(AttributeProto::INT); + transB->set_i(1); + } +} + +void ConvertSoftmaxOutput(NodeProto* node_proto, const NodeAttrs& /*attrs*/, + const nnvm::IndexedGraph& /*ig*/, + const array_view& /*inputs*/) { + node_proto->set_op_type("Softmax"); + + // Setting by default to 1 since MXNet doesn't provide such an attribute for softmax in its + // node params. This attribute is only relevant when the input is coerced to 2D, and in that + // case dimension 0 is assumed to be the batch dimension. + AttributeProto* const axis = node_proto->add_attribute(); + axis->set_name("axis"); + axis->set_type(AttributeProto::INT); + axis->set_i(1); +} + +void ConvertFlatten(NodeProto* node_proto, const NodeAttrs& /*attrs*/, + const nnvm::IndexedGraph& /*ig*/, + const array_view& /*inputs*/) { + node_proto->set_op_type("Flatten"); + + // Setting by default to 1 since MXNet doesn't provide such an attribute for Flatten in its + // node params. This attribute is only relevant when the input is coerced to 2D, and in that + // case dimension 0 is assumed to be the batch dimension. + AttributeProto* const axis = node_proto->add_attribute(); + axis->set_name("axis"); + axis->set_type(AttributeProto::INT); + axis->set_i(1); +} + +void ConvertBatchNorm(NodeProto* node_proto, const NodeAttrs& attrs, + const nnvm::IndexedGraph& /*ig*/, + const array_view& /*inputs*/) { + node_proto->set_op_type("BatchNormalization"); + const auto& param = nnvm::get(attrs.parsed); + + AttributeProto* const epsilon = node_proto->add_attribute(); + epsilon->set_name("epsilon"); + epsilon->set_type(AttributeProto::FLOAT); + epsilon->set_f(static_cast(param.eps)); + + AttributeProto* const is_test = node_proto->add_attribute(); + is_test->set_name("is_test"); + is_test->set_type(AttributeProto::INT); + is_test->set_i(1); + + AttributeProto* const momentum = node_proto->add_attribute(); + momentum->set_name("momentum"); + momentum->set_type(AttributeProto::FLOAT); + momentum->set_f(param.momentum); + + AttributeProto* const spatial = node_proto->add_attribute(); + spatial->set_name("spatial"); + spatial->set_type(AttributeProto::INT); + spatial->set_i(1); + + AttributeProto* const consumed = node_proto->add_attribute(); + consumed->set_name("consumed_inputs"); + consumed->set_type(AttributeProto::INTS); + + for (int i = 0; i < 5; i++) { + int val = (i < 3) ? 
0 : 1; + consumed->add_ints(static_cast(val)); + } +} + +void ConvertElementwiseAdd(NodeProto* node_proto, const NodeAttrs& /*attrs*/, + const nnvm::IndexedGraph& /*ig*/, + const array_view& /*inputs*/) { + node_proto->set_op_type("Add"); + AttributeProto* const axis = node_proto->add_attribute(); + axis->set_name("axis"); + axis->set_type(AttributeProto::INT); + axis->set_i(1); + + AttributeProto* const broadcast = node_proto->add_attribute(); + broadcast->set_name("broadcast"); + broadcast->set_type(AttributeProto::INT); + broadcast->set_i(0); // 1 +} + +std::unordered_map GetPlaceholderShapes( + const ShapeVector& shape_inputs, const nnvm::IndexedGraph& ig) { + std::unordered_map placeholder_shapes; + for (uint32_t i = 0; i < shape_inputs.size(); ++i) { + std::string name = ig[ig.input_nodes()[i]].source->attrs.name; + TShape shp = shape_inputs[i]; + if (shp.ndim() > 0) { + placeholder_shapes.emplace(name, shp); + } + } + return placeholder_shapes; +} + +std::unordered_map GetOutputLookup( + const nnvm::IndexedGraph& ig) { + std::unordered_map output_lookup; + const std::vector& graph_outputs = + ig.outputs(); + for (uint32_t i = 0; i < graph_outputs.size(); ++i) { + const uint32_t id = graph_outputs[i].node_id; + const IndexedGraph::Node ig_node = ig[id]; + const nnvm::Node* const source = ig_node.source; + const std::string name = source->attrs.name; + output_lookup.emplace(name, i); + } + return output_lookup; +} + +void ConvertPlaceholder( + const std::string& node_name, + const std::unordered_map& placeholder_shapes, + GraphProto* const graph_proto) { + auto val_info_proto = graph_proto->add_input(); + auto type_proto = val_info_proto->mutable_type()->mutable_tensor_type(); + auto shape_proto = type_proto->mutable_shape(); + + val_info_proto->set_name(node_name); + // Will support fp16, etc. 
in the near future + type_proto->set_elem_type(TensorProto_DataType_FLOAT); + auto entry_shape = placeholder_shapes.find(node_name)->second; + + for (const auto& elem : entry_shape) { + TensorShapeProto_Dimension* const tsp_dim = shape_proto->add_dim(); + tsp_dim->set_dim_value(static_cast(elem)); + } +} + +void ConvertConstant( + GraphProto* const graph_proto, const std::string& node_name, + std::unordered_map* const shared_buffer) { + NodeProto* const node_proto = graph_proto->add_node(); + node_proto->set_name(node_name); + node_proto->add_output(node_name); + node_proto->set_op_type("Constant"); + + const NDArray nd = shared_buffer->find(node_name)->second; + const TBlob& blob = nd.data(); + const TShape shape = blob.shape_; + const int32_t size = shape.Size(); + + std::shared_ptr shared_data_ptr(new float[size]); + float* const data_ptr = shared_data_ptr.get(); + nd.SyncCopyToCPU(static_cast(data_ptr), size); + + AttributeProto* const tensor_attr = node_proto->add_attribute(); + tensor_attr->set_name("value"); + tensor_attr->set_type(AttributeProto::TENSOR); + + TensorProto* const tensor_proto = tensor_attr->mutable_t(); + tensor_proto->set_data_type(TensorProto_DataType_FLOAT); + for (auto& dim : shape) { + tensor_proto->add_dims(static_cast(dim)); + } + + for (int blob_idx = 0; blob_idx < size; ++blob_idx) { + tensor_proto->add_float_data(data_ptr[blob_idx]); + } +} + +void ConvertOutput( + op::tensorrt::InferenceMap_t* const trt_output_map, + GraphProto* const graph_proto, + const std::unordered_map::iterator& out_iter, + const std::string& node_name, const nnvm::Graph& g, + const StorageTypeVector& storage_types, const DTypeVector& dtypes) { + const nnvm::IndexedGraph& ig = g.indexed_graph(); + uint32_t out_idx = ig.entry_id(ig.outputs()[out_iter->second]); + TShape out_shape = g.GetAttr("shape")[out_idx]; + int storage_type = storage_types[out_idx]; + int dtype = dtypes[out_idx]; + + // This should work with fp16 as well + op::tensorrt::InferenceTuple_t out_tuple{out_iter->second, out_shape, storage_type, + dtype}; + + trt_output_map->emplace(node_name, out_tuple); + + auto graph_out = graph_proto->add_output(); + auto tensor_type = graph_out->mutable_type()->mutable_tensor_type(); + auto tensor_shape_proto = tensor_type->mutable_shape(); + graph_out->set_name(node_name); + + // Also support fp16. + tensor_type->set_elem_type(TensorProto_DataType_FLOAT); + + for (int64_t dim_shp : out_shape) { + TensorShapeProto_Dimension* const tsp_dim = tensor_shape_proto->add_dim(); + tsp_dim->set_dim_value(static_cast(dim_shp)); + } +} + +} // namespace nnvm_to_onnx +} // namespace op +} // namespace mxnet + +#endif // MXNET_USE_TENSORRT diff --git a/src/operator/contrib/tensorrt-inl.h b/src/operator/contrib/tensorrt-inl.h new file mode 100644 index 000000000000..be335ab1208f --- /dev/null +++ b/src/operator/contrib/tensorrt-inl.h @@ -0,0 +1,113 @@ +#ifndef MXNET_OPERATOR_CONTRIB_TENSORRT_INL_H_ +#define MXNET_OPERATOR_CONTRIB_TENSORRT_INL_H_ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2018 by Contributors + * \file tensorrt-inl.h + * \brief TensorRT Operator + * \author Marek Kolodziej, Clement Fuji Tsang +*/ + +#if MXNET_USE_TENSORRT + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../operator_common.h" +#include "../../common/utils.h" +#include "../../common/serialization.h" +#include "../../executor/exec_pass.h" +#include "../../executor/graph_executor.h" +#include "../../executor/onnx_to_tensorrt.h" + +namespace mxnet { +namespace op { + +using namespace nnvm; +using namespace ::onnx; +using int64 = ::google::protobuf::int64; + +namespace tensorrt { + enum class TypeIO { Inputs = 0, Outputs = 1 }; + using NameToIdx_t = std::map; + using InferenceTuple_t = std::tuple; + using InferenceMap_t = std::map; +} // namespace tensorrt + +using trt_name_to_idx = std::map; + +struct TRTParam : public dmlc::Parameter { + std::string serialized_onnx_graph; + std::string serialized_input_map; + std::string serialized_output_map; + tensorrt::NameToIdx_t input_map; + tensorrt::InferenceMap_t output_map; + ::onnx::ModelProto onnx_pb_graph; + + TRTParam() {} + + TRTParam(const ::onnx::ModelProto& onnx_graph, + const tensorrt::InferenceMap_t& input_map, + const tensorrt::NameToIdx_t& output_map) { + common::Serialize(input_map, &serialized_input_map); + common::Serialize(output_map, &serialized_output_map); + onnx_graph.SerializeToString(&serialized_onnx_graph); + } + +DMLC_DECLARE_PARAMETER(TRTParam) { + DMLC_DECLARE_FIELD(serialized_onnx_graph) + .describe("Serialized ONNX graph"); + DMLC_DECLARE_FIELD(serialized_input_map) + .describe("Map from inputs to topological order as input."); + DMLC_DECLARE_FIELD(serialized_output_map) + .describe("Map from outputs to order in g.outputs."); + } +}; + +struct TRTEngineParam { + nvinfer1::IExecutionContext* trt_executor; + std::vector > binding_map; +}; + +} // namespace op +} // namespace mxnet + +#endif // MXNET_USE_TENSORRT + +#endif // MXNET_OPERATOR_CONTRIB_TENSORRT_INL_H_ diff --git a/src/operator/contrib/tensorrt.cc b/src/operator/contrib/tensorrt.cc new file mode 100644 index 000000000000..619fe1e2b8f4 --- /dev/null +++ b/src/operator/contrib/tensorrt.cc @@ -0,0 +1,183 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2018 by Contributors + * \file trt.cc + * \brief TensorRT operation registration + * \author Marek Kolodziej, Clement Fuji Tsang +*/ + +#if MXNET_USE_TENSORRT + +#include "./tensorrt-inl.h" + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "../../common/serialization.h" +#include "../../common/utils.h" + +namespace mxnet { +namespace op { + +DMLC_REGISTER_PARAMETER(TRTParam); + +OpStatePtr GetPtrMapping(nvinfer1::ICudaEngine* trt_engine, + tensorrt::NameToIdx_t input_map, + tensorrt::NameToIdx_t output_map) { + TRTEngineParam param; + for (int b = 0; b < trt_engine->getNbBindings(); ++b) { + const std::string& binding_name = trt_engine->getBindingName(b); + if (trt_engine->bindingIsInput(b)) { + param.binding_map.emplace_back(input_map[binding_name], + tensorrt::TypeIO::Inputs); + } else { + param.binding_map.emplace_back(output_map[binding_name], + tensorrt::TypeIO::Outputs); + } + } + param.trt_executor = trt_engine->createExecutionContext(); + return OpStatePtr::Create(param); +} + +OpStatePtr TRTCreateState(const nnvm::NodeAttrs& attrs, Context /*ctx*/, + const std::vector& /*ishape*/, + const std::vector& /*itype*/) { + const auto& node_param = nnvm::get(attrs.parsed); + + ::onnx::ModelProto model_proto; + bool success = model_proto.ParseFromString(node_param.serialized_onnx_graph); + if (!success) { + LOG(FATAL) << "Problems parsing serialized ONNX model."; + } + auto graph = model_proto.graph(); + auto first_input_type = graph.input(0).type().tensor_type(); + auto dim_value = first_input_type.shape().dim(0).dim_value(); + auto batch_size = static_cast(dim_value); + // Need to set up max workspace size based on device properties + nvinfer1::ICudaEngine* const trt_engine = ::onnx_to_tensorrt::onnxToTrtCtx( + node_param.serialized_onnx_graph, batch_size, 1 << 30); + + tensorrt::NameToIdx_t output_map; + for (auto& el : node_param.output_map) { + output_map[el.first] = std::get<0>(el.second); + } + return GetPtrMapping(trt_engine, node_param.input_map, output_map); +} + +void TRTParamParser(nnvm::NodeAttrs* attrs) { + TRTParam param_; + + try { + param_.Init(attrs->dict); + common::Deserialize(¶m_.input_map, param_.serialized_input_map); + common::Deserialize(¶m_.output_map, param_.serialized_output_map); + param_.onnx_pb_graph.ParseFromString(param_.serialized_onnx_graph); + } catch (const dmlc::ParamError& e) { + std::ostringstream os; + os << e.what(); + os << ", in operator " << attrs->op->name << "(" + << "name=\"" << attrs->name << "\""; + for (const auto& k : attrs->dict) { + os << ", " << k.first << "=\"" << k.second << "\""; + } + os << ")"; + throw dmlc::ParamError(os.str()); + } + + attrs->parsed = std::move(param_); +} + +inline bool TRTInferShape(const NodeAttrs& attrs, std::vector* /*in_shape*/, + std::vector* out_shape) { + const auto &node_param = nnvm::get(attrs.parsed); + for (auto& el : node_param.output_map) { + (*out_shape)[std::get<0>(el.second)] = std::get<1>(el.second); + } + return true; +} + +inline bool TRTInferStorageType(const NodeAttrs& /*attrs*/, const int /*dev_mask*/, + DispatchMode* dispatch_mode, + std::vector* /*in_storage_type*/, + std::vector* out_storage_type) { + return storage_type_assign(out_storage_type, mxnet::kDefaultStorage, + dispatch_mode, DispatchMode::kFCompute); +} + +inline bool TRTInferType(const NodeAttrs& attrs, std::vector* /*in_dtype*/, + std::vector* 
out_dtype) { + const auto& node_param = nnvm::get(attrs.parsed); + for (auto& el : node_param.output_map) { + (*out_dtype)[std::get<0>(el.second)] = std::get<3>(el.second); + } + return true; +} + +inline std::vector TRTListInputNames(const NodeAttrs& attrs) { + std::vector output; + const auto& node_param = nnvm::get(attrs.parsed); + output.resize(node_param.input_map.size()); + for (auto& el : node_param.input_map) { + output[el.second] = el.first; + } + return output; +} + +inline std::vector TRTListOutputNames(const NodeAttrs& attrs) { + std::vector output; + const auto& node_param = nnvm::get(attrs.parsed); + output.resize(node_param.output_map.size()); + for (auto& el : node_param.output_map) { + output[std::get<0>(el.second)] = el.first; + } + return output; +} + +NNVM_REGISTER_OP(_trt_op) + .describe(R"code(TRT operation (one engine) +)code" ADD_FILELINE) + .set_num_inputs([](const NodeAttrs& attrs) { + const auto& node_param = nnvm::get(attrs.parsed); + return node_param.input_map.size(); + }) + .set_num_outputs([](const NodeAttrs& attrs) { + const auto& node_param = nnvm::get(attrs.parsed); + return node_param.output_map.size(); + }) + .set_attr_parser(TRTParamParser) + .set_attr("FInferShape", TRTInferShape) + .set_attr("FInferType", TRTInferType) + .set_attr("FListInputNames", TRTListInputNames) + .set_attr("FListOutputNames", TRTListOutputNames) + .set_attr("FCreateOpState", TRTCreateState) + .set_attr("FInferStorageType", TRTInferStorageType); + +} // namespace op +} // namespace mxnet + +#endif // MXNET_USE_TENSORRT diff --git a/src/operator/contrib/tensorrt.cu b/src/operator/contrib/tensorrt.cu new file mode 100644 index 000000000000..2fe8727b73e4 --- /dev/null +++ b/src/operator/contrib/tensorrt.cu @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * Copyright (c) 2018 by Contributors + * \file trt.cu + * \brief TensorRT GPU operation + * \author Marek Kolodziej, Clement Fuji Tsang +*/ + +#if MXNET_USE_TENSORRT + +#include "./tensorrt-inl.h" + +namespace mxnet { +namespace op { + +#define CHECK_CUDART(x) do { \ + cudaError_t res = (x); \ + if (res != cudaSuccess) { \ + fprintf(stderr, "CUDART: %s = %d (%s) at (%s:%d)\n", \ + #x, res, cudaGetErrorString(res), __FILE__, __LINE__); \ + exit(1); \ + } \ +} while (0) + +void TRTCompute(const OpStatePtr& state, const OpContext& ctx, + const std::vector& inputs, const std::vector& req, + const std::vector& outputs) { + using namespace mshadow; + using namespace mshadow::expr; + + Stream* s = ctx.get_stream(); + cudaStream_t cuda_s = Stream::GetStream(s); + const auto& param = state.get_state(); + std::vector bindings; + bindings.reserve(param.binding_map.size()); + for (auto& p : param.binding_map) { + if (p.second == tensorrt::TypeIO::Inputs) { + bindings.emplace_back(inputs[p.first].dptr_); + } else { + bindings.emplace_back(outputs[p.first].dptr_); + } + } + + const int batch_size = static_cast(inputs[0].shape_[0]); + param.trt_executor->enqueue(batch_size, bindings.data(), cuda_s, nullptr); + CHECK_CUDART(cudaStreamSynchronize(cuda_s)); +} + +NNVM_REGISTER_OP(_trt_op) +.set_attr("FStatefulCompute", TRTCompute); + +} // namespace op +} // namespace mxnet + +#endif // MXNET_USE_TENSORRT diff --git a/src/operator/control_flow.cc b/src/operator/control_flow.cc index 7c1beccb0288..d6b6703ddd58 100644 --- a/src/operator/control_flow.cc +++ b/src/operator/control_flow.cc @@ -1225,6 +1225,9 @@ static bool BackwardCondStorageType(const nnvm::NodeAttrs& attrs, CHECK(sync_in_in(input_locs, out_attrs, &subg_out_attrs, is_udf)); return ret; }; + for (const dim_t &cond_in : params.cond_input_locs) { + (*out_attrs)[cond_in] = kDefaultStorage; + } bool succ_0 = sub_pass(attrs.subgraphs[1], params.then_input_locs); bool succ_1 = sub_pass(attrs.subgraphs[2], params.else_input_locs); return succ_0 && succ_1; diff --git a/src/operator/leaky_relu-inl.h b/src/operator/leaky_relu-inl.h index 20aabc8ae32f..1c4f48b32ed2 100644 --- a/src/operator/leaky_relu-inl.h +++ b/src/operator/leaky_relu-inl.h @@ -47,7 +47,7 @@ namespace op { namespace leakyrelu { enum LeakyReLUOpInputs {kData, kGamma}; enum LeakyReLUOpOutputs {kOut, kMask}; -enum LeakyReLUOpType {kLeakyReLU, kPReLU, kRReLU, kELU}; +enum LeakyReLUOpType {kLeakyReLU, kPReLU, kRReLU, kELU, kSELU}; enum LeakyReLUOpResource {kRandom}; } // namespace leakyrelu @@ -63,6 +63,7 @@ struct LeakyReLUParam : public dmlc::Parameter { .add_enum("leaky", leakyrelu::kLeakyReLU) .add_enum("prelu", leakyrelu::kPReLU) .add_enum("elu", leakyrelu::kELU) + .add_enum("selu", leakyrelu::kSELU) .describe("Activation function to be applied."); DMLC_DECLARE_FIELD(slope).set_default(0.25f) .describe("Init slope for the activation. 
(For leaky and elu only)"); @@ -182,6 +183,13 @@ class LeakyReLUOp : public Operator { }); break; } + case leakyrelu::kSELU: { + MXNET_ASSIGN_REQ_SWITCH(req[leakyrelu::kOut], Req, { + mxnet_op::Kernel, xpu>::Launch( + s, out.size(0) * out.size(1) * out.size(2), out.dptr_, data.dptr_); + }); + break; + } default: LOG(FATAL) << "Not implmented"; } @@ -270,6 +278,15 @@ class LeakyReLUOp : public Operator { }); break; } + case leakyrelu::kSELU: { + MXNET_ASSIGN_REQ_SWITCH(req[leakyrelu::kData], Req, { + mxnet_op::Kernel, Req>, xpu>::Launch( + s, gdata.size(0) * gdata.size(1) * gdata.size(2), gdata.dptr_, grad.dptr_, + output.dptr_); + }); + break; + } default: LOG(FATAL) << "Not implmented"; } diff --git a/src/operator/leaky_relu.cc b/src/operator/leaky_relu.cc index 99b6ba362f75..4bb24237b8ed 100644 --- a/src/operator/leaky_relu.cc +++ b/src/operator/leaky_relu.cc @@ -54,6 +54,8 @@ when the input is negative and has a slope of one when input is positive. The following modified ReLU Activation functions are supported: - *elu*: Exponential Linear Unit. `y = x > 0 ? x : slope * (exp(x)-1)` +- *selu*: Scaled Exponential Linear Unit. `y = lambda * (x > 0 ? x : alpha * (exp(x) - 1))` where + *lambda = 1.0507009873554804934193349852946* and *alpha = 1.6732632423543772848170429916717*. - *leaky*: Leaky ReLU. `y = x > 0 ? x : slope * x` - *prelu*: Parametric ReLU. This is same as *leaky* except that `slope` is learnt during training. - *rrelu*: Randomized ReLU. same as *leaky* but the `slope` is uniformly and randomly chosen from diff --git a/src/operator/mshadow_op.h b/src/operator/mshadow_op.h index 7a2032df7580..339719375fdd 100644 --- a/src/operator/mshadow_op.h +++ b/src/operator/mshadow_op.h @@ -42,8 +42,12 @@ namespace mshadow_op { #ifdef __CUDA_ARCH__ __constant__ const float PI = 3.14159265358979323846; +__constant__ const float SELU_ALPHA = 1.6732632423543772848170429916717; +__constant__ const float SELU_LAMBDA = 1.0507009873554804934193349852946; #else const float PI = 3.14159265358979323846; +const float SELU_ALPHA = 1.6732632423543772848170429916717; +const float SELU_LAMBDA = 1.0507009873554804934193349852946; using std::isnan; #endif using std::enable_if; @@ -126,6 +130,12 @@ MXNET_UNARY_MATH_OP_NC(relu, a > DType(0) ? a : DType(0)); MXNET_UNARY_MATH_OP_NC(relu_grad, a > DType(0) ? DType(1) : DType(0)); +MXNET_UNARY_MATH_OP_NC(selu, DType(SELU_LAMBDA) * + (a > DType(0) ? a : DType(math::id(SELU_ALPHA) * math::expm1(a)))); + +MXNET_UNARY_MATH_OP_NC(selu_grad, + DType(SELU_LAMBDA) * (a > DType(0) ? DType(1) : DType(SELU_ALPHA + a))); + MXNET_BINARY_MATH_OP_NC(prelu_grad, a > DType(0) ? DType(0) : a); MXNET_BINARY_MATH_OP_NC(xelu, a > DType(0) ? a : diff --git a/src/operator/nn/convolution-inl.h b/src/operator/nn/convolution-inl.h index d40abaf1fd66..a5f384ec44a8 100644 --- a/src/operator/nn/convolution-inl.h +++ b/src/operator/nn/convolution-inl.h @@ -103,7 +103,8 @@ struct ConvolutionParam : public dmlc::Parameter { .add_enum("NDHWC", mshadow::kNDHWC) .set_default(dmlc::optional()) .describe("Set layout for input, output and weight. Empty for\n " - "default layout: NCW for 1d, NCHW for 2d and NCDHW for 3d."); + "default layout: NCW for 1d, NCHW for 2d and NCDHW for 3d." + "NHWC and NDHWC are only supported on GPU."); } // Adjusts kernel size for effects of dilation in the dimension `dim`. 
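// (A dilated kernel covers 1 + (kernel[dim] - 1) * dilate[dim] input elements.)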
index_t DilatedKernelSize(int dim) const { diff --git a/src/operator/nn/convolution.cc b/src/operator/nn/convolution.cc index ef70ccd6ec1e..e87962363ff5 100644 --- a/src/operator/nn/convolution.cc +++ b/src/operator/nn/convolution.cc @@ -426,7 +426,7 @@ then we have:: If ``no_bias`` is set to be true, then the ``bias`` term is ignored. The default data ``layout`` is *NCHW*, namely *(batch_size, channel, height, -width)*. We can choose other layouts such as *NHWC*. +width)*. We can choose other layouts such as *NWC*. If ``num_group`` is larger than 1, denoted by *g*, then split the input ``data`` evenly into *g* parts along the channel axis, and also evenly split ``weight`` diff --git a/src/operator/nn/cudnn/cudnn_convolution-inl.h b/src/operator/nn/cudnn/cudnn_convolution-inl.h index 827c89faad1f..acdd64976651 100644 --- a/src/operator/nn/cudnn/cudnn_convolution-inl.h +++ b/src/operator/nn/cudnn/cudnn_convolution-inl.h @@ -690,6 +690,7 @@ class CuDNNConvolutionOp { const int kMaxAlgos = 10; int nalgo = kMaxAlgos; int i = 0; + size_t min_memory_needs = 0; // Forward Algorithm Find/Get, v6 and earlier if (CUDNN_MAJOR == 6 && param_.layout.value() == mshadow::kNHWC) { // In cuDNNv6, for kNHWC, only CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM is @@ -720,10 +721,16 @@ class CuDNNConvolutionOp { while (i < nalgo && (fwd_algo[i].status != CUDNN_STATUS_SUCCESS || (param_.cudnn_tune.value() == conv::kLimited - && fwd_algo[i].memory > workspace_byte))) + && fwd_algo[i].memory > workspace_byte))) { ++i; + min_memory_needs = + (i == 0) ? fwd_algo[i].memory : std::min(min_memory_needs, fwd_algo[i].memory); + } if (i == nalgo) { - LOG(FATAL) << "Failed to find a forward convolution algorithm."; + LOG(FATAL) << nalgo << " forward algorithms with minimum memory requirement " + << min_memory_needs << " bytes have been tried. Workspace size is set to " + << workspace_byte << " bytes, please consider reducing the batch/model size, " + << "or increasing workspace size."; } else { forward_algo_.Set(fwd_algo[i].algo, false); } @@ -754,10 +761,17 @@ class CuDNNConvolutionOp { while (i < nalgo && (bwd_filter_algo[i].status != CUDNN_STATUS_SUCCESS || (param_.cudnn_tune.value() == conv::kLimited - && bwd_filter_algo[i].memory > workspace_byte))) + && bwd_filter_algo[i].memory > workspace_byte))) { ++i; + min_memory_needs = (i == 0) ? + bwd_filter_algo[i].memory : + std::min(min_memory_needs, bwd_filter_algo[i].memory); + } if (i == nalgo) { - LOG(FATAL) << "Failed to find a backward filter convolution algorithm."; + LOG(FATAL) << nalgo << " backward filter algorithms with minimum memory requirement " + << min_memory_needs << " bytes have been tried. Workspace size is set to " + << workspace_byte << " bytes, please consider reducing the batch/model size, " + << "or increasing workspace size."; } else { back_algo_w_.Set(bwd_filter_algo[i].algo, false); } @@ -788,10 +802,17 @@ class CuDNNConvolutionOp { while (i < nalgo && (bwd_data_algo[i].status != CUDNN_STATUS_SUCCESS || (param_.cudnn_tune.value() == conv::kLimited - && bwd_data_algo[i].memory > workspace_byte))) + && bwd_data_algo[i].memory > workspace_byte))) { ++i; + min_memory_needs = (i == 0) ? + bwd_data_algo[i].memory : + std::min(min_memory_needs, bwd_data_algo[i].memory); + } if (i == nalgo) { - LOG(FATAL) << "Failed to find a backward data convolution algorithm."; + LOG(FATAL) << nalgo << " backward data algorithms with minimum memory requirement " + << min_memory_needs << " bytes have been tried. 
Workspace size is set to " + << workspace_byte << " bytes, please consider reducing the batch/model size, " + << "or increasing workspace size."; } else { back_algo_.Set(bwd_data_algo[i].algo, false); } @@ -846,7 +867,9 @@ class CuDNNConvolutionOp { } } auto mode = param_.cudnn_tune.value() == conv::kOff ? " get " : " find "; - LOG(FATAL) << "Failed to" << mode << "any " << kernel_name << " convolution algorithm."; + LOG(FATAL) << "Failed to" << mode << "any " << kernel_name << " convolution algorithm. " + << " with workspace size of " << workspace_byte << " bytes," + << " please consider reducing batch/model size or increasing the workspace size"; } void GetTempSize(const OpContext& ctx) { diff --git a/src/operator/nn/cudnn/cudnn_deconvolution-inl.h b/src/operator/nn/cudnn/cudnn_deconvolution-inl.h index f1b40cce27f1..041bea66f7bf 100644 --- a/src/operator/nn/cudnn/cudnn_deconvolution-inl.h +++ b/src/operator/nn/cudnn/cudnn_deconvolution-inl.h @@ -623,6 +623,7 @@ class CuDNNDeconvolutionOp { const int kMaxAlgos = 10; int nalgo = kMaxAlgos; int i = 0; + size_t min_memory_needs = 0; // Forward Algorithm Find/Get, v6 and earlier if (CUDNN_MAJOR == 6 && param_.layout.value() == mshadow::kNHWC) { // In cuDNNv6, for kNHWC, only CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM is @@ -653,11 +654,19 @@ class CuDNNDeconvolutionOp { while (i < nalgo && (fwd_algo[i].status != CUDNN_STATUS_SUCCESS || (param_.cudnn_tune.value() == deconv::kLimited - && fwd_algo[i].memory > workspace_byte))) + && fwd_algo[i].memory > workspace_byte))) { ++i; + min_memory_needs = (i == 0) ? + fwd_algo[i].memory : + std::min(min_memory_needs, fwd_algo[i].memory); + } if (i == nalgo) { - LOG(FATAL) << "Failed to find a 'forward' convolution algorithm " << - "(for use in deconvolution operator backprop-to-data)."; + LOG(FATAL) << nalgo << " forward algorithms" + << " (for use in deconvolution operator backprop-to-data)" + << " with minimum memory requirement " << min_memory_needs + << " bytes have been tried. Workspace size is set to " << workspace_byte + << " bytes, please consider reducing the batch/model size," + << " or increasing workspace size."; } else { forward_algo_.Set(fwd_algo[i].algo, false); } @@ -688,11 +697,19 @@ class CuDNNDeconvolutionOp { while (i < nalgo && (bwd_filter_algo[i].status != CUDNN_STATUS_SUCCESS || (param_.cudnn_tune.value() == deconv::kLimited - && bwd_filter_algo[i].memory > workspace_byte))) + && bwd_filter_algo[i].memory > workspace_byte))) { ++i; + min_memory_needs = (i == 0) ? + bwd_filter_algo[i].memory : + std::min(min_memory_needs, bwd_filter_algo[i].memory); + } if (i == nalgo) { - LOG(FATAL) << "Failed to find a backward filter convolution algorithm " << - "(for use in deconvolution operator backprop-to-filter)."; + LOG(FATAL) << nalgo << " backward filter algorithms" + << " (for use in deconvolution operator backprop-to-filter)" + << " with minimum memory requirement " << min_memory_needs + << " bytes have been tried. Workspace size is set to " << workspace_byte + << " bytes, please consider reducing the batch/model size," + << " or increasing workspace size."; } else { back_algo_w_.Set(bwd_filter_algo[i].algo, false); } @@ -723,11 +740,19 @@ class CuDNNDeconvolutionOp { while (i < nalgo && (bwd_data_algo[i].status != CUDNN_STATUS_SUCCESS || (param_.cudnn_tune.value() == deconv::kLimited - && bwd_data_algo[i].memory > workspace_byte))) + && bwd_data_algo[i].memory > workspace_byte))) { ++i; + min_memory_needs = (i == 0) ? 
+ bwd_data_algo[i].memory : + std::min(min_memory_needs, bwd_data_algo[i].memory); + } if (i == nalgo) { - LOG(FATAL) << "Failed to find a backward data convolution algorithm." << - "(for use in deconvolution operator forward inference)."; + LOG(FATAL) << nalgo << " backward data algorithms" + << " (for use in deconvolution operator forward inference) with" + << " minimum memory requirement " << min_memory_needs + << " bytes have been tried. Workspace size is set to " << workspace_byte + << " bytes, please consider reducing the batch/model size," + << " or increasing workspace size."; } else { back_algo_.Set(bwd_data_algo[i].algo, false); } @@ -788,7 +813,9 @@ class CuDNNDeconvolutionOp { } } auto mode = param_.cudnn_tune.value() == conv::kOff ? " get " : " find "; - LOG(FATAL) << "Failed to" << mode << "any " << kernel_name << " deconvolution algorithm."; + LOG(FATAL) << "Failed to" << mode << "any " << kernel_name << " deconvolution algorithm" + << " with workspace size of " << workspace_byte << " bytes," + << " please consider reducing batch/model size or increasing the workspace size"; } void GetTempSize(const OpContext& ctx) { diff --git a/src/operator/nn/deconvolution-inl.h b/src/operator/nn/deconvolution-inl.h index b41ecf4aa41e..c7ccfb2fb4e8 100644 --- a/src/operator/nn/deconvolution-inl.h +++ b/src/operator/nn/deconvolution-inl.h @@ -113,7 +113,8 @@ struct DeconvolutionParam : public dmlc::Parameter { .add_enum("NDHWC", mshadow::kNDHWC) .set_default(dmlc::optional()) .describe("Set layout for input, output and weight. Empty for " - "default layout, NCW for 1d, NCHW for 2d and NCDHW for 3d."); + "default layout, NCW for 1d, NCHW for 2d and NCDHW for 3d." + "NHWC and NDHWC are only supported on GPU."); } template diff --git a/src/operator/nn/fully_connected.cc b/src/operator/nn/fully_connected.cc index d1d84e975290..f720a10c0022 100644 --- a/src/operator/nn/fully_connected.cc +++ b/src/operator/nn/fully_connected.cc @@ -250,9 +250,15 @@ The learnable parameters include both ``weight`` and ``bias``. If ``no_bias`` is set to be true, then the ``bias`` term is ignored. -Note that the operator also supports forward computation with `row_sparse` weight and bias, -where the length of `weight.indices` and `bias.indices` must be equal to `num_hidden`. -This could be used for model inference with `row_sparse` weights trained with `SparseEmbedding`. +.. Note:: + + The sparse support for FullyConnected is limited to forward evaluation with `row_sparse` + weight and bias, where the length of `weight.indices` and `bias.indices` must be equal + to `num_hidden`. This could be useful for model inference with `row_sparse` weights + trained with importance sampling or noise contrastive estimation. + + To compute linear transformation with 'csr' sparse data, sparse.dot is recommended instead + of sparse.FullyConnected. 
)code" ADD_FILELINE) .set_num_inputs([](const NodeAttrs& attrs) { diff --git a/src/operator/nn/lrn.cc b/src/operator/nn/lrn.cc index 6b3d7c818378..30a752340a5b 100644 --- a/src/operator/nn/lrn.cc +++ b/src/operator/nn/lrn.cc @@ -204,6 +204,8 @@ NNVM_REGISTER_OP(_backward_LRN) .set_attr("TIsBackward", true) #if MXNET_USE_MKLDNN == 1 .set_attr("FComputeEx", LRNGradComputeExCPU) +// Native compute requires norm while MKLDNN does not so cannot be compared in debug mode +.set_attr("TExcludeMKLDNNDebug", true) #endif .set_attr("FCompute", LRNGradCompute); diff --git a/src/operator/nn/mkldnn/mkldnn_base.cc b/src/operator/nn/mkldnn/mkldnn_base.cc index 4e4982e96ee5..27c574deae53 100644 --- a/src/operator/nn/mkldnn/mkldnn_base.cc +++ b/src/operator/nn/mkldnn/mkldnn_base.cc @@ -473,9 +473,11 @@ void OpCheck::Init(const std::vector &inputs_, auto ctx = inputs_[0].ctx(); CHECK(!MKLDNNStream::Get()->HasOps()); for (size_t i = 0; i < inputs_.size(); i++) { - inputs.emplace_back(inputs_[i].shape(), ctx, - false, inputs_[i].dtype()); - auto mem = inputs_[i].GetMKLDNNData(); + NDArray data = inputs_[i]; + inputs.emplace_back(data.shape(), ctx, false, data.dtype()); + if (data.IsMKLDNNData() && data.IsView()) + data = data.Reorder2Default(); + auto mem = data.GetMKLDNNData(); inputs[i].CopyFrom(*mem); } for (size_t i = 0; i < outputs_.size(); i++) { @@ -494,6 +496,11 @@ void OpCheck::Run(mxnet::FCompute fn, const nnvm::NodeAttrs &attrs, const std::vector &inputs_, const std::vector &req, const std::vector &outputs_) { + static auto& is_excluded = Op::GetAttr("TExcludeMKLDNNDebug"); + if (is_excluded.get(attrs.op, false)) { + LOG(WARNING) << attrs.op->name << " not checked. TExcludeMKLDNNDebug flag present"; + return; + } std::vector in_blobs(inputs.size()); for (size_t i = 0; i < in_blobs.size(); i++) in_blobs[i] = inputs[i].data(); std::vector out_blobs(outputs.size()); @@ -509,7 +516,7 @@ void OpCheck::Run(mxnet::FCompute fn, const nnvm::NodeAttrs &attrs, if (req[i] == kNullOp) continue; MSHADOW_TYPE_SWITCH(outputs[i].dtype(), DType, { - bool similar = SimilarArray(outputs[i], outputs_[i], 1e-3, 1e-3); + bool similar = SimilarArray(outputs[i], outputs_[i], 1e-2, 1e-2); if (!similar) { LOG(ERROR) << attrs.op->name << " fails"; } diff --git a/src/operator/nn/pooling.cc b/src/operator/nn/pooling.cc index 9b6996d0feb0..2380f0fc21fb 100644 --- a/src/operator/nn/pooling.cc +++ b/src/operator/nn/pooling.cc @@ -377,8 +377,7 @@ We can see that Lp pooling stands between those two, in practice the most common For each window ``X``, the mathematical expression for Lp pooling is: -..math:: - f(X) = \sqrt{p}{\sum\limits_{x \in X} x^p} +:math:`f(X) = \sqrt[p]{\sum_{x}^{X} x^p}` )code" ADD_FILELINE) .set_num_inputs(1) diff --git a/src/operator/operator_tune.cc b/src/operator/operator_tune.cc index 0953cbaf519b..cf5412f98246 100644 --- a/src/operator/operator_tune.cc +++ b/src/operator/operator_tune.cc @@ -217,6 +217,8 @@ IMPLEMENT_UNARY_WORKLOAD_FWD(mxnet::op::mshadow_op::softsign); // NOLINT() IMPLEMENT_UNARY_WORKLOAD_BWD(mxnet::op::mshadow_op::softsign_grad); // NOLINT() IMPLEMENT_UNARY_WORKLOAD_FWD(mxnet::op::mshadow_op::relu); // NOLINT() IMPLEMENT_UNARY_WORKLOAD_BWD(mxnet::op::mshadow_op::relu_grad); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_FWD(mxnet::op::mshadow_op::selu); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_BWD(mxnet::op::mshadow_op::selu_grad); // NOLINT() IMPLEMENT_UNARY_WORKLOAD_FWD(mxnet::op::mshadow_op::tanh); // NOLINT() IMPLEMENT_UNARY_WORKLOAD_BWD(mxnet::op::mshadow_op::tanh_grad); // NOLINT() 
IMPLEMENT_UNARY_WORKLOAD_FWD(mxnet::op::mshadow_op::softrelu); // NOLINT() diff --git a/src/operator/quantization/quantize_graph_pass.cc b/src/operator/quantization/quantize_graph_pass.cc index 5376a0ee9f11..10834868d2b5 100644 --- a/src/operator/quantization/quantize_graph_pass.cc +++ b/src/operator/quantization/quantize_graph_pass.cc @@ -221,6 +221,9 @@ Graph QuantizeGraph(Graph &&src) { new_node->inputs.emplace_back(NodeEntry{dequantize_node, 0, 0}); mirror_map[e.node.get()] = std::move(dequantize_node); + } else if (mirror_node->op() != nullptr + && mirror_node->op()->name == "_contrib_quantize") { + new_node->inputs.emplace_back(NodeEntry{mirror_node->inputs[0].node, e.index, e.version}); } else { new_node->inputs.emplace_back(NodeEntry{mirror_node, e.index, e.version}); } diff --git a/src/operator/random/unique_sample_op.cc b/src/operator/random/unique_sample_op.cc new file mode 100644 index 000000000000..49366697ed6e --- /dev/null +++ b/src/operator/random/unique_sample_op.cc @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2016 by Contributors + * \file unique_sample_op.cc + * \brief CPU Implementation of unique sample op + */ + +#include "./unique_sample_op.h" +#include "../tensor/init_op.h" + +namespace mxnet { +namespace op { + +DMLC_REGISTER_PARAMETER(SampleUniqueZifpianParam); + +#define MXNET_OPERATOR_REGISTER_UNIQUE_SAMPLE(name, ParamType) \ + NNVM_REGISTER_OP(name) \ + .set_num_inputs(0) \ + .set_num_outputs(2) \ + .set_attr_parser(ParamParser) \ + .set_attr("FResourceRequest", UniqueSampleResource) \ + .add_arguments(ParamType::__FIELDS__()) + +MXNET_OPERATOR_REGISTER_UNIQUE_SAMPLE(_sample_unique_zipfian, + SampleUniqueZifpianParam) +.describe(R"code(Draw random samples from an approximately log-uniform +or Zipfian distribution without replacement. + +This operation takes a 2-D shape `(batch_size, num_sampled)`, +and randomly generates *num_sampled* samples from the range of integers [0, range_max) +for each instance in the batch. + +The elements in each instance are drawn without replacement from the base distribution. +The base distribution for this operator is an approximately log-uniform or Zipfian distribution: + + P(class) = (log(class + 2) - log(class + 1)) / log(range_max + 1) + +Additionally, it also returns the number of trials used to obtain `num_sampled` samples for +each instance in the batch. 
+ +Example:: + + samples, trials = _sample_unique_zipfian(750000, shape=(4, 8192)) + unique(samples[0]) = 8192 + unique(samples[3]) = 8192 + trials[0] = 16435 + +)code" ADD_FILELINE) +.set_attr("FInferShape", SampleUniqueShape) +.set_attr("FInferType", SampleUniqueType) +.set_attr("FCompute", SampleUniqueZifpian); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/random/unique_sample_op.h b/src/operator/random/unique_sample_op.h new file mode 100644 index 000000000000..2e93b501f1b4 --- /dev/null +++ b/src/operator/random/unique_sample_op.h @@ -0,0 +1,170 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2018 by Contributors + * \file sample_op.h + * \brief Elementary unique sampling operators + */ +#ifndef MXNET_OPERATOR_RANDOM_UNIQUE_SAMPLE_OP_H_ +#define MXNET_OPERATOR_RANDOM_UNIQUE_SAMPLE_OP_H_ + +#include +#include +#include +#include +#include +#include +#include +#include "../mxnet_op.h" +#include "../operator_common.h" +#include "./sampler.h" + +namespace mxnet { +namespace op { + +struct SampleUniqueZifpianParam : public dmlc::Parameter { + int range_max; + TShape shape; + DMLC_DECLARE_PARAMETER(SampleUniqueZifpianParam) { + DMLC_DECLARE_FIELD(range_max) + .describe("The number of possible classes."); + DMLC_DECLARE_FIELD(shape) + .set_default(TShape()) + .describe("2-D shape of the output, where shape[0] is the batch size, and shape[1] " + "is the number of candidates to sample for each batch."); + } +}; + +template +inline bool SampleUniqueShape(const nnvm::NodeAttrs& attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + const ParamType& param = nnvm::get(attrs.parsed); + CHECK_EQ(in_attrs->size(), 0U); + CHECK_EQ(out_attrs->size(), 2U); + // output shape is known + if ((*out_attrs)[0].ndim() == 2 && param.shape.ndim() == 0) { + SHAPE_ASSIGN_CHECK(*out_attrs, 1, mshadow::Shape1((*out_attrs)[0][0])); + return true; + } + CHECK_EQ(param.shape.ndim(), 2U); + SHAPE_ASSIGN_CHECK(*out_attrs, 0, param.shape); + SHAPE_ASSIGN_CHECK(*out_attrs, 1, mshadow::Shape1(param.shape[0])); + return true; +} + +template +inline bool SampleUniqueType(const nnvm::NodeAttrs& attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + CHECK_EQ(in_attrs->size(), 0U); + CHECK_EQ(out_attrs->size(), 2U); + TYPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::kInt64); + TYPE_ASSIGN_CHECK(*out_attrs, 1, mshadow::kInt64); + return true; +} + +inline std::vector UniqueSampleResource(const NodeAttrs& attrs) { + return {ResourceRequest::kParallelRandom}; +} + +/*! 
+/*!
+ * \brief Launch a generic kernel with a parallel unique random generator.
+ * \param gen random generator
+ * \param batch_size the batch size
+ * \param num_sampled the number of unique samples per batch
+ * \tparam Args Varargs type to eventually pass to the OP::Map() function
+ */
+template <typename OP, typename xpu, typename GType, typename DType, typename... Args>
+inline static void LaunchUniqueRNG(mshadow::Stream<xpu> *s,
+                                   common::random::RandGenerator<xpu, GType> *gen,
+                                   const int batch_size, const size_t num_sampled,
+                                   std::vector<std::unordered_set<DType>> *results,
+                                   Args... args) {
+  // minimal check to avoid division by zero, below.
+  // if `N` is zero the map operation is a no-op in any case.
+  if (batch_size <= 0 || num_sampled <= 0) return;
+  const int nthread = std::min(batch_size, RandGenerator<xpu, GType>::kNumRandomStates);
+  const int step = (batch_size + nthread - 1) / nthread;
+  Kernel<OP, xpu>::Launch(s, nthread, *gen, batch_size, num_sampled, results, step, args...);
+}
+
+struct UniqueSampleUniformKernel {
+  template <typename GType, typename DType>
+  MSHADOW_XINLINE static void Map(int tid, RandGenerator<cpu, GType> gen,
+                                  const int batch_size, const size_t num_sampled,
+                                  std::vector<std::unordered_set<DType>> *results,
+                                  const int step, const GType log_range_max,
+                                  DType *samples, DType *num_tries) {
+    const int begin = tid * step;
+    const int end = (tid + 1) * step;
+    typename RandGenerator<cpu, GType>::Impl generator(&gen, tid);
+    for (int i = begin; i < end && i < batch_size; i++) {
+      auto &result = results->at(i);
+      const int base = i * num_sampled;
+      DType tries = 0;
+      while (result.size() != num_sampled) {
+        const double x = generator.uniform();
+        const DType value = static_cast<DType>(lround(exp(x * log_range_max)) - 1);
+        // sampling without replacement
+        if (result.find(value) == result.end()) {
+          samples[base + result.size()] = value;
+          result.emplace(value);
+        }
+        tries += 1;
+      }
+      num_tries[i] = tries;
+    }
+  }
+};
+
+inline void SampleUniqueZifpian(const nnvm::NodeAttrs& attrs,
+                                const OpContext& ctx,
+                                const std::vector<TBlob>& inputs,
+                                const std::vector<OpReqType>& req,
+                                const std::vector<TBlob>& outputs) {
+  using DType = int64_t;
+  using GType = double;
+  const SampleUniqueZifpianParam& param = nnvm::get<SampleUniqueZifpianParam>(attrs.parsed);
+  const int batch_size = param.shape[0];
+  const size_t num_sampled = static_cast<size_t>(param.shape[1]);
+  const double log_range_max = log(param.range_max);
+  CHECK_EQ(outputs.size(), 2U);
+  CHECK_LE(num_sampled, param.range_max)
+    << "Number of samples cannot exceed the number of possible classes";
+  // rand generator resource and result sets
+  RandGenerator<cpu, GType> *pgen = ctx.requested[0].get_parallel_random<cpu, GType>();
+  std::vector<std::unordered_set<DType>> results(batch_size);
+  for (int i = 0; i < batch_size; i++) {
+    results[i].reserve(num_sampled);
+  }
+
+  DType *num_tries = outputs[1].dptr<DType>();
+  DType *samples = outputs[0].dptr<DType>();
+  Stream<cpu> *s = ctx.get_stream<cpu>();
+  LaunchUniqueRNG<UniqueSampleUniformKernel>(s, pgen, batch_size, num_sampled,
+                                             &results, log_range_max, samples,
+                                             num_tries);
+}
+
+
+}  // namespace op
+}  // namespace mxnet
+#endif  // MXNET_OPERATOR_RANDOM_UNIQUE_SAMPLE_OP_H_
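The kernel above is easier to follow as a reference implementation. This NumPy sketch (illustrative only; the helper name is hypothetical) mirrors the kernel's inverse-CDF draw, `round(exp(u * log(range_max))) - 1`, and its rejection step, including the trial count returned as the second output:

```python
import numpy as np

def sample_unique_zipfian(range_max, num_sampled, rng=np.random):
    """Draw num_sampled distinct, approximately log-uniform class ids."""
    log_range_max = np.log(range_max)
    seen, samples, tries = set(), [], 0
    while len(samples) < num_sampled:
        x = rng.uniform()                                  # u ~ U(0, 1)
        value = int(round(np.exp(x * log_range_max))) - 1  # in [0, range_max)
        if value not in seen:                              # reject duplicates
            seen.add(value)
            samples.append(value)
        tries += 1
    return np.array(samples, dtype=np.int64), tries

samples, trials = sample_unique_zipfian(750000, 8192)
assert len(set(samples)) == 8192   # all samples unique, as in the docstring
```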
diff --git a/src/operator/tensor/elemwise_unary_op.h b/src/operator/tensor/elemwise_unary_op.h
index 0c37a941fb67..e09a6cccddbf 100644
--- a/src/operator/tensor/elemwise_unary_op.h
+++ b/src/operator/tensor/elemwise_unary_op.h
@@ -476,6 +476,34 @@ void HardSigmoidBackward(const nnvm::NodeAttrs& attrs,
   });
 }
 
+struct ReshapeLikeParam : public dmlc::Parameter<ReshapeLikeParam> {
+  dmlc::optional<int> lhs_begin, rhs_begin, lhs_end, rhs_end;
+  DMLC_DECLARE_PARAMETER(ReshapeLikeParam) {
+    DMLC_DECLARE_FIELD(lhs_begin)
+        .set_default(dmlc::optional<int>())
+        .describe(
+            "Defaults to 0. "
+            "The beginning index along which the lhs dimensions are to be "
+            "reshaped. Supports negative indices.");
+    DMLC_DECLARE_FIELD(lhs_end)
+        .set_default(dmlc::optional<int>())
+        .describe("Defaults to None. "
+                  "The ending index along which the lhs dimensions are to be "
+                  "used for reshaping. Supports negative indices.");
+    DMLC_DECLARE_FIELD(rhs_begin)
+        .set_default(dmlc::optional<int>())
+        .describe("Defaults to 0. "
+                  "The beginning index along which the rhs dimensions are to "
+                  "be used for reshaping. Supports negative indices.");
+    DMLC_DECLARE_FIELD(rhs_end)
+        .set_default(dmlc::optional<int>())
+        .describe("Defaults to None. "
+                  "The ending index along which the rhs dimensions are to be "
+                  "used for reshaping. Supports negative indices.");
+  }
+};
+
 /*! \brief Unary compute */
 #define MXNET_OPERATOR_REGISTER_UNARY(__name$)                      \
   NNVM_REGISTER_OP(__name$)                                         \
diff --git a/src/operator/tensor/elemwise_unary_op_basic.cc b/src/operator/tensor/elemwise_unary_op_basic.cc
index 929bc7426d5f..f7f21f9076a6 100644
--- a/src/operator/tensor/elemwise_unary_op_basic.cc
+++ b/src/operator/tensor/elemwise_unary_op_basic.cc
@@ -350,10 +350,109 @@ NNVM_REGISTER_OP(_identity_with_attr_like_rhs)
 .add_argument("lhs", "NDArray-or-Symbol", "First input.")
 .add_argument("rhs", "NDArray-or-Symbol", "Second input.");
 
+void ReshapeLikeRangeCanonicalize(int ndims, const char *side,
+                                  const dmlc::optional<int> &begin,
+                                  const dmlc::optional<int> &end, int *cbegin,
+                                  int *cend) {
+  *cbegin = begin.has_value() ? begin.value() : 0;
+  if (*cbegin < 0)
+    *cbegin += ndims;
+
+  if (!end.has_value()) {
+    *cend = ndims;
+  } else {
+    *cend = end.value();
+    if (*cend < 0) {
+      *cend += ndims;
+    }
+  }
+  CHECK(*cend <= ndims) << "Invalid end for " << side << "_end=" << end
+                        << " as dimension number is " << ndims;
+  CHECK((*cbegin < *cend)) << "Invalid begin, end, get " << side
+                           << "_begin=" << begin << ", " << side
+                           << "_end=" << end;
+
+  CHECK(*cend >= 0) << "Invalid end for " << side << "_end=" << end;
+  CHECK(*cbegin >= 0) << "Invalid begin for " << side << "_begin=" << begin;
+}
+
+void GetReshapeLikeParams(const ReshapeLikeParam &param, const TShape &lshape,
+                          const TShape &rshape, int *lhs_begin, int *lhs_end,
+                          int *rhs_begin, int *rhs_end) {
+  // LHS params
+  ReshapeLikeRangeCanonicalize(lshape.ndim(), "lhs", param.lhs_begin,
+                               param.lhs_end, lhs_begin, lhs_end);
+  // RHS params
+  ReshapeLikeRangeCanonicalize(rshape.ndim(), "rhs", param.rhs_begin,
+                               param.rhs_end, rhs_begin, rhs_end);
+}
+bool ReshapeLikeShapeCompute(const nnvm::NodeAttrs &attrs,
+                             std::vector<TShape> *in_attrs,
+                             std::vector<TShape> *out_attrs) {
+  const ReshapeLikeParam &param = nnvm::get<ReshapeLikeParam>(attrs.parsed);
+  const TShape &lshape = (*in_attrs)[0];
+  const TShape &rshape = (*in_attrs)[1];
+  int lhs_begin, lhs_end, rhs_begin, rhs_end;
+  GetReshapeLikeParams(param, lshape, rshape, &lhs_begin, &lhs_end, &rhs_begin,
+                       &rhs_end);
+
+  int lhsrank = static_cast<int>(lshape.ndim());
+  int orank = lhsrank + (rhs_end - rhs_begin) - (lhs_end - lhs_begin);
+  TShape oshape(orank);
+
+  for (int i = 0; i < lhs_begin; ++i)
+    oshape[i] = lshape[i];
+
+  int opos = lhs_begin;
+  for (int i = rhs_begin; i < rhs_end; ++i) {
+    oshape[opos] = rshape[i];
+    opos += 1;
+  }
+
+  for (int i = lhs_end; i < lhsrank; ++i) {
+    oshape[opos] = lshape[i];
+    opos += 1;
+  }
+
+  CHECK_EQ((*in_attrs)[0].Size(), oshape.Size())
+      << "Cannot reshape lhs with shape " << (*in_attrs)[0] << " to new "
+      << "shape " << oshape << " because they have different "
+      << "size.";
+  SHAPE_ASSIGN_CHECK(*out_attrs, 0, oshape);
+  return true;
+}
+
+DMLC_REGISTER_PARAMETER(ReshapeLikeParam);
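A worked example makes the index arithmetic above easier to follow. This editorial Python sketch (hypothetical helper names, not part of the patch) implements the same canonicalization and splicing rule, checked against the (30,7)/(15,2,4) example used in the operator docs below:

```python
from functools import reduce

def canonicalize(ndim, begin, end):
    # None defaults to 0 / ndim; negative indices count from the end
    cbegin = 0 if begin is None else (begin + ndim if begin < 0 else begin)
    cend = ndim if end is None else (end + ndim if end < 0 else end)
    assert 0 <= cbegin < cend <= ndim
    return cbegin, cend

def reshape_like_shape(lshape, rshape, lhs_begin=None, lhs_end=None,
                       rhs_begin=None, rhs_end=None):
    lb, le = canonicalize(len(lshape), lhs_begin, lhs_end)
    rb, re = canonicalize(len(rshape), rhs_begin, rhs_end)
    # lhs dims in [lb, le) are replaced by rhs dims in [rb, re)
    oshape = lshape[:lb] + rshape[rb:re] + lshape[le:]
    prod = lambda s: reduce(lambda a, b: a * b, s, 1)
    assert prod(lshape) == prod(oshape)  # same CHECK_EQ as the C++ code
    return oshape

assert reshape_like_shape((30, 7), (15, 2, 4), 0, 1, 0, 2) == (15, 2, 7)
```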
 NNVM_REGISTER_OP(reshape_like)
-.describe("Reshape lhs to have the same shape as rhs.")
+.describe(R"code(Reshape some or all dimensions of `lhs` to have the same shape as some or all dimensions of `rhs`.
+
+Returns a **view** of the `lhs` array with a new shape without altering any data.
+
+Example::
+
+  x = [1, 2, 3, 4, 5, 6]
+  y = [[0, -4], [3, 2], [2, 2]]
+  reshape_like(x, y) = [[1, 2], [3, 4], [5, 6]]
+
+More precise control over how dimensions are inherited is achieved by specifying \
+slices over the `lhs` and `rhs` array dimensions. Only the sliced `lhs` dimensions \
+are reshaped to the `rhs` sliced dimensions, with the non-sliced `lhs` dimensions staying the same.
+
+  Examples::
+
+  - lhs shape = (30,7), rhs shape = (15,2,4), lhs_begin=0, lhs_end=1, rhs_begin=0, rhs_end=2, output shape = (15,2,7)
+  - lhs shape = (3, 5), rhs shape = (1,15,4), lhs_begin=0, lhs_end=2, rhs_begin=1, rhs_end=2, output shape = (15)
+
+Negative indices are supported, and `None` can be used for either `lhs_end` or `rhs_end` to indicate the end of the range.
+
+  Example::
+
+  - lhs shape = (30, 12), rhs shape = (4, 2, 2, 3), lhs_begin=-1, lhs_end=None, rhs_begin=1, rhs_end=None, output shape = (30, 2, 2, 3)
+
+)code" ADD_FILELINE)
 .set_num_inputs(2)
+.set_attr_parser(ParamParser<ReshapeLikeParam>)
 .set_attr<nnvm::FListInputNames>("FListInputNames",
   [](const NodeAttrs& attrs) { return std::vector<std::string>{"lhs", "rhs"}; })
 .set_attr(
@@ -365,19 +464,7 @@ NNVM_REGISTER_OP(reshape_like)
 .set_attr<nnvm::FIgnoreInputs>("FIgnoreInputs",
   [](const NodeAttrs& attrs) { return std::vector<uint32_t>(1, 1); })
 .set_attr<FCompute>("FCompute<cpu>", UnaryOp::IdentityCompute<cpu>)
-.set_attr<nnvm::FInferShape>("FInferShape",
-  [](const nnvm::NodeAttrs& attrs,
-     std::vector<TShape> *in_attrs,
-     std::vector<TShape> *out_attrs) {
-    if ((*in_attrs)[0].ndim()) {
-      CHECK_EQ((*in_attrs)[0].Size(), (*in_attrs)[1].Size())
-        << "Cannot reshape lhs with shape " << (*in_attrs)[0] << " to rhs "
-        << "with shape " << (*in_attrs)[1] << " because they have different "
-        << "size.";
-    }
-    SHAPE_ASSIGN_CHECK(*out_attrs, 0, (*in_attrs)[1]);
-    return true;
-  })
+.set_attr<nnvm::FInferShape>("FInferShape", ReshapeLikeShapeCompute)
 .set_attr<nnvm::FInferType>("FInferType", ElemwiseType<2, 1>)
 .set_attr<nnvm::FGradient>(
     "FGradient",  [](const nnvm::NodePtr& n,
@@ -438,7 +525,8 @@ Example::
     TYPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::kInt64);
     return out_attrs->at(0) != -1;
   })
-.add_argument("data", "NDArray-or-Symbol", "Input Array.");
+.add_argument("data", "NDArray-or-Symbol", "Input Array.")
+.add_arguments(ReshapeLikeParam::__FIELDS__());
 
 void SizeComputeCPU(const nnvm::NodeAttrs& attrs,
                     const OpContext& ctx,
diff --git a/src/operator/tensor/indexing_op.cc b/src/operator/tensor/indexing_op.cc
index ef59145bb4a9..b663ef0179df 100644
--- a/src/operator/tensor/indexing_op.cc
+++ b/src/operator/tensor/indexing_op.cc
@@ -570,6 +570,10 @@ Examples::
   indices = [[1, 1, 0], [0, 1, 0]]
   gather_nd(data, indices) = [2, 3, 0]
 
+  data = [[[1, 2], [3, 4]], [[5, 6], [7, 8]]]
+  indices = [[0, 1], [1, 0]]
+  gather_nd(data, indices) = [[3, 4], [5, 6]]
+
 )code")
 .set_num_outputs(1)
 .set_num_inputs(2)
@@ -629,6 +633,21 @@ Examples::
   shape = (2, 2)
   scatter_nd(data, indices, shape) = [[0, 0], [2, 3]]
 
+  data = [[[1, 2], [3, 4]], [[5, 6], [7, 8]]]
+  indices = [[0, 1], [1, 1]]
+  shape = (2, 2, 2, 2)
+  scatter_nd(data, indices, shape) = [[[[0, 0],
+                                        [0, 0]],
+
+                                       [[1, 2],
+                                        [3, 4]]],
+
+                                      [[[0, 0],
+                                        [0, 0]],
+
+                                       [[5, 6],
+                                        [7, 8]]]]
+
 )code")
 .set_num_outputs(1)
 .set_num_inputs(2)
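The new docstring examples can be reproduced from the Python frontend; a minimal sketch, assuming the stock `mx.nd.gather_nd` and `mx.nd.scatter_nd` bindings (illustrative, not part of the patch). Note that `indices` carries one leading row per indexed dimension of the output/input:

```python
import mxnet as mx

data = mx.nd.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]])
indices = mx.nd.array([[0, 1], [1, 0]])
print(mx.nd.gather_nd(data, indices))   # [[3, 4], [5, 6]]

data = mx.nd.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]])
indices = mx.nd.array([[0, 1], [1, 1]])
out = mx.nd.scatter_nd(data, indices, shape=(2, 2, 2, 2))
# positions (0,1) and (1,1) of the output receive the two 2x2 slices of data
```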
diff --git a/src/operator/tensor/indexing_op.h b/src/operator/tensor/indexing_op.h
index edaf9397303a..1daf0a2cb18a 100644
--- a/src/operator/tensor/indexing_op.h
+++ b/src/operator/tensor/indexing_op.h
@@ -1034,8 +1034,8 @@ void TakeOpBackward(const nnvm::NodeAttrs& attrs,
   using namespace mshadow::expr;
   CHECK_EQ(inputs.size(), 2U);
   CHECK_EQ(outputs.size(), 2U);
-  CHECK_EQ(req[take_::kIdx], kNullOp)
-    << "take layer doesn't support gradient into index";
+  CHECK_NE(req[take_::kIdx], kAddTo)
+    << "take layer doesn't support gradient of req type kAddTo to index";
 
   const TakeParam& param = nnvm::get<TakeParam>(attrs.parsed);
@@ -1052,6 +1052,11 @@ void TakeOpBackward(const nnvm::NodeAttrs& attrs,
     const TShape& arrshape = outputs[0].shape_;
     const TShape& oshape = inputs[0].shape_;
 
+    if (req[take_::kIdx] != kNullOp) {
+      mxnet_op::Kernel<mxnet_op::set_zero, xpu>::Launch(
+        s, idxshape.Size(), outputs[take_::kIdx].dptr<IType>());
+    }
+
     const int actual_axis = param.axis + ((param.axis < 0) ? arrshape.ndim() : 0);
     int idxndim = idxshape.ndim();
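The relaxed check above changes observable behavior: requesting a gradient with respect to `take`'s indices no longer aborts; the index gradient is simply written as zeros. A hedged Python sketch of the new behavior (illustrative; exact dtype handling of float indices may differ):

```python
import mxnet as mx
from mxnet import autograd

data = mx.nd.arange(10)
idx = mx.nd.array([1, 3, 5])
data.attach_grad()
idx.attach_grad()   # previously this path tripped CHECK_EQ(req, kNullOp)
with autograd.record():
    y = mx.nd.take(data, idx)
y.backward()
print(idx.grad)     # zeros: take is not differentiable w.r.t. its indices
```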
diff --git a/src/operator/tensor/ordering_op-inl.h b/src/operator/tensor/ordering_op-inl.h
index 105ee8b90db8..a6f638e29321 100644
--- a/src/operator/tensor/ordering_op-inl.h
+++ b/src/operator/tensor/ordering_op-inl.h
@@ -170,11 +170,13 @@ MSHADOW_FORCE_INLINE void TopKSort(const Tensor<cpu, 1, real_t>& dat,
   // Use full sort when K is relatively large.
   const bool full_sort(K*8 > N);
   // Batch size.
-  const int M(dat.size(0)/N);
+  const int M(work.size(0)/(sizeof(real_t)*N));
   const int omp_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount());
   #pragma omp parallel for num_threads(omp_threads)
   for (int i = 0; i < M; ++i) {
-    real_t *vals = dat.dptr_;
+    // Tensor `work` stores the flattened source data, while `dat` stores the sorted result.
+    real_t *vals = reinterpret_cast<real_t*>(work.dptr_);
+    real_t *sorted_vals = dat.dptr_+i*N;
     int *indices = ind.dptr_+i*N;
     if (is_ascend) {
       if (full_sort) {
@@ -193,11 +195,9 @@ MSHADOW_FORCE_INLINE void TopKSort(const Tensor<cpu, 1, real_t>& dat,
         [&](const int& i1, const int& i2){ return vals[i1] > vals[i2]; });
       }
     }
-    real_t *buff = reinterpret_cast<real_t*>(work.dptr_)+i*K;
     for (int j = 0; j < K; ++j) {
-      buff[j] = vals[indices[j]];
+      sorted_vals[j] = vals[indices[j]];
    }
-    std::copy(buff, buff+K, &vals[i*N]);
   }
 }
@@ -380,16 +380,7 @@ void TopKImpl(RunContext ctx,
   indices = Tensor<xpu, 1, int>(reinterpret_cast<int*>(workspace_curr_ptr),
                                 Shape1(src.Size()), s);  // indices in the original matrix
   workspace_curr_ptr += sizeof(int) * src.Size();
-  if (do_transpose) {
-    sorted_dat = reshape(transpose(dat, Shape3(0, 2, 1)), Shape1(src.Size()));
-  } else {
-    sorted_dat = reshape(dat, Shape1(src.Size()));
-  }
-  mxnet_op::Kernel<range_fwd, xpu>::Launch(s, batch_size * element_num, 1, 0, 1,
-                                           kWriteTo, indices.dptr_);
-  CHECK_EQ(sorted_dat.CheckContiguous(), true);
-  CHECK_EQ(indices.CheckContiguous(), true);
+
   if (param.ret_typ == topk_enum::kReturnMask) {
     sel_indices = Tensor<xpu, 1, int>(reinterpret_cast<int*>(workspace_curr_ptr),
                                       Shape1(batch_size * k), s);
@@ -401,15 +392,49 @@ void TopKImpl(RunContext ctx,
     CHECK_EQ(sel_indices.CheckContiguous(), true);
     CHECK_EQ(mask_val.CheckContiguous(), true);
   }
-  temp_workspace = Tensor<xpu, 1, char>(workspace_curr_ptr, Shape1(temp_size), s);  // temp space
-  workspace_curr_ptr += temp_size;
+
+  if (std::is_same<xpu, cpu>::value) {
+    Tensor<xpu, 1, real_t> flattened_data;
+    if (do_transpose) {
+      flattened_data = Tensor<xpu, 1, real_t>(reinterpret_cast<real_t*>(workspace_curr_ptr),
+                                              Shape1(src.Size()), s);
+      workspace_curr_ptr += sizeof(real_t) * src.Size();
+      flattened_data = reshape(transpose(dat, Shape3(0, 2, 1)), Shape1(src.Size()));
+      CHECK_EQ(flattened_data.CheckContiguous(), true);
+    } else {
+      flattened_data = src.FlatTo1D<xpu, real_t>(s);
+    }
+    // `temp_workspace` stores the flattened data
+    temp_workspace = Tensor<xpu, 1, char>(reinterpret_cast<char*>(flattened_data.dptr_),
+                                          Shape1(sizeof(real_t)*src.Size()), s);
+    CHECK_EQ(temp_workspace.CheckContiguous(), true);
+  } else {
+    if (do_transpose) {
+      sorted_dat = reshape(transpose(dat, Shape3(0, 2, 1)), Shape1(src.Size()));
+    } else {
+      sorted_dat = reshape(dat, Shape1(src.Size()));
+    }
+    CHECK_EQ(sorted_dat.CheckContiguous(), true);
+    temp_workspace = Tensor<xpu, 1, char>(workspace_curr_ptr, Shape1(temp_size), s);  // temp space
+    workspace_curr_ptr += temp_size;
+  }
+
+  mxnet_op::Kernel<range_fwd, xpu>::Launch(s, batch_size * element_num, 1, 0, 1,
+                                           kWriteTo, indices.dptr_);
+  CHECK_EQ(indices.CheckContiguous(), true);
 
   // 2. Perform inplace batch sort.
   // After sorting, each batch in `sorted_dat` will be sorted in the corresponding order
   // up to the k-th element and the `indices` will contain the corresponding index in `sorted_dat`
+  // `temp_workspace` is used to store the flattened source data on the CPU device, and serves as
+  // a temporary buffer on the GPU device.
   TopKSort(sorted_dat, indices, temp_workspace, k, element_num, is_ascend, s);
 
   // 3. Assign results to the ret blob
+  // When returning indices, update (modulo) only the required elements instead of all elements
+  // to avoid redundant computation.
+  // Casting `ret_indices` from int to real_t can introduce conversion errors when element_num
+  // is large enough.
   if (param.ret_typ == topk_enum::kReturnMask) {
     Tensor<xpu, 2, real_t> ret_mask =
       ret[0].get_with_shape<xpu, 2, real_t>(Shape2(ret[0].Size(), 1), s);
@@ -427,24 +452,25 @@ void TopKImpl(RunContext ctx,
     }
     IndexFill(ret_mask, sel_indices, mask_val);
   } else if (param.ret_typ == topk_enum::kReturnIndices) {
-    indices = F<mshadow_op::mod>(indices, element_num);
     if (do_transpose) {
       Tensor<xpu, 3, real_t> ret_indices = ret[0].FlatTo3D<xpu, real_t>(axis, axis, s);
-      ret_indices = tcast<real_t>(transpose(
-                      slice<2>(inplace_reshape(indices,
-                                               Shape3(ret_indices.shape_[0],
-                                                      ret_indices.shape_[2],
-                                                      element_num)),
-                               0, k),
-                      Shape3(0, 2, 1)));
+      ret_indices = tcast<real_t>(F<mshadow_op::mod>(
+                      transpose(slice<2>(inplace_reshape(indices,
+                                                         Shape3(ret_indices.shape_[0],
+                                                                ret_indices.shape_[2],
+                                                                element_num)),
+                                         0, k),
+                                Shape3(0, 2, 1)),
+                      element_num));
     } else {
       Tensor<xpu, 2, real_t> ret_indices =
         ret[0].get_with_shape<xpu, 2, real_t>(Shape2(batch_size, k), s);
-      ret_indices = tcast<real_t>(slice<1>(
-                      inplace_reshape(indices, Shape2(batch_size, element_num)), 0, k));
+      ret_indices = tcast<real_t>(F<mshadow_op::mod>(
+                      slice<1>(inplace_reshape(indices, Shape2(batch_size, element_num)),
+                               0, k),
+                      element_num));
     }
   } else {
-    indices = F<mshadow_op::mod>(indices, element_num);
     if (do_transpose) {
       Tensor<xpu, 3, real_t> ret_value = ret[0].FlatTo3D<xpu, real_t>(axis, axis, s);
       Tensor<xpu, 3, real_t> ret_indices = ret[1].FlatTo3D<xpu, real_t>(axis, axis, s);
@@ -453,21 +479,24 @@ void TopKImpl(RunContext ctx,
                     Shape3(ret_value.shape_[0], ret_value.shape_[2], element_num)), 0, k),
                   Shape3(0, 2, 1));
-      ret_indices = tcast<real_t>(transpose(
-                      slice<2>(inplace_reshape(indices,
-                                               Shape3(ret_indices.shape_[0],
-                                                      ret_indices.shape_[2],
-                                                      element_num)),
-                               0, k),
-                      Shape3(0, 2, 1)));
+      ret_indices = tcast<real_t>(F<mshadow_op::mod>(
+                      transpose(slice<2>(inplace_reshape(indices,
+                                                         Shape3(ret_indices.shape_[0],
+                                                                ret_indices.shape_[2],
+                                                                element_num)),
+                                         0, k),
+                                Shape3(0, 2, 1)),
+                      element_num));
     } else {
       Tensor<xpu, 2, real_t> ret_value =
         ret[0].get_with_shape<xpu, 2, real_t>(Shape2(batch_size, k), s);
       Tensor<xpu, 2, real_t> ret_indices =
         ret[1].get_with_shape<xpu, 2, real_t>(Shape2(batch_size, k), s);
       ret_value = slice<1>(inplace_reshape(sorted_dat, Shape2(batch_size, element_num)), 0, k);
-      ret_indices = tcast<real_t>(slice<1>(
-                      inplace_reshape(indices, Shape2(batch_size, element_num)), 0, k));
+      ret_indices = tcast<real_t>(F<mshadow_op::mod>(
+                      slice<1>(inplace_reshape(indices, Shape2(batch_size, element_num)),
+                               0, k),
+                      element_num));
     }
   }
 }
diff --git a/tests/.gitignore b/tests/.gitignore
index d6459089c245..3e5eed695f0a 100644
--- a/tests/.gitignore
+++ 
b/tests/.gitignore @@ -1 +1,2 @@ *_unittest +*.gz diff --git a/tests/ci_build/pip_tests/Dockerfile.in.pip_cpu b/tests/ci_build/pip_tests/Dockerfile.in.pip_cpu deleted file mode 100644 index de4629fab2e7..000000000000 --- a/tests/ci_build/pip_tests/Dockerfile.in.pip_cpu +++ /dev/null @@ -1,4 +0,0 @@ -# -*- mode: dockerfile -*- -# dockerfile to test pip installation on CPU - -FROM ubuntu:16.04 diff --git a/tests/ci_build/pip_tests/Dockerfile.in.pip_cu75 b/tests/ci_build/pip_tests/Dockerfile.in.pip_cu75 deleted file mode 100644 index a41e6a953306..000000000000 --- a/tests/ci_build/pip_tests/Dockerfile.in.pip_cu75 +++ /dev/null @@ -1,4 +0,0 @@ -# -*- mode: dockerfile -*- -# dockerfile to test pip installation on GPU with CUDA 7.5 CuDNN 5.1 - -FROM nvidia/cuda:7.5-cudnn5-devel diff --git a/tests/ci_build/pip_tests/Dockerfile.in.pip_cu80 b/tests/ci_build/pip_tests/Dockerfile.in.pip_cu80 deleted file mode 100644 index 714ba3cf325c..000000000000 --- a/tests/ci_build/pip_tests/Dockerfile.in.pip_cu80 +++ /dev/null @@ -1,4 +0,0 @@ -# -*- mode: dockerfile -*- -# dockerfile to test pip installation on GPU with CUDA 8.0 CuDNN 5.1 - -FROM nvidia/cuda:8.0-cudnn5-devel diff --git a/tests/ci_build/pip_tests/Dockerfile.pip_dependencies b/tests/ci_build/pip_tests/Dockerfile.pip_dependencies deleted file mode 100644 index 0698633ac57c..000000000000 --- a/tests/ci_build/pip_tests/Dockerfile.pip_dependencies +++ /dev/null @@ -1,14 +0,0 @@ -# -*- mode: dockerfile -*- -# part of the dockerfile to test pip installations - -# add repo to install different Python versions -RUN apt-get update && apt-get install -y software-properties-common -RUN add-apt-repository ppa:deadsnakes/ppa && apt-get update -RUN apt-get install -y python python2.7 python3.4 python3.5 python3.6 - -# install other dependencies -RUN apt-get install -y wget git unzip gcc -RUN apt-get install -y libgfortran3 - -# install virtualenv -RUN wget https://bootstrap.pypa.io/get-pip.py && python get-pip.py && pip install virtualenv && rm -rf get-pip.py diff --git a/tests/cpp/misc/serialization.cc b/tests/cpp/misc/serialization.cc new file mode 100644 index 000000000000..96f8b6c3a3a7 --- /dev/null +++ b/tests/cpp/misc/serialization.cc @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */
+
+#include <gtest/gtest.h>
+#include <../../../src/common/serialization.h>
+
+using namespace mxnet;
+using namespace std;
+
+/*
+ * Test that the data structures used are properly serialized and deserialized
+ */
+
+TEST(SerializerTest, InputMapCorrect) {
+  std::map<std::string, int32_t> input_map;
+  input_map.emplace("input_0", 2);
+  input_map.emplace("another_input", 0);
+  input_map.emplace("last_input", 1);
+  std::string serialized_data;
+  common::Serialize(input_map, &serialized_data);
+  std::map<std::string, int32_t> deserialized_input_map;
+  common::Deserialize(&deserialized_input_map, serialized_data);
+  ASSERT_EQ(input_map.size(), deserialized_input_map.size());
+  for (auto& p : input_map) {
+    auto it = deserialized_input_map.find(p.first);
+    ASSERT_NE(it, deserialized_input_map.end());
+    ASSERT_EQ(it->second, p.second);
+  }
+}
+
+TEST(SerializerTest, OutputMapCorrect) {
+  std::map<std::string, std::tuple<uint32_t, TShape, int, int>> output_map;
+  output_map.emplace("output_0", std::make_tuple(1, TShape({23, 12, 63, 432}), 0, 1));
+  output_map.emplace("another_output", std::make_tuple(2, TShape({23, 123}), 14, -23));
+  output_map.emplace("last_output", std::make_tuple(0, TShape({0}), -1, 0));
+  std::string serialized_data;
+  common::Serialize(output_map, &serialized_data);
+  std::map<std::string, std::tuple<uint32_t, TShape, int, int>> deserialized_output_map;
+  common::Deserialize(&deserialized_output_map, serialized_data);
+  ASSERT_EQ(output_map.size(), deserialized_output_map.size());
+  for (auto& p : output_map) {
+    auto it = deserialized_output_map.find(p.first);
+    ASSERT_NE(it, deserialized_output_map.end());
+    auto lhs = it->second;
+    auto rhs = p.second;
+    ASSERT_EQ(std::get<0>(lhs), std::get<0>(rhs));
+    ASSERT_EQ(std::get<1>(lhs), std::get<1>(rhs));
+    ASSERT_EQ(std::get<2>(lhs), std::get<2>(rhs));
+    ASSERT_EQ(std::get<3>(lhs), std::get<3>(rhs));
+  }
+}
+
diff --git a/tests/cpp/operator/mkldnn.cc b/tests/cpp/operator/mkldnn.cc
index 4f8f6f1f66a0..59bd3a547b72 100644
--- a/tests/cpp/operator/mkldnn.cc
+++ b/tests/cpp/operator/mkldnn.cc
@@ -1308,6 +1308,7 @@ TEST(MKLDNN_BASE, MKLDNNSum) {
     auto input_mem = in_arr.arr.GetMKLDNNData();
     auto input_mem2 = in_arr2.arr.GetMKLDNNData();
     NDArrayAttrs orig_arr(in_arr.arr.Copy(in_arr.arr.ctx()), "In Place Copy");
+    orig_arr.arr.WaitToRead();
     PrintVerifyMsg(orig_arr, in_arr);
     InitMKLDNNArray(&orig_arr.arr, input_mem->get_primitive_desc());
     orig_arr.arr.CopyFrom(*input_mem);
diff --git a/tests/nightly/Jenkinsfile b/tests/nightly/Jenkinsfile
index 173a33ab4881..35996fbcd32d 100755
--- a/tests/nightly/Jenkinsfile
+++ b/tests/nightly/Jenkinsfile
@@ -15,76 +15,37 @@
 // KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations
 // under the License.
-
-
+//
 //This is a Jenkinsfile for nightly tests. The format and some functions have been picked up from the top-level Jenkinsfile
The format and some functions have been picked up from the top-level Jenkinsfile -err = null mx_lib = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/nnvm/lib/libnnvm.a' -// pack libraries for later use -def pack_lib(name, libs=mx_lib) { - sh """ -echo "Packing ${libs} into ${name}" -echo ${libs} | sed -e 's/,/ /g' | xargs md5sum -""" - stash includes: libs, name: name -} - -// unpack libraries saved before -def unpack_lib(name, libs=mx_lib) { - unstash name - sh """ -echo "Unpacked ${libs} from ${name}" -echo ${libs} | sed -e 's/,/ /g' | xargs md5sum -""" -} - -def init_git() { - deleteDir() - retry(5) { - try { - timeout(time: 15, unit: 'MINUTES') { - checkout scm - sh 'git submodule update --init --recursive' - sh 'git clean -d -f' - } - } catch (exc) { - deleteDir() - error "Failed to fetch source codes with ${exc}" - sleep 2 - } - } -} - -def docker_run(platform, function_name, use_nvidia, shared_mem = '500m') { - def command = "ci/build.py --docker-registry ${env.DOCKER_CACHE_REGISTRY} %USE_NVIDIA% --platform %PLATFORM% --docker-build-retries 3 --shm-size %SHARED_MEM% /work/runtime_functions.sh %FUNCTION_NAME%" - command = command.replaceAll('%USE_NVIDIA%', use_nvidia ? '--nvidiadocker' : '') - command = command.replaceAll('%PLATFORM%', platform) - command = command.replaceAll('%FUNCTION_NAME%', function_name) - command = command.replaceAll('%SHARED_MEM%', shared_mem) - - sh command +node('mxnetlinux-cpu') { + // Loading the utilities requires a node context unfortunately + checkout scm + utils = load('ci/Jenkinsfile_utils.groovy') } +utils.assign_node_labels(linux_cpu: 'mxnetlinux-cpu', linux_gpu: 'mxnetlinux-gpu', linux_gpu_p3: 'mxnetlinux-gpu-p3', windows_cpu: 'mxnetwindows-cpu', windows_gpu: 'mxnetwindows-gpu') -try { +utils.main_wrapper( +core_logic: { stage('NightlyTests'){ parallel 'CompilationWarnings: CPU': { - node('mxnetlinux-cpu') { + node(NODE_LINUX_CPU) { ws('workspace/nt-compilationTest') { - init_git() - docker_run('ubuntu_nightly_cpu', 'nightly_test_compilation_warning', false) + utils.init_git() + utils.docker_run('ubuntu_nightly_cpu', 'nightly_test_compilation_warning', false) } } }, 'InstallationGuide: CPU': { - node('mxnetlinux-cpu') { + node(NODE_LINUX_CPU) { ws('workspace/nt-Installation-cpu') { - init_git() + utils.init_git() //Some install guide tests are currently diabled and tracked here: //1. https://github.com/apache/incubator-mxnet/issues/11369 //2. https://github.com/apache/incubator-mxnet/issues/11288 - docker_run('ubuntu_base_cpu', 'nightly_test_installation ubuntu_python_cpu_virtualenv', false) + utils.docker_run('ubuntu_base_cpu', 'nightly_test_installation ubuntu_python_cpu_virtualenv', false) //docker_run('ubuntu_base_cpu', 'nightly_test_installation ubuntu_python_cpu_pip', false) //docker_run('ubuntu_base_cpu', 'nightly_test_installation ubuntu_python_cpu_docker', false) //docker_run('ubuntu_base_cpu', 'nightly_test_installation ubuntu_python_cpu_source', false) @@ -92,90 +53,82 @@ try { } }, 'InstallationGuide: GPU': { - node('mxnetlinux-gpu') { + node(NODE_LINUX_GPU) { ws('workspace/nt-Installation-gpu') { - init_git() + utils.init_git() //Some install guide tests are currently diabled and tracked here: //1. https://github.com/apache/incubator-mxnet/issues/11369 //2. 
https://github.com/apache/incubator-mxnet/issues/11288 - docker_run('ubuntu_base_gpu', 'nightly_test_installation ubuntu_python_gpu_virtualenv', true) + utils.docker_run('ubuntu_base_gpu', 'nightly_test_installation ubuntu_python_gpu_virtualenv', true) //docker_run('ubuntu_base_gpu', 'nightly_test_installation ubuntu_python_gpu_pip', true) //docker_run('ubuntu_base_gpu', 'nightly_test_installation ubuntu_python_gpu_docker', true) - docker_run('ubuntu_base_gpu', 'nightly_test_installation ubuntu_python_gpu_source', true) + utils.docker_run('ubuntu_base_gpu', 'nightly_test_installation ubuntu_python_gpu_source', true) } } }, 'PipTest: GPU': { - node('mxnetlinux-gpu') { + node(NODE_LINUX_GPU) { ws('workspace/nt-pipTest') { - init_git() + utils.init_git() } } }, 'Amalgamation-atlas: CPU': { - node('mxnetlinux-cpu') { + node(NODE_LINUX_CPU) { ws('workspace/nt-amalgamation1') { - init_git() - docker_run('ubuntu_nightly_cpu', 'nightly_test_amalgamation USE_BLAS=atlas', false) + utils.init_git() + utils.docker_run('ubuntu_nightly_cpu', 'nightly_test_amalgamation USE_BLAS=atlas', false) } } }, 'Amalgamation-atlas-min: CPU': { - node('mxnetlinux-cpu') { + node(NODE_LINUX_CPU) { ws('workspace/nt-amalgamation2') { - init_git() - docker_run('ubuntu_nightly_cpu', 'nightly_test_amalgamation USE_BLAS=atlas MIN=1', false) + utils.init_git() + utils.docker_run('ubuntu_nightly_cpu', 'nightly_test_amalgamation USE_BLAS=atlas MIN=1', false) } } }, 'Amalgamation-atlas-mkl: CPU': { - node('mxnetlinux-cpu') { + node(NODE_LINUX_CPU) { ws('workspace/nt-amalgamation3') { - init_git() - docker_run('ubuntu_nightly_cpu', 'nightly_test_amalgamation USE_BLAS=atlas MSHADOW_USE_MKL=1', false) + utils.init_git() + utils.docker_run('ubuntu_nightly_cpu', 'nightly_test_amalgamation USE_BLAS=atlas MSHADOW_USE_MKL=1', false) } } }, 'Amalgamation-atlas-cuda: CPU': { - node('mxnetlinux-cpu') { + node(NODE_LINUX_CPU) { ws('workspace/nt-amalgamation4') { - init_git() - docker_run('ubuntu_nightly_cpu', 'nightly_test_amalgamation USE_BLAS=atlas MSHADOW_USE_CUDA=1', false) + utils.init_git() + utils.docker_run('ubuntu_nightly_cpu', 'nightly_test_amalgamation USE_BLAS=atlas MSHADOW_USE_CUDA=1', false) } } }, 'Amalgamation-atlas-openmp: CPU': { - node('mxnetlinux-cpu') { + node(NODE_LINUX_CPU) { ws('workspace/nt-amalgamation5') { - init_git() - docker_run('ubuntu_nightly_cpu', 'nightly_test_amalgamation USE_BLAS=atlas DISABLE_OPENMP=0', false) + utils.init_git() + utils.docker_run('ubuntu_nightly_cpu', 'nightly_test_amalgamation USE_BLAS=atlas DISABLE_OPENMP=0', false) } } }, 'MXNetJS: CPU': { - node('mxnetlinux-cpu') { + node(NODE_LINUX_CPU) { ws('workspace/nt-mxnetjs') { - init_git() - docker_run('ubuntu_nightly_cpu', 'nightly_test_javascript', false) + utils.init_git() + utils.docker_run('ubuntu_nightly_cpu', 'nightly_test_javascript', false) } } } } -} catch (caughtError) { - node("mxnetlinux-cpu") { - sh "echo caught ${caughtError}" - err = caughtError - currentBuild.result = "FAILURE" - } -} finally { - node("mxnetlinux-cpu") { - // Only send email if nightly test failed - if (currentBuild.result == "FAILURE") { +} +, +failure_handler: { + // Only send email if nightly test failed + if (currentBuild.result == "FAILURE") { emailext body: 'Nightly tests for MXNet branch ${BRANCH_NAME} failed. 
Please view the build at ${BUILD_URL}', replyTo: '${EMAIL}', subject: '[NIGHTLY TEST FAILED] build ${BUILD_NUMBER}', to: '${EMAIL}' - } - // Remember to rethrow so the build is marked as failing - if (err) { - throw err - } } } +) + diff --git a/tests/nightly/JenkinsfileForBinaries b/tests/nightly/JenkinsfileForBinaries index 0b009d28a55a..63914b11cb46 100755 --- a/tests/nightly/JenkinsfileForBinaries +++ b/tests/nightly/JenkinsfileForBinaries @@ -15,66 +15,28 @@ // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. - +// //This is a Jenkinsfile for nightly tests. The format and some functions have been picked up from the top-level Jenkinsfile -err = null mx_lib = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a' -// pack libraries for later use -def pack_lib(name, libs=mx_lib) { - sh """ -echo "Packing ${libs} into ${name}" -echo ${libs} | sed -e 's/,/ /g' | xargs md5sum -""" - stash includes: libs, name: name -} - -// unpack libraries saved before -def unpack_lib(name, libs=mx_lib) { - unstash name - sh """ -echo "Unpacked ${libs} from ${name}" -echo ${libs} | sed -e 's/,/ /g' | xargs md5sum -""" -} - -def init_git() { - deleteDir() - retry(5) { - try { - timeout(time: 15, unit: 'MINUTES') { - checkout scm - sh 'git submodule update --init --recursive' - sh 'git clean -d -f' - } - } catch (exc) { - deleteDir() - error "Failed to fetch source codes with ${exc}" - sleep 2 - } - } -} - -def docker_run(platform, function_name, use_nvidia, shared_mem = '500m') { - def command = "ci/build.py --docker-registry ${env.DOCKER_CACHE_REGISTRY} %USE_NVIDIA% --platform %PLATFORM% --docker-build-retries 3 --shm-size %SHARED_MEM% /work/runtime_functions.sh %FUNCTION_NAME%" - command = command.replaceAll('%USE_NVIDIA%', use_nvidia ? 
'--nvidiadocker' : '') - command = command.replaceAll('%PLATFORM%', platform) - command = command.replaceAll('%FUNCTION_NAME%', function_name) - command = command.replaceAll('%SHARED_MEM%', shared_mem) - - sh command +node('mxnetlinux-cpu') { + // Loading the utilities requires a node context unfortunately + checkout scm + utils = load('ci/Jenkinsfile_utils.groovy') } +utils.assign_node_labels(linux_cpu: 'mxnetlinux-cpu', linux_gpu: 'mxnetlinux-gpu', linux_gpu_p3: 'mxnetlinux-gpu-p3', windows_cpu: 'mxnetwindows-cpu', windows_gpu: 'mxnetwindows-gpu') -try { +utils.main_wrapper( +core_logic: { stage('Build') { parallel 'GPU: CUDA9.1+cuDNN7': { - node('mxnetlinux-cpu') { + node(NODE_LINUX_CPU) { ws('workspace/build-gpu') { - init_git() + utils.init_git() //sh "ci/build.py --platform ubuntu_build_cuda /work/runtime_functions.sh build_ubuntu_gpu_cuda91_cudnn7" - docker_run('ubuntu_build_cuda', 'build_ubuntu_gpu_cuda91_cudnn7', false) - pack_lib('gpu', mx_lib) + utils.docker_run('ubuntu_build_cuda', 'build_ubuntu_gpu_cuda91_cudnn7', false) + utils.pack_lib('gpu', mx_lib) } } } @@ -82,75 +44,65 @@ try { stage('NightlyTests'){ parallel 'ImageClassification: GPU': { - node('mxnetlinux-gpu') { + node(NODE_LINUX_GPU) { ws('workspace/nt-ImageClassificationTest') { - init_git() - unpack_lib('gpu', mx_lib) - docker_run('ubuntu_nightly_gpu', 'nightly_test_image_classification', true) + utils.init_git() + utils.unpack_lib('gpu', mx_lib) + utils.docker_run('ubuntu_nightly_gpu', 'nightly_test_image_classification', true) } } }, 'KVStore_SingleNode: GPU': { node('mxnetlinux-gpu-p3-8xlarge') { ws('workspace/nt-KVStoreTest') { - init_git() - unpack_lib('gpu', mx_lib) - docker_run('ubuntu_nightly_gpu', 'nightly_test_KVStore_singleNode', true) + utils.init_git() + utils.unpack_lib('gpu', mx_lib) + utils.docker_run('ubuntu_nightly_gpu', 'nightly_test_KVStore_singleNode', true) } } }, 'StraightDope: Python2 Single-GPU': { - node('mxnetlinux-gpu-p3') { + node(NODE_LINUX_GPU_P3) { ws('workspace/straight_dope-single_gpu') { - init_git() - unpack_lib('gpu', mx_lib) - docker_run('ubuntu_nightly_gpu', 'nightly_straight_dope_python2_single_gpu_tests', true) + utils.init_git() + utils.unpack_lib('gpu', mx_lib) + utils.docker_run('ubuntu_nightly_gpu', 'nightly_straight_dope_python2_single_gpu_tests', true) } } }, 'StraightDope: Python2 Multi-GPU': { - node('mxnetlinux-gpu') { + node(NODE_LINUX_GPU) { ws('workspace/straight_dope-multi_gpu') { - init_git() - unpack_lib('gpu', mx_lib) - docker_run('ubuntu_nightly_gpu', 'nightly_straight_dope_python2_multi_gpu_tests', true) + utils.init_git() + utils.unpack_lib('gpu', mx_lib) + utils.docker_run('ubuntu_nightly_gpu', 'nightly_straight_dope_python2_multi_gpu_tests', true) } } }, 'StraightDope: Python3 Single-GPU': { - node('mxnetlinux-gpu-p3') { + node(NODE_LINUX_GPU_P3) { ws('workspace/straight_dope-single_gpu') { - init_git() - unpack_lib('gpu', mx_lib) - docker_run('ubuntu_nightly_gpu', 'nightly_straight_dope_python3_single_gpu_tests', true) + utils.init_git() + utils.unpack_lib('gpu', mx_lib) + utils.docker_run('ubuntu_nightly_gpu', 'nightly_straight_dope_python3_single_gpu_tests', true) } } }, 'StraightDope: Python3 Multi-GPU': { - node('mxnetlinux-gpu') { + node(NODE_LINUX_GPU) { ws('workspace/straight_dope-multi_gpu') { - init_git() - unpack_lib('gpu', mx_lib) - docker_run('ubuntu_nightly_gpu', 'nightly_straight_dope_python3_multi_gpu_tests', true) + utils.init_git() + utils.unpack_lib('gpu', mx_lib) + utils.docker_run('ubuntu_nightly_gpu', 
'nightly_straight_dope_python3_multi_gpu_tests', true) } } } } -} catch (caughtError) { - node("mxnetlinux-cpu") { - sh "echo caught ${caughtError}" - err = caughtError - currentBuild.result = "FAILURE" - } -} finally { - node("mxnetlinux-cpu") { - // Only send email if master failed - if (currentBuild.result == "FAILURE") { - emailext body: 'Nightly tests for MXNet branch ${BRANCH_NAME} failed. Please view the build at ${BUILD_URL}', replyTo: '${EMAIL}', subject: '[NIGHTLY TEST FAILED] build ${BUILD_NUMBER}', to: '${EMAIL}' - } - // Remember to rethrow so the build is marked as failing - if (err) { - throw err - } +} +, +failure_handler: { + if (currentBuild.result == "FAILURE") { + emailext body: 'Nightly tests for MXNet branch ${BRANCH_NAME} failed. Please view the build at ${BUILD_URL}', replyTo: '${EMAIL}', subject: '[NIGHTLY TEST FAILED] build ${BUILD_NUMBER}', to: '${EMAIL}' } } +) diff --git a/tests/nightly/apache_rat_license_check/README.md b/tests/nightly/apache_rat_license_check/README.md index 04def9176369..e8578a857224 100755 --- a/tests/nightly/apache_rat_license_check/README.md +++ b/tests/nightly/apache_rat_license_check/README.md @@ -14,7 +14,7 @@ The following commands can be used to run a Apache RAT check locally - Docker based 1-click-method: ``` -ci/build.py --platform ubuntu_rat /work/runtime_functions.sh nightly_test_rat_check +ci/build.py -p ubuntu_rat nightly_test_rat_check ``` Manual method: @@ -25,8 +25,8 @@ sudo apt-get install maven -y #>/dev/null #install svn sudo apt-get install subversion -y #>/dev/null -#download RAT -svn co http://svn.apache.org/repos/asf/creadur/rat/trunk/ #>/dev/null +#download RAT 0.12 version +svn co http://svn.apache.org/repos/asf/creadur/rat/tags/apache-rat-project-0.12-RC3/ #>/dev/null #cd into correct directory cd trunk @@ -38,5 +38,5 @@ mvn install #>/dev/null cd apache-rat/target #run Apache RAT check on the src -java -jar apache-rat-0.13-SNAPSHOT.jar -E -d +java -jar apache-rat-0.12.jar -E -d ``` diff --git a/tests/nightly/broken_link_checker_test/JenkinsfileForBLC b/tests/nightly/broken_link_checker_test/JenkinsfileForBLC index 912b65b9bbc4..782bf74c9ccc 100755 --- a/tests/nightly/broken_link_checker_test/JenkinsfileForBLC +++ b/tests/nightly/broken_link_checker_test/JenkinsfileForBLC @@ -15,49 +15,27 @@ // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. - - +// //This is a Jenkinsfile for the broken link checker test. -err = null - -def init_git() { - deleteDir() - retry(5) { - try { - timeout(time: 15, unit: 'MINUTES') { - checkout scm - sh 'git submodule update --init --recursive' - sh 'git clean -d -f' - } - } catch (exc) { - deleteDir() - error "Failed to fetch source codes with ${exc}" - sleep 2 - } - } -} - -def docker_run(platform, function_name, use_nvidia, shared_mem = '500m') { - def command = "ci/build.py --docker-registry ${env.DOCKER_CACHE_REGISTRY} %USE_NVIDIA% --platform %PLATFORM% --docker-build-retries 3 --shm-size %SHARED_MEM% /work/runtime_functions.sh %FUNCTION_NAME%" - command = command.replaceAll('%USE_NVIDIA%', use_nvidia ? 
'--nvidiadocker' : '') - command = command.replaceAll('%PLATFORM%', platform) - command = command.replaceAll('%FUNCTION_NAME%', function_name) - command = command.replaceAll('%SHARED_MEM%', shared_mem) - - sh command +node('mxnetlinux-cpu') { + // Loading the utilities requires a node context unfortunately + checkout scm + utils = load('ci/Jenkinsfile_utils.groovy') } +utils.assign_node_labels(linux_cpu: 'mxnetlinux-cpu', linux_gpu: 'mxnetlinux-gpu', linux_gpu_p3: 'mxnetlinux-gpu-p3', windows_cpu: 'mxnetwindows-cpu', windows_gpu: 'mxnetwindows-gpu') -try { +utils.main_wrapper( +core_logic: { stage('BLC'){ parallel 'BrokenLinkChecker: CPU': { - node('mxnetlinux-cpu') { + node(NODE_LINUX_CPU) { ws('workspace/brokenLinkChecker') { - timeout(time: 40, unit: 'MINUTES') { + timeout(time: 60, unit: 'MINUTES') { try { - init_git() + utils.init_git() sh 'aws s3 cp s3://mxnet-ci-prod-slave-data/url_list.txt ./tests/nightly/broken_link_checker_test/url_list.txt' - docker_run('ubuntu_blc', 'broken_link_checker', false) + utils.docker_run('ubuntu_blc', 'broken_link_checker', false) } finally { sh "echo Storing the new url_list.txt to S3 bucket" sh 'aws s3 cp ./tests/nightly/broken_link_checker_test/url_list.txt s3://mxnet-ci-prod-slave-data/url_list.txt' @@ -67,26 +45,17 @@ try { } } } -} catch (caughtError) { - node("mxnetlinux-cpu") { - sh "echo caught ${caughtError}" - err = caughtError - currentBuild.result = "FAILURE" - } -} finally { - node("mxnetlinux-cpu") { - // Only send email if nightly test failed - if (currentBuild.result == "FAILURE") { - emailext body: '''https://mxnet.incubator.apache.org broken link test summary: - | - |Please view the logs at ${BUILD_URL} - | - |${BUILD_LOG_EXCERPT, start="START - Broken links summary", end="END - Broken links summary"}''', - replyTo: '${EMAIL}', subject: '[BROKEN LINK CHECKER FAILED] Run ${BUILD_NUMBER}', to: '${EMAIL}' - } - // Remember to rethrow so the build is marked as failing - if (err) { - throw err - } +} +, +failure_handler: +{ + if (currentBuild.result == "FAILURE") { + emailext body: '''https://mxnet.incubator.apache.org broken link test summary: + | + |Please view the logs at ${BUILD_URL} + | + |${BUILD_LOG_EXCERPT, start="START - Broken links summary", end="END - Broken links summary"}''', + replyTo: '${EMAIL}', subject: '[BROKEN LINK CHECKER FAILED] Run ${BUILD_NUMBER}', to: '${EMAIL}' } } +) diff --git a/tests/nightly/model_backwards_compatibility_check/JenkinsfileForMBCC b/tests/nightly/model_backwards_compatibility_check/JenkinsfileForMBCC index 412d68d56ff3..9158b0486cd4 100644 --- a/tests/nightly/model_backwards_compatibility_check/JenkinsfileForMBCC +++ b/tests/nightly/model_backwards_compatibility_check/JenkinsfileForMBCC @@ -15,65 +15,26 @@ // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. - - +// //This is a Jenkinsfile for the model backwards compatibility checker. The format and some functions have been picked up from the top-level Jenkinsfile. 
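For context, the property this pipeline guards is that artifacts serialized by an older MXNet keep loading and predicting identically on the current build. A minimal Python sketch of that check (hypothetical checkpoint names; the real jobs train and upload models via the scripts invoked below):

```python
import mxnet as mx

# 'model-0000.params'/'model-symbol.json' written by an older MXNet release
sym, arg_params, aux_params = mx.model.load_checkpoint('model', 0)
mod = mx.mod.Module(symbol=sym, label_names=None)
mod.bind(data_shapes=[('data', (1, 3, 224, 224))], for_training=False)
mod.set_params(arg_params, aux_params)
mod.forward(mx.io.DataBatch([mx.nd.ones((1, 3, 224, 224))]))
print(mod.get_outputs()[0])  # compared against outputs recorded at save time
```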
-err = null mx_lib = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a' -def init_git() { - deleteDir() - retry(5) { - try { - timeout(time: 15, unit: 'MINUTES') { - checkout scm - sh 'git submodule update --init --recursive' - sh 'git clean -d -f' - } - } catch (exc) { - deleteDir() - error "Failed to fetch source codes with ${exc}" - sleep 2 - } - } -} - -// pack libraries for later use -def pack_lib(name, libs=mx_lib) { - sh """ -echo "Packing ${libs} into ${name}" -echo ${libs} | sed -e 's/,/ /g' | xargs md5sum -""" - stash includes: libs, name: name -} - -// unpack libraries saved before -def unpack_lib(name, libs=mx_lib) { - unstash name - sh """ -echo "Unpacked ${libs} from ${name}" -echo ${libs} | sed -e 's/,/ /g' | xargs md5sum -""" +node('restricted-mxnetlinux-cpu') { + // Loading the utilities requires a node context unfortunately + checkout scm + utils = load('ci/Jenkinsfile_utils.groovy') } +utils.assign_node_labels(linux_cpu: 'restricted-mxnetlinux-cpu', linux_gpu: 'restricted-mxnetlinux-gpu', linux_gpu_p3: 'restricted-mxnetlinux-gpu-p3', windows_cpu: 'restricted-mxnetwindows-cpu', windows_gpu: 'restricted-mxnetwindows-gpu') -def docker_run(platform, function_name, use_nvidia, shared_mem = '500m') { - def command = "ci/build.py --docker-registry ${env.DOCKER_CACHE_REGISTRY} %USE_NVIDIA% --platform %PLATFORM% --shm-size %SHARED_MEM% /work/runtime_functions.sh %FUNCTION_NAME%" - command = command.replaceAll('%USE_NVIDIA%', use_nvidia ? '--nvidiadocker' : '') - command = command.replaceAll('%PLATFORM%', platform) - command = command.replaceAll('%FUNCTION_NAME%', function_name) - command = command.replaceAll('%SHARED_MEM%', shared_mem) - - sh command -} - -try { +utils.main_wrapper( +core_logic: { stage('MBCC Train'){ - node('restricted-mxnetlinux-cpu') { + node(NODE_LINUX_CPU) { ws('workspace/modelBackwardsCompat') { - init_git() + utils.init_git() // Train models on older versions - docker_run('ubuntu_nightly_cpu', 'nightly_model_backwards_compat_train', false) + utils.docker_run('ubuntu_nightly_cpu', 'nightly_model_backwards_compat_train', false) // upload files to S3 here outside of the docker environment sh "./tests/nightly/model_backwards_compatibility_check/upload_models_to_s3.sh" } @@ -81,40 +42,32 @@ try { } stage('MXNet Build'){ - node('restricted-mxnetlinux-cpu') { + node(NODE_LINUX_CPU) { ws('workspace/build-cpu') { - init_git() - docker_run('ubuntu_cpu','build_ubuntu_cpu', false) - pack_lib('cpu', mx_lib) + utils.init_git() + utils.docker_run('ubuntu_cpu','build_ubuntu_cpu', false) + utils.pack_lib('cpu', mx_lib) } } } stage('MBCC Inference'){ - node('restricted-mxnetlinux-cpu') { + node(NODE_LINUX_CPU) { ws('workspace/modelBackwardsCompat') { - init_git() - unpack_lib('cpu', mx_lib) + utils.init_git() + utils.unpack_lib('cpu', mx_lib) // Perform inference on the latest version of MXNet - docker_run('ubuntu_nightly_cpu', 'nightly_model_backwards_compat_test', false) + utils.docker_run('ubuntu_nightly_cpu', 'nightly_model_backwards_compat_test', false) } } } -} catch (caughtError) { - node("restricted-mxnetlinux-cpu") { - sh "echo caught ${caughtError}" - err = caughtError - currentBuild.result = "FAILURE" - } -} finally { - node("restricted-mxnetlinux-cpu") { - // Only send email if model backwards compat test failed +} +, +failure_handler: { +// Only send email if model backwards compat test failed if (currentBuild.result == "FAILURE") { emailext body: 'Nightly tests for model backwards compatibity on MXNet branch : 
${BRANCH_NAME} failed. Please view the build at ${BUILD_URL}', replyTo: '${EMAIL}', subject: '[MODEL BACKWARDS COMPATIBILITY TEST FAILED] build ${BUILD_NUMBER}', to: '${EMAIL}' } - // Remember to rethrow so the build is marked as failing - if (err) { - throw err - } - } } +) + diff --git a/tests/nightly/mxnet_keras_integration_tests/assertion_util.py b/tests/nightly/mxnet_keras_integration_tests/assertion_util.py deleted file mode 100644 index eb3d3bd85fda..000000000000 --- a/tests/nightly/mxnet_keras_integration_tests/assertion_util.py +++ /dev/null @@ -1,79 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - - -from nose.tools import assert_true - -def assert_results(MACHINE_TYPE, IS_GPU, GPU_NUM, profile_output, CPU_BENCHMARK_RESULTS, GPU_1_BENCHMARK_RESULTS, GPU_2_BENCHMARK_RESULTS, GPU_4_BENCHMARK_RESULTS, GPU_8_BENCHMARK_RESULTS): - """ - Helps in asserting benchmarking results. - Compares actual output result in profile_output with expected result in - CPU_BENCHMARK_RESULTS if IS_GPU is True. - Else, compares with GPU_1_BENCHMARK_RESULTS, GPU_2_BENCHMARK_RESULTS - GPU_4_BENCHMARK_RESULTS and GPU_8_BENCHMARK_RESULTS. - - Uses keys - MODEL, TRAINING_TIME, MEM_CONSUMPTION, TRAIN_ACCURACY and TEST_ACCURACY - to fetch data from provided actual and expected results input map stated above. - """ - # Model type - model = profile_output['MODEL'] - - # Actual values. 
- actual_training_time = profile_output['TRAINING_TIME'] - actual_memory_consumption = profile_output['MEM_CONSUMPTION'] - actual_train_accuracy = profile_output['TRAIN_ACCURACY'] - actual_test_accuracy = profile_output['TEST_ACCURACY'] - - # Expected values - expected_training_time = 0.0 - expected_memory_consumption = 0.0 - expected_train_accuracy = 1.0 - expected_test_accuracy = 1.0 - - # Set right set of expected values based on current run type - if(IS_GPU): - if GPU_NUM == 1: - expected_training_time = GPU_1_BENCHMARK_RESULTS['TRAINING_TIME'] - expected_memory_consumption = GPU_1_BENCHMARK_RESULTS['MEM_CONSUMPTION'] - expected_train_accuracy = GPU_1_BENCHMARK_RESULTS['TRAIN_ACCURACY'] - expected_test_accuracy = GPU_1_BENCHMARK_RESULTS['TEST_ACCURACY'] - elif GPU_NUM == 2: - expected_training_time = GPU_2_BENCHMARK_RESULTS['TRAINING_TIME'] - expected_memory_consumption = GPU_2_BENCHMARK_RESULTS['MEM_CONSUMPTION'] - expected_train_accuracy = GPU_2_BENCHMARK_RESULTS['TRAIN_ACCURACY'] - expected_test_accuracy = GPU_2_BENCHMARK_RESULTS['TEST_ACCURACY'] - elif GPU_NUM == 4: - expected_training_time = GPU_4_BENCHMARK_RESULTS['TRAINING_TIME'] - expected_memory_consumption = GPU_4_BENCHMARK_RESULTS['MEM_CONSUMPTION'] - expected_train_accuracy = GPU_4_BENCHMARK_RESULTS['TRAIN_ACCURACY'] - expected_test_accuracy = GPU_4_BENCHMARK_RESULTS['TEST_ACCURACY'] - elif GPU_NUM == 8: - expected_training_time = GPU_8_BENCHMARK_RESULTS['TRAINING_TIME'] - expected_memory_consumption = GPU_8_BENCHMARK_RESULTS['MEM_CONSUMPTION'] - expected_train_accuracy = GPU_8_BENCHMARK_RESULTS['TRAIN_ACCURACY'] - expected_test_accuracy = GPU_8_BENCHMARK_RESULTS['TEST_ACCURACY'] - else: - expected_training_time = CPU_BENCHMARK_RESULTS['TRAINING_TIME'] - expected_memory_consumption = CPU_BENCHMARK_RESULTS['MEM_CONSUMPTION'] - expected_train_accuracy = CPU_BENCHMARK_RESULTS['TRAIN_ACCURACY'] - expected_test_accuracy = CPU_BENCHMARK_RESULTS['TEST_ACCURACY'] - - # Validate Results - assert_true(actual_training_time < expected_training_time,'{0} on {1} machine with {2} GPU usage FAILED. Expected Training Time - {3} secs but was {4} secs.'.format(model, MACHINE_TYPE, GPU_NUM, expected_training_time, actual_training_time)) - assert_true(actual_memory_consumption < expected_memory_consumption, '{0} on {1} machine with {2} GPU usage FAILED. Expected Mem Consumption - {3} MB but was {4} MB.'.format(model, MACHINE_TYPE, GPU_NUM, expected_memory_consumption, actual_memory_consumption)) - assert_true(actual_train_accuracy > expected_train_accuracy, '{0} on {1} machine with {2} GPU usage FAILED. Expected Train Accuracy - {3} but was {4}.'.format(model, MACHINE_TYPE, GPU_NUM, expected_train_accuracy, actual_train_accuracy)) - assert_true(actual_test_accuracy > expected_test_accuracy, '{0} on {1} machine with {2} GPU usage FAILED. Expected Test Accuracy - {3} but was {4}.'.format(model, MACHINE_TYPE, GPU_NUM, expected_test_accuracy, actual_test_accuracy)) diff --git a/tests/nightly/mxnet_keras_integration_tests/model_util.py b/tests/nightly/mxnet_keras_integration_tests/model_util.py deleted file mode 100644 index bb9d6374af8f..000000000000 --- a/tests/nightly/mxnet_keras_integration_tests/model_util.py +++ /dev/null @@ -1,68 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import os -from keras import backend as K -from keras.models import Model -from keras.layers import Input, merge -from keras.layers.core import Lambda - -# Before running the integration tests, users are expected to set these -# environment variables. -IS_GPU = (os.environ['MXNET_KERAS_TEST_MACHINE'] == 'GPU') -GPU_NUM = int(os.environ['GPU_NUM']) if IS_GPU else 0 -KERAS_BACKEND = os.environ['KERAS_BACKEND'] - -def slice_batch(x, n_gpus, part): - sh = K.shape(x) - L = sh[0] / n_gpus - if part == n_gpus - 1: - return x[part*L:] - return x[part*L:(part+1)*L] - -def prepare_gpu_model(model, **kwargs): - gpu_list = [] - for i in range(GPU_NUM): - gpu_list.append('gpu(%d)' % i) - if KERAS_BACKEND == 'mxnet': - kwargs['context'] = gpu_list - model.compile(**kwargs) - else: - model.compile(**kwargs) - -def prepare_cpu_model(model, **kwargs): - model.compile(**kwargs) - -def make_model(model, **kwargs): - """ - Compiles the Keras Model object for given backend type and machine type. - Use this function to write one Keras code and run it across different machine type. - - If environment variable - MXNET_KERAS_TEST_MACHINE is set to CPU, then Compiles - Keras Model for running on CPU. - - If environment variable - MXNET_KERAS_TEST_MACHINE is set to GPU, then Compiles - Keras Model running on GPU using number of GPUs equal to number specified in - GPU_NUM environment variable. - - Currently supports only MXNet as Keras backend. - """ - if(IS_GPU): - prepare_gpu_model(model, **kwargs) - else: - prepare_cpu_model(model, **kwargs) - return model diff --git a/tests/nightly/mxnet_keras_integration_tests/profiler.py b/tests/nightly/mxnet_keras_integration_tests/profiler.py deleted file mode 100644 index b0d39e19aa00..000000000000 --- a/tests/nightly/mxnet_keras_integration_tests/profiler.py +++ /dev/null @@ -1,113 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import os -import signal -import time -import csv -import subprocess -from memory_profiler import memory_usage - -IS_GPU = (os.environ['MXNET_KERAS_TEST_MACHINE'] == 'GPU') -GPU_NUM = int(os.environ['GPU_NUM']) if IS_GPU else 0 - -# This command is useful to fetch GPU memory consumption. 
-GPU_MONITOR_CMD = "nvidia-smi --query-gpu=index,memory.used --format=csv -lms 500 -f output.csv" - -def cpu_memory_profile(func_to_profile): - max_mem_usage = memory_usage(proc=(func_to_profile, ()), max_usage=True) - return max_mem_usage[0] - -def gpu_mem_profile(file_name): - row_count = 0 - # In MBs - max_mem_usage = 0 - with open(file_name, 'r') as csv_file: - csv_reader = csv.reader(csv_file) - last_line_broken = False - for row in csv_reader: - if row_count == 0: - row_count += 1 - continue - if len(row) < 2 or not 'MiB' in row[1]: - last_line_broken = True - row_count += 1 - row_count -= 1 - if row_count % GPU_NUM == 0 and last_line_broken: - row_count -= GPU_NUM - else: - row_count -= row_count % GPU_NUM - - with open(file_name, 'r') as csv_file: - csv_reader = csv.reader(csv_file) - current_usage = 0 - mem_recoder = [0] * GPU_NUM - row_num = 0 - for row in csv_reader: - if row_num == 0: - row_num += 1 - continue - mem_str = row[1].lstrip().rstrip()[:-4] - mem_num = float(mem_str) - current_usage += mem_num - mem_recoder[(row_num - 1) % GPU_NUM] += mem_num - if row_num % GPU_NUM == 0: - max_mem_usage = max(max_mem_usage, current_usage) - current_usage = 0 - row_num += 1 - if row_num > row_count: - break - row_num -= 1 - os.remove(file_name) - return max_mem_usage - -def profile(func_to_profile): - """ - This function helps in profile given func_to_profile for run-time and - memory consumption. - - Capable of profile for both GPU and CPU machine. - - Uses environment variable - IS_GPU to identify whether to profile for - CPU or GPU. - - returns: run_time, memory_usage - """ - run_time = 0; # Seconds - memory_usage = 0; # MBs - - # Choose nvidia-smi or memory_profiler for memory profiling for GPU and CPU - # machines respectively. - if(IS_GPU): - # Start time - For timing the runtime - start_time = time.time() - open('nvidia-smi-output.csv', 'a').close() - gpu_monitor_process = subprocess.Popen(GPU_MONITOR_CMD, - shell=True, preexec_fn=os.setsid) - func_to_profile() - end_time = time.time() - os.killpg(os.getpgid(gpu_monitor_process.pid), signal.SIGTERM) - run_time = end_time - start_time - memory_usage = gpu_mem_profile('nvidia-smi-output.csv') - else: - # Start time - For timing the runtime - start_time = time.time() - memory_usage = cpu_memory_profile(func_to_profile) - end_time = time.time() - run_time = end_time - start_time - - return run_time, memory_usage diff --git a/tests/nightly/mxnet_keras_integration_tests/test_mnist_mlp.py b/tests/nightly/mxnet_keras_integration_tests/test_mnist_mlp.py deleted file mode 100644 index 89bd2805ce78..000000000000 --- a/tests/nightly/mxnet_keras_integration_tests/test_mnist_mlp.py +++ /dev/null @@ -1,114 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -''' -This code is forked from https://github.com/fchollet/keras/blob/master/examples/mnist_mlp.py -and modified to use as MXNet-Keras integration testing for functionality and sanity performance -benchmarking. - -Trains a simple deep NN on the MNIST dataset. - -Gets to 98.40% test accuracy after 20 epochs -(there is *a lot* of margin for parameter tuning). -2 seconds per epoch on a K520 GPU. -''' - -from __future__ import print_function -import numpy as np -np.random.seed(1337) # for reproducibility - -from os import environ - -from keras.datasets import mnist -from keras.models import Sequential -from keras.layers.core import Dense, Dropout, Activation -from keras.optimizers import SGD -from keras.utils import np_utils - -# Imports for benchmarking -from profiler import profile -from model_util import make_model - -# Imports for assertions -from assertion_util import assert_results - -# Other environment variables -MACHINE_TYPE = environ['MXNET_KERAS_TEST_MACHINE'] -IS_GPU = (environ['MXNET_KERAS_TEST_MACHINE'] == 'GPU') -MACHINE_TYPE = 'GPU' if IS_GPU else 'CPU' -GPU_NUM = int(environ['GPU_NUM']) if IS_GPU else 0 - -# Expected Benchmark Numbers -CPU_BENCHMARK_RESULTS = {'TRAINING_TIME':550.0, 'MEM_CONSUMPTION':400.0, 'TRAIN_ACCURACY': 0.85, 'TEST_ACCURACY':0.85} -GPU_1_BENCHMARK_RESULTS = {'TRAINING_TIME':40.0, 'MEM_CONSUMPTION':200, 'TRAIN_ACCURACY': 0.85, 'TEST_ACCURACY':0.85} -# TODO: Fix Train and Test accuracy numbers in multiple gpu mode. Setting it to 0 for now to get whole integration set up done -GPU_2_BENCHMARK_RESULTS = {'TRAINING_TIME':45.0, 'MEM_CONSUMPTION':375, 'TRAIN_ACCURACY': 0.0, 'TEST_ACCURACY':0.0} -GPU_4_BENCHMARK_RESULTS = {'TRAINING_TIME':55.0, 'MEM_CONSUMPTION':750.0, 'TRAIN_ACCURACY': 0.0, 'TEST_ACCURACY':0.0} -GPU_8_BENCHMARK_RESULTS = {'TRAINING_TIME':100.0, 'MEM_CONSUMPTION':1800.0, 'TRAIN_ACCURACY': 0.0, 'TEST_ACCURACY':0.0} - -# Dictionary to store profiling output -profile_output = {} - -batch_size = 128 -nb_classes = 10 -nb_epoch = 20 - -# the data, shuffled and split between train and test sets -(X_train, y_train), (X_test, y_test) = mnist.load_data() - -X_train = X_train.reshape(60000, 784) -X_test = X_test.reshape(10000, 784) -X_train = X_train.astype('float32') -X_test = X_test.astype('float32') -X_train /= 255 -X_test /= 255 - -# convert class vectors to binary class matrices -Y_train = np_utils.to_categorical(y_train, nb_classes) -Y_test = np_utils.to_categorical(y_test, nb_classes) - -model = Sequential() -model.add(Dense(512, input_shape=(784,))) -model.add(Activation('relu')) -model.add(Dropout(0.2)) -model.add(Dense(512)) -model.add(Activation('relu')) -model.add(Dropout(0.2)) -model.add(Dense(10)) -model.add(Activation('softmax')) - -model.summary() -make_model(model, loss='categorical_crossentropy', optimizer=SGD(), metrics=['accuracy']) - -def train_model(): - history = model.fit(X_train, Y_train, - batch_size=batch_size, nb_epoch=nb_epoch, - verbose=1, validation_data=(X_test, Y_test)) - profile_output['TRAIN_ACCURACY'] = history.history['acc'][-1] - -def test_run(): - # Calling training and profile memory usage - profile_output["MODEL"] = "MNIST MLP" - run_time, memory_usage = profile(train_model) - - profile_output['TRAINING_TIME'] = float(run_time) - profile_output['MEM_CONSUMPTION'] = float(memory_usage) - - score = model.evaluate(X_test, Y_test, verbose=0) - profile_output["TEST_ACCURACY"] = score[1] - - assert_results(MACHINE_TYPE, IS_GPU, GPU_NUM, profile_output, CPU_BENCHMARK_RESULTS, GPU_1_BENCHMARK_RESULTS, 
GPU_2_BENCHMARK_RESULTS, GPU_4_BENCHMARK_RESULTS, GPU_8_BENCHMARK_RESULTS) diff --git a/tests/nightly/straight_dope/test_notebooks_multi_gpu.py b/tests/nightly/straight_dope/test_notebooks_multi_gpu.py index 2038ada3a8b2..ef07550bdf78 100644 --- a/tests/nightly/straight_dope/test_notebooks_multi_gpu.py +++ b/tests/nightly/straight_dope/test_notebooks_multi_gpu.py @@ -20,6 +20,7 @@ This file tests that the notebooks requiring multi GPUs run without warning or exception. """ +import logging import unittest from straight_dope_test_utils import _test_notebook from straight_dope_test_utils import _download_straight_dope_notebooks @@ -27,6 +28,7 @@ class StraightDopeMultiGpuTests(unittest.TestCase): @classmethod def setUpClass(self): + logging.basicConfig(level=logging.INFO) assert _download_straight_dope_notebooks() # Chapter 7 diff --git a/tests/nightly/straight_dope/test_notebooks_single_gpu.py b/tests/nightly/straight_dope/test_notebooks_single_gpu.py index ee7c94c80afc..fca49f43adee 100644 --- a/tests/nightly/straight_dope/test_notebooks_single_gpu.py +++ b/tests/nightly/straight_dope/test_notebooks_single_gpu.py @@ -21,6 +21,7 @@ warning or exception. """ import glob +import logging import re import os import unittest @@ -51,9 +52,9 @@ class StraightDopeSingleGpuTests(unittest.TestCase): @classmethod def setUpClass(self): + logging.basicConfig(level=logging.INFO) assert _download_straight_dope_notebooks() - def test_completeness(self): """ Make sure that every tutorial that isn't in the whitelist is considered for testing by this @@ -89,9 +90,8 @@ def test_linear_algebra(self): def test_probability(self): assert _test_notebook('chapter01_crashcourse/probability') - # TODO(vishaalk): Notebook contains the word 'Warning'. Needs to be updated to a synonym. - #def test_autograd(self): - # assert _test_notebook('chapter01_crashcourse/autograd') + def test_autograd(self): + assert _test_notebook('chapter01_crashcourse/autograd') # Chapter 2 @@ -113,9 +113,8 @@ def test_softmax_regression_gluon(self): def test_regularization_scratch(self): assert _test_notebook('chapter02_supervised-learning/regularization-scratch') - # TODO(vishaalk): Notebook does not appear to be JSON: '{\n "cells": [\n {\n "cell_type": "m.... - #def test_regularization_gluon(self): - # assert _test_notebook('chapter02_supervised-learning/regularization-gluon') + def test_regularization_gluon(self): + assert _test_notebook('chapter02_supervised-learning/regularization-gluon') def test_perceptron(self): assert _test_notebook('chapter02_supervised-learning/perceptron') @@ -258,9 +257,8 @@ def test_tree_lstm(self): def test_lds_scratch(self): assert _test_notebook('chapter12_time-series/lds-scratch') - # TODO(vishaalk): File doesn't appear to be valid JSON. - #def test_issm_scratch(self): - # assert _test_notebook('chapter12_time-series/issm-scratch') + def test_issm_scratch(self): + assert _test_notebook('chapter12_time-series/issm-scratch') # Chapter 14 @@ -273,7 +271,7 @@ def test_dcgan(self): def test_generative_adversarial_networks(self): assert _test_notebook('chapter14_generative-adversarial-networks/conditional') - # TODO(vishaalk): Notebook does not appear to be valid JSON. + # TODO(vishaalk): Investigate. 
#def test_pixel2pixel(self): # assert _test_notebook('chapter14_generative-adversarial-networks/pixel2pixel') diff --git a/tests/nightly/test_mxnet_keras_integration_cpu.sh b/tests/nightly/test_mxnet_keras_integration_cpu.sh deleted file mode 100755 index 95cc0d0760e2..000000000000 --- a/tests/nightly/test_mxnet_keras_integration_cpu.sh +++ /dev/null @@ -1,78 +0,0 @@ -#!/bin/sh - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -set -e -### Build MXNet with CPU support -echo "BUILD make" -cp ./make/config.mk . -echo "USE_CUDA=0" >> ./config.mk -echo "USE_CUDNN=0" >> ./config.mk -echo "USE_BLAS=openblas" >> ./config.mk -echo "ADD_CFLAGS += -I/usr/include/openblas" >> ./config.mk -echo "GTEST_PATH=/usr/local/gtest" >> ./config.mk -echo 'export PKG_CONFIG_PATH=/usr/local/lib/pkgconfig:$PKG_CONFIG_PATH' >> ~/.profile -echo 'export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH' >> ~/.profile -echo 'export JAVA_HOME=/usr/lib/jvm/java-1.8.0-openjdk-1.8.0.111-1.b15.25.amzn1.x86_64' >> ~/.profile -echo 'export JRE_HOME=/usr/lib/jvm/java-1.8.0-openjdk-1.8.0.111-1.b15.25.amzn1.x86_64/jre' >> ~/.profile -echo 'export PATH=$PATH:/apache-maven-3.3.9/bin/:/usr/bin:/usr/lib/jvm/java-1.8.0-openjdk-1.8.0.111-1.b15.25.amzn1.x86_64/bin:/usr/lib/jvm/java-1.8.0-openjdk-1.8.0.111-1.b15.25.amzn1.x86_64/jre/bin' >> ~/.profile -source ~/.profile -make clean -make -j 4 || exit -1 - -echo "BUILD python2 mxnet" -cd ./python -python setup.py install || exit 1 - -echo "BUILD python3 mxnet" -python3 setup.py install || exit 1 - -# Come out of Mxnet directory. -cd .. - -# Required for Keras installation -pip install pyyaml - -# If already exist remove and fork DMLC/keras and install. -# Note: This should eventually be replaced with pip install when mxnet backend is part of fchollet/keras - -########### Set up Keras #################### -echo "Installing Keras. This can take few minutes..." -# Clone keras repository from dmlc. This has mxnet backend implementated. -if [ -d "keras" ]; then - rm -rf keras/ -fi - -git clone https://github.com/dmlc/keras.git --recursive -cd keras -python setup.py install - -########### Set up packages for profiling ######### -echo "Installing memory_profile and psutil for profiling. This can take few minutes..." 
-pip install memory_profiler -pip install psutil - -########## Set Environment Variables ######## -echo "Setting Environment Variables for MXNet Keras Integration Tests on CPU machine" -export KERAS_BACKEND="mxnet" -export MXNET_KERAS_TEST_MACHINE='CPU' - -########## Call the test script ############ -cd ../../mxnet/tests/nightly -echo "Running MXNet Keras Integration Test on CPU machine" -nosetests --with-xunit --quiet --nologcapture mxnet_keras_integration_tests/ diff --git a/tests/nightly/test_mxnet_keras_integration_gpu.sh b/tests/nightly/test_mxnet_keras_integration_gpu.sh deleted file mode 100755 index 5d541fa5b7a4..000000000000 --- a/tests/nightly/test_mxnet_keras_integration_gpu.sh +++ /dev/null @@ -1,107 +0,0 @@ -#!/bin/sh - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -set -e - -### Install git -apt-get update -apt-get install git-all - -### Build MXNet with CPU support -echo "BUILD make" -cp ./make/config.mk . -echo "USE_CUDA=1" >> ./config.mk -echo "USE_CUDA_PATH=/usr/local/cuda" >> config.mk -echo "USE_CUDNN=1" >> ./config.mk -echo "USE_BLAS=openblas" >> ./config.mk -echo "ADD_CFLAGS += -I/usr/include/openblas" >> ./config.mk -echo "GTEST_PATH=/usr/local/gtest" >> ./config.mk -export PKG_CONFIG_PATH=/usr/local/lib/pkgconfig:$PKG_CONFIG_PATH -export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH -export JAVA_HOME=/usr/lib/jvm/java-1.8.0-openjdk-1.8.0.111-1.b15.25.amzn1.x86_64 -export JRE_HOME=/usr/lib/jvm/java-1.8.0-openjdk-1.8.0.111-1.b15.25.amzn1.x86_64/jre -export PATH=$PATH:/apache-maven-3.3.9/bin/:/usr/bin:/usr/lib/jvm/java-1.8.0-openjdk-1.8.0.111-1.b15.25.amzn1.x86_64/bin:/usr/lib/jvm/java-1.8.0-openjdk-1.8.0.111-1.b15.25.amzn1.x86_64/jre/bin - -make clean -make -j 4 || exit -1 - -echo "BUILD python2 mxnet" -cd ./python -python setup.py install || exit 1 - -echo "BUILD python3 mxnet" -python3 setup.py install || exit 1 - -# Come out of MXNet directory -cd .. - -# Dependencies required for Keras installation -pip install pyyaml - -pip install --upgrade pip -pip install --upgrade six - -# If already exist remove and fork DMLC/keras and install. -# Note: This should eventually be replaced with pip install when mxnet backend is part of fchollet/keras - -########### Set up Keras #################### -echo "Installing Keras. This can take few minutes..." -# Clone keras repository from dmlc. This has mxnet backend implementated. -if [ -d "keras" ]; then - rm -rf keras/ -fi - -git clone https://github.com/dmlc/keras.git --recursive -cd keras -python setup.py install - -########### Set up packages for profiling ######### -echo "Installing memory_profile and psutil for profiling. This can take few minutes..." 
-pip install memory_profiler -pip install psutil - -########## Set Environment Variables ######## -echo "Setting Environment Variables for MXNet Keras Integration Tests on CPU machine" -cd ../../mxnet/tests/nightly - -export KERAS_BACKEND="mxnet" -export MXNET_KERAS_TEST_MACHINE='GPU' -########## Call the test script with 1 GPUS ############ - -export GPU_NUM='1' -echo "Running MXNet Keras Integration Test on GPU machine with 1 GPUs" -nosetests --with-xunit --quiet --nologcapture mxnet_keras_integration_tests/ - -########## Call the test script with 2 GPUS ############ - -export GPU_NUM='2' -echo "Running MXNet Keras Integration Test on GPU machine with 2 GPUs" -nosetests --with-xunit --quiet --nologcapture mxnet_keras_integration_tests/ - -########## Call the test script with 4 GPUS ############ - -export GPU_NUM='4' -echo "Running MXNet Keras Integration Test on GPU machine with 4 GPUs" -nosetests --with-xunit --quiet --nologcapture mxnet_keras_integration_tests/ - -########## Call the test script with 8 GPUS ############ - -export GPU_NUM='8' -echo "Running MXNet Keras Integration Test on GPU machine with 8 GPUs" -nosetests --with-xunit --quiet --nologcapture mxnet_keras_integration_tests/ diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py index 3d799aa5319b..5612b0a647ed 100644 --- a/tests/python/gpu/test_operator_gpu.py +++ b/tests/python/gpu/test_operator_gpu.py @@ -127,7 +127,7 @@ def check_ifft(shape): init_complex.real[:,i] = init[0][:,2*i] init_complex.imag[:,i] = init[0][:,2*i+1] a = np.fft.ifft(init_complex, n=None, axis=-1, norm=None) - assert_almost_equal(a.real, out1[0]/shape_old[1],rtol=1e-3, atol=1e-12) + assert_almost_equal(a.real, out1[0]/shape_old[1],rtol=1e-3, atol=1e-5) if len(shape) == 4: init_complex = np.zeros(shape_old,dtype = np.complex64) @@ -135,7 +135,7 @@ def check_ifft(shape): init_complex.real[:,:,:,i] = init[0][:,:,:,2*i] init_complex.imag[:,:,:,i] = init[0][:,:,:,2*i+1] a = np.fft.ifft(init_complex, n=None, axis=-1, norm=None) - assert_almost_equal(a.real, out1[0]/shape_old[3],rtol=1e-3, atol=1e-12) + assert_almost_equal(a.real, out1[0]/shape_old[3],rtol=1e-3, atol=1e-5) # backward if len(shape) == 2: out_grad = mx.nd.empty(shape_old) @@ -148,7 +148,7 @@ def check_ifft(shape): temp[:,i] = exe.grad_arrays[0].asnumpy()[:,2*i] a = np.fft.fft(out_grad.asnumpy(), n=None, axis=-1, norm=None) - assert_almost_equal(a.real, temp, rtol=1e-3, atol=1e-12) + assert_almost_equal(a.real, temp, rtol=1e-3, atol=1e-5) if len(shape) == 4: out_grad = mx.nd.empty(shape_old) out_grad[:] = np.random.normal(-3, 3, shape_old) @@ -160,9 +160,9 @@ def check_ifft(shape): temp[:,:,:,i] = exe.grad_arrays[0].asnumpy()[:,:,:,2*i] a = np.fft.fft(out_grad.asnumpy(), n=None, axis=-1, norm=None) - assert_almost_equal(a.real, temp, rtol=1e-3, atol=1e-12) + assert_almost_equal(a.real, temp, rtol=1e-3, atol=1e-5) -@with_seed(0) +@with_seed() def test_ifft(): nrepeat = 2 maxdim = 10 @@ -194,7 +194,7 @@ def check_fft(shape): for exe in exe_list: for arr, iarr in zip(exe.arg_arrays, init): arr[:] = iarr.astype(arr.dtype) - #forward + # forward for exe in exe_list: exe.forward(is_train=True) out1 = [exe.outputs[0].asnumpy() for exe in exe_list] @@ -221,7 +221,7 @@ def check_fft(shape): a[i,j,:,p+1] = out2[i,j+out1[0].shape[1],:,k] p = p+2 - assert_almost_equal(a, out1[0],rtol=1e-3, atol=1e-6) + assert_almost_equal(a, out1[0],rtol=1e-3, atol=1e-5) # backward if len(shape) == 2: @@ -235,7 +235,7 @@ def check_fft(shape): for exe in exe_list: 
exe.backward([out_grad]) a = np.fft.ifft(out_grad_complex, n=None, axis=-1, norm=None) - assert_almost_equal(a.real, exe.grad_arrays[0].asnumpy()/shape[1],rtol=1e-3, atol=1e-8) + assert_almost_equal(a.real, exe.grad_arrays[0].asnumpy()/shape[1],rtol=1e-3, atol=1e-5) if len(shape) == 4: out_grad = mx.nd.empty(out1[0].shape) @@ -248,9 +248,9 @@ def check_fft(shape): for exe in exe_list: exe.backward([out_grad]) a = np.fft.ifft(out_grad_complex, n=None, axis=-1, norm=None) - assert_almost_equal(a.real, exe.grad_arrays[0].asnumpy()/shape[3],rtol=1e-3, atol=1e-6) + assert_almost_equal(a.real, exe.grad_arrays[0].asnumpy()/shape[3],rtol=1e-3, atol=1e-5) -@with_seed(0) +@with_seed() def test_fft(): nrepeat = 2 maxdim = 10 @@ -614,13 +614,13 @@ def test_pooling_with_type(): {'ctx': mx.cpu(0), 'pool_data': (2, 2, 10, 10), 'type_dict': {'pool_data': np.float64}}, {'ctx': mx.cpu(0), 'pool_data': (2, 2, 10, 10), 'type_dict': {'pool_data': np.float32}}] sym = mx.sym.Pooling(kernel=(3,3), pool_type='max', pooling_convention='valid', name='pool') - check_consistency(sym, ctx_list) + check_consistency(sym, ctx_list, rand_type=np.float16) sym = mx.sym.Pooling(kernel=(3,3), pool_type='max', pooling_convention='full', name='pool') - check_consistency(sym, ctx_list) + check_consistency(sym, ctx_list, rand_type=np.float16) sym = mx.sym.Pooling(kernel=(300,300), pool_type='max', global_pool=True, name='pool') - check_consistency(sym, ctx_list) + check_consistency(sym, ctx_list, rand_type=np.float16) @with_seed() @@ -765,11 +765,8 @@ def test_spatial_transformer_with_type(): check_consistency(sym, ctx_list, grad_req="add") -# Checking max pooling consistency over the data sets of different float types is problematic -# as one max value in a float32 data set may not be the max value in a float16 data set. -# This function will not be called. 
-@with_seed(1234) -def test_pooling_with_type(): +@with_seed() +def test_pooling_with_type2(): ctx_list = [{'ctx': mx.gpu(0), 'pool_data': (10, 2, 10, 10), 'type_dict': {'pool_data': np.float64}}, {'ctx': mx.gpu(0), 'pool_data': (10, 2, 10, 10), 'type_dict': {'pool_data': np.float32}}, {'ctx': mx.gpu(0), 'pool_data': (10, 2, 10, 10), 'type_dict': {'pool_data': np.float16}}, @@ -777,19 +774,17 @@ def test_pooling_with_type(): {'ctx': mx.cpu(0), 'pool_data': (10, 2, 10, 10), 'type_dict': {'pool_data': np.float32}}] sym = mx.sym.Pooling(name='pool', kernel=(3,3), stride=(2,2), pool_type='max') - check_consistency(sym, ctx_list) + check_consistency(sym, ctx_list, rand_type=np.float16) sym = mx.sym.Pooling(name='pool', kernel=(3,3), pad=(1,1), pool_type='avg') check_consistency(sym, ctx_list) - # this is unstable - # sym = mx.sym.Pooling(name='pool', kernel=(5,5), pad=(2,2), pool_type='max') - # check_consistency(sym, ctx_list) + sym = mx.sym.Pooling(name='pool', kernel=(5,5), pad=(2,2), pool_type='max') + check_consistency(sym, ctx_list, rand_type=np.float16) sym = mx.sym.Pooling(name='pool', kernel=(3,3), pad=(1,1), pool_type='sum') check_consistency(sym, ctx_list) - @unittest.skip("Flaky test https://github.com/apache/incubator-mxnet/issues/11517") @with_seed() def test_pooling_versions(): @@ -1445,7 +1440,7 @@ def test_unfuse(): check_rnn_consistency(stack, fused) -@with_seed(1234) +@with_seed() def test_psroipooling_with_type(): arg_params = { 'psroipool_rois': np.array([[0, 10, 22, 161, 173], [0, 20, 15, 154, 160]])} @@ -1470,8 +1465,12 @@ def test_psroipooling_with_type(): 'psroipool_rois': 'null'}, arg_params=arg_params) -@with_seed(1234) +@with_seed() def test_deformable_psroipooling_with_type(): + tol = {np.dtype(np.float32): 1e-1, + np.dtype(np.float64): 1e-3, + np.dtype(np.float16): 1e-2} + arg_params = { 'deformable_psroipool_rois': np.array([[0, 10, 22, 161, 173], [0, 20, 15, 154, 160]])} @@ -1499,13 +1498,17 @@ def test_deformable_psroipooling_with_type(): 'deformable_psroipool_trans': np.float16}}, ] - check_consistency(sym, ctx_list, grad_req={'deformable_psroipool_data': 'write', - 'deformable_psroipool_rois': 'null', - 'deformable_psroipool_trans': 'write'}, arg_params=arg_params) + check_consistency(sym, ctx_list, scale=0.1, tol=tol, + grad_req={'deformable_psroipool_data': 'write', + 'deformable_psroipool_rois': 'null', + 'deformable_psroipool_trans': 'write'}, arg_params=arg_params) -@with_seed(1234) +@with_seed() def test_deformable_convolution_with_type(): + tol = {np.dtype(np.float32): 1e-1, + np.dtype(np.float64): 1e-3} + sym = mx.sym.contrib.DeformableConvolution(num_filter=3, kernel=(3,3), name='deformable_conv') # since atomicAdd does not support fp16 (which deformable conv uses in backward), we do not test fp16 here ctx_list = [{'ctx': mx.gpu(0), @@ -1521,18 +1524,14 @@ def test_deformable_convolution_with_type(): # 'deformable_conv_offset': (2, 18, 8, 8), # 'type_dict': {'deformable_conv_data': np.float16, 'deformable_conv_offset': np.float16}}, ] - # wider tolerance needed for true-fp16 NCHW test above - tol = {np.dtype(np.float16): 0.5, - np.dtype(np.float32): 1e-3, - np.dtype(np.float64): 1e-5, - np.dtype(np.uint8): 0, - np.dtype(np.int32): 0} - check_consistency(sym, ctx_list, tol=tol) + + check_consistency(sym, ctx_list, scale=0.1, tol=tol) # test ability to turn off training on bias - check_consistency(sym, ctx_list, grad_req={'deformable_conv_data': 'write', - 'deformable_conv_offset': 'write', - 'deformable_conv_weight': 'write', - 
'deformable_conv_bias': 'null'}, tol=tol) + check_consistency(sym, ctx_list, scale=0.1, tol=tol, + grad_req={'deformable_conv_data': 'write', + 'deformable_conv_offset': 'write', + 'deformable_conv_weight': 'write', + 'deformable_conv_bias': 'null'}) @with_seed() diff --git a/tests/python/mkl/test_mkldnn.py b/tests/python/mkl/test_mkldnn.py index ff9ba538b95e..03f3c76bb65b 100644 --- a/tests/python/mkl/test_mkldnn.py +++ b/tests/python/mkl/test_mkldnn.py @@ -26,6 +26,7 @@ from mxnet import gluon from mxnet.gluon import nn from mxnet.test_utils import * +import test_mkldnn_install as install curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) sys.path.append(os.path.join(curr_path, '../unittest/')) from common import with_seed @@ -261,4 +262,4 @@ def check_fullyconnected_training(stype): if __name__ == '__main__': - test_mkldnn_install() + install.test_mkldnn_install() diff --git a/tests/python/quantization/test_quantization.py b/tests/python/quantization/test_quantization.py index 08303c816af1..369a923c1879 100644 --- a/tests/python/quantization/test_quantization.py +++ b/tests/python/quantization/test_quantization.py @@ -57,7 +57,7 @@ def test_quantize_float32_to_int8(): assert same(min_val.asscalar(), -real_range) assert same(max_val.asscalar(), real_range) qdata_np = (np.sign(data_np) * np.minimum(np.abs(data_np) * scale + 0.5, quantized_range)).astype(np.int8) - assert same(qdata.asnumpy(), qdata_np) + assert_almost_equal(qdata.asnumpy(), qdata_np, atol = 1) @with_seed() @@ -77,7 +77,6 @@ def test_dequantize_int8_to_float32(): @with_seed() -@unittest.skip('Flaky test, tracked in: https://github.com/apache/incubator-mxnet/issues/11747') def test_requantize_int32_to_int8(): def quantized_int32_to_float(qdata, min_range, max_range): assert qdata.dtype == 'int32' @@ -121,7 +120,7 @@ def check_requantize(shape, min_calib_range=None, max_calib_range=None): max_range.asscalar(), min_calib_range=min_calib_range, max_calib_range=max_calib_range) - assert_almost_equal(qdata_int8.asnumpy(), qdata_int8_np) + assert_almost_equal(qdata_int8.asnumpy(), qdata_int8_np, atol = 1) assert_almost_equal(min_output.asnumpy(), np.array([min_output_np])) assert_almost_equal(max_output.asnumpy(), np.array([max_output_np])) @@ -397,6 +396,17 @@ def get_fp32_sym(): out_grad=False, preserve_shape=False, use_ignore=False, name='softmax') return sym +def get_fp32_residual(): + data = mx.sym.Variable('data') + conv = mx.sym.Convolution(data=data, num_filter=4, kernel=(1,1), pad=(0,0), + no_bias=True, name='conv') + bn = mx.sym.BatchNorm(data=conv, fix_gamma=False, eps=2e-5, momentum=0.9, name='bn') + act = mx.sym.Activation(data=bn + data, act_type='relu', name='relu') + pool = mx.sym.Pooling(act, kernel=(4, 4), pool_type='avg', name='pool') + fc = mx.sym.FullyConnected(pool, num_hidden=10, flatten=True, name='fc') + sym = mx.sym.SoftmaxOutput(fc, grad_scale=1, ignore_label=-1, multi_output=False, + out_grad=False, preserve_shape=False, use_ignore=False, name='softmax') + return sym @with_seed() def test_quantize_model(): @@ -464,6 +474,101 @@ def check_qsym_qdtype(qsym, qdtype): for qdtype in ['int8', 'uint8']: check_quantize_model(qdtype) +@with_seed() +def test_quantize_residual_unit(): + def check_quantize_model(qdtype): + if is_test_for_native_cpu(): + print('skipped testing quantized_residual_unit for native cpu since it is not supported yet') + return + elif qdtype == 'int8' and is_test_for_mkldnn(): + print('skipped testing quantized_residual_unit for mkldnn cpu int8 since it is not 
supported yet') + return + elif qdtype == 'uint8' and is_test_for_gpu(): + print('skipped testing quantized_residual_unit for gpu uint8 since it is not supported yet') + return + + def check_params(params, qparams, qsym=None): + if qsym is None: + assert len(params) == len(qparams) + for k, v in params.items(): + assert k in qparams + assert same(v.asnumpy(), qparams[k].asnumpy()) + else: + qparams_ground_truth = mx.contrib.quant._quantize_params(qsym, params) + assert len(qparams) == len(qparams_ground_truth) + for k, v in qparams_ground_truth.items(): + assert k in qparams + assert same(v.asnumpy(), qparams[k].asnumpy()) + + def check_qsym_calibrated(qsym): + attrs = qsym.attr_dict() + for k, v in attrs.items(): + if k.find('requantize_') != -1: + assert 'min_calib_range' in v + assert 'max_calib_range' in v + + def check_qsym_qdtype(qsym, qdtype): + attrs = qsym.attr_dict() + for k, v in attrs.items(): + if k.find('_quantize') != -1: + assert 'out_type' in v + assert v['out_type'] == qdtype + + def check_qsym_forward(qsym, qarg_params, qaux_params, data_shape, label_shape): + mod = mx.mod.Module(symbol=qsym, context=mx.current_context()) + mod.bind(for_training=False, + data_shapes=[('data', data_shape)], + label_shapes=[('softmax_label', label_shape)]) + mod.set_params(qarg_params, qaux_params) + data = [mx.random.uniform(-1.0, 1.0, shape=shape) for _, shape in mod.data_shapes] + batch = mx.io.DataBatch(data, []) + mod.forward(batch, is_train=False) + for output in mod.get_outputs(): + output.wait_to_read() + + + sym = get_fp32_residual() + mod = Module(symbol=sym) + batch_size = 4 + data_shape = (batch_size, 4, 10, 10) + label_shape = (batch_size, 10) + mod.bind(data_shapes=[('data', data_shape)], label_shapes=[('softmax_label', label_shape)]) + mod.init_params() + arg_params, aux_params = mod.get_params() + excluded_sym_names = [] + if mx.current_context() == mx.cpu(): + excluded_sym_names += ['fc'] + qsym, qarg_params, qaux_params = mx.contrib.quant.quantize_model(sym=sym, + arg_params=arg_params, + aux_params=aux_params, + excluded_sym_names=excluded_sym_names, + ctx=mx.current_context(), + quantized_dtype=qdtype, + calib_mode='none') + check_params(arg_params, qarg_params, qsym) + check_params(aux_params, qaux_params) + check_qsym_forward(qsym, qarg_params, qaux_params, data_shape, label_shape) + + calib_data = mx.nd.random.uniform(shape=data_shape) + calib_data = NDArrayIter(data=calib_data) + calib_data = DummyIter(calib_data) + qsym, qarg_params, qaux_params = mx.contrib.quant.quantize_model(sym=sym, + arg_params=arg_params, + aux_params=aux_params, + excluded_sym_names=excluded_sym_names, + ctx=mx.current_context(), + quantized_dtype=qdtype, + calib_mode='naive', + calib_data=calib_data, + num_calib_examples=20) + check_params(arg_params, qarg_params, qsym) + check_params(aux_params, qaux_params) + check_qsym_calibrated(qsym) + check_qsym_qdtype(qsym, qdtype) + check_qsym_forward(qsym, qarg_params, qaux_params, data_shape, label_shape) + + for qdtype in ['int8', 'uint8']: + check_quantize_model(qdtype) @with_seed() def test_quantize_sym_with_calib(): @@ -510,7 +615,6 @@ def test_optimal_threshold_adversarial_case(): @with_seed() -@unittest.skip("Flaky test: https://github.com/apache/incubator-mxnet/issues/11456") def test_get_optimal_thresholds(): # Given an ndarray with elements following a uniform distribution, the optimal threshold # for quantizing the ndarray should be either abs(min(nd)) or abs(max(nd)). 
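A note on the atol=1 slack used in the quantization asserts above: the NumPy reference is computed with round-half-away-from-zero (|x| * scale + 0.5), so the operator under test and the reference can legitimately land on adjacent int8 values for inputs near a rounding boundary. A minimal sketch of that reference computation in plain NumPy; quantize_to_int8 is an illustrative name, not part of the test suite:

import numpy as np

def quantize_to_int8(data, real_range):
    """Reference symmetric float32 -> int8 quantization, rounding half away from zero."""
    quantized_range = 127.0                 # symmetric int8 range
    scale = quantized_range / real_range    # float32 units -> int8 units
    return (np.sign(data) * np.minimum(np.abs(data) * scale + 0.5,
                                       quantized_range)).astype(np.int8)

x = np.array([-1.0, -0.5, 0.0, 0.49, 1.0], dtype=np.float32)
print(quantize_to_int8(x, real_range=1.0))  # -> [-127  -64    0   62  127]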
@@ -519,11 +623,11 @@ def get_threshold(nd): max_nd = mx.nd.max(nd) return mx.nd.maximum(mx.nd.abs(min_nd), mx.nd.abs(max_nd)).asnumpy() - nd_dict = {'layer1': mx.nd.uniform(low=-10.532, high=11.3432, shape=(8, 3, 23, 23))} + nd_dict = {'layer1': mx.nd.uniform(low=-10.532, high=11.3432, shape=(8, 3, 23, 23), dtype=np.float64)} expected_threshold = get_threshold(nd_dict['layer1']) th_dict = mx.contrib.quant._get_optimal_thresholds(nd_dict) assert 'layer1' in th_dict - assert_almost_equal(np.array([th_dict['layer1'][1]]), expected_threshold, rtol=0.001, atol=0.001) + assert_almost_equal(np.array([th_dict['layer1'][1]]), expected_threshold, rtol=1e-2, atol=1e-4) if __name__ == "__main__": diff --git a/tests/python/tensorrt/common.py b/tests/python/tensorrt/common.py new file mode 100644 index 000000000000..b37f8f3ff809 --- /dev/null +++ b/tests/python/tensorrt/common.py @@ -0,0 +1,30 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from ctypes.util import find_library + + +def check_tensorrt_installation(): + assert find_library('nvinfer') is not None, "Can't find the TensorRT shared library" + + +def merge_dicts(*dict_args): + """Merge arg_params and aux_params to populate shared_buffer""" + result = {} + for dictionary in dict_args: + result.update(dictionary) + return result diff --git a/tests/python/tensorrt/lenet5_common.py b/tests/python/tensorrt/lenet5_common.py new file mode 100644 index 000000000000..347d6f3c11ba --- /dev/null +++ b/tests/python/tensorrt/lenet5_common.py @@ -0,0 +1,31 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
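For context on merge_dicts in common.py above: it simply folds arg_params and aux_params into the single dict that tensorrt_bind expects as its shared buffer, with later arguments winning on key collisions because dict.update() is applied left to right. A minimal usage sketch (the parameter names and values below are illustrative placeholders, not real model parameters):

arg_params = {'fc1_weight': 1, 'fc1_bias': 2}   # illustrative placeholders
aux_params = {'bn0_moving_mean': 3}
all_params = merge_dicts(arg_params, aux_params)
assert all_params == {'fc1_weight': 1, 'fc1_bias': 2, 'bn0_moving_mean': 3}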
+ +import numpy as np +import mxnet as mx +from common import * + +def get_iters(mnist, batch_size): + """Get MNIST iterators.""" + train_iter = mx.io.NDArrayIter(mnist['train_data'], + mnist['train_label'], + batch_size, + shuffle=True) + val_iter = mx.io.NDArrayIter(mnist['test_data'], mnist['test_label'], batch_size) + test_iter = mx.io.NDArrayIter(mnist['test_data'], mnist['test_label'], batch_size) + all_test_labels = np.array(mnist['test_label']) + return train_iter, val_iter, test_iter, all_test_labels diff --git a/tests/python/tensorrt/lenet5_train.py b/tests/python/tensorrt/lenet5_train.py new file mode 100644 index 000000000000..8edd9abf70e7 --- /dev/null +++ b/tests/python/tensorrt/lenet5_train.py @@ -0,0 +1,84 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import os +import mxnet as mx +from lenet5_common import get_iters + + +def lenet5(): + """LeNet-5 Symbol""" + #pylint: disable=no-member + data = mx.sym.Variable('data') + conv1 = mx.sym.Convolution(data=data, kernel=(5, 5), num_filter=20) + tanh1 = mx.sym.Activation(data=conv1, act_type="tanh") + pool1 = mx.sym.Pooling(data=tanh1, pool_type="max", + kernel=(2, 2), stride=(2, 2)) + # second conv + conv2 = mx.sym.Convolution(data=pool1, kernel=(5, 5), num_filter=50) + tanh2 = mx.sym.Activation(data=conv2, act_type="tanh") + pool2 = mx.sym.Pooling(data=tanh2, pool_type="max", + kernel=(2, 2), stride=(2, 2)) + # first fullc + flatten = mx.sym.Flatten(data=pool2) + fc1 = mx.sym.FullyConnected(data=flatten, num_hidden=500) + tanh3 = mx.sym.Activation(data=fc1, act_type="tanh") + # second fullc + fc2 = mx.sym.FullyConnected(data=tanh3, num_hidden=10) + # loss + lenet = mx.sym.SoftmaxOutput(data=fc2, name='softmax') + #pylint: enable=no-member + return lenet + + +def train_lenet5(num_epochs, batch_size, train_iter, val_iter, test_iter): + """train LeNet-5 model on MNIST data""" + ctx = mx.gpu(0) + lenet_model = mx.mod.Module(lenet5(), context=ctx) + + lenet_model.fit(train_iter, + eval_data=val_iter, + optimizer='sgd', + optimizer_params={'learning_rate': 0.1, 'momentum': 0.9}, + eval_metric='acc', + batch_end_callback=mx.callback.Speedometer(batch_size, 1), + num_epoch=num_epochs) + + # predict accuracy for lenet + acc = mx.metric.Accuracy() + lenet_model.score(test_iter, acc) + accuracy = acc.get()[1] + assert accuracy > 0.95, "LeNet-5 training accuracy on MNIST was too low" + return lenet_model + + +if __name__ == '__main__': + num_epochs = 10 + batch_size = 128 + model_name = 'lenet5' + model_dir = os.getenv("LENET_MODEL_DIR", "/tmp") + model_file = '%s/%s-symbol.json' % (model_dir, model_name) + params_file = '%s/%s-%04d.params' % (model_dir, model_name, num_epochs) + + if not (os.path.exists(model_file) and os.path.exists(params_file)): + mnist = mx.test_utils.get_mnist() + + _, _, _, 
all_test_labels = get_iters(mnist, batch_size) + + trained_lenet = train_lenet5(num_epochs, batch_size, + *get_iters(mnist, batch_size)[:-1]) + trained_lenet.save_checkpoint(model_name, num_epochs) diff --git a/tests/python/tensorrt/test_cvnets.py b/tests/python/tensorrt/test_cvnets.py new file mode 100644 index 000000000000..4fdd522341bc --- /dev/null +++ b/tests/python/tensorrt/test_cvnets.py @@ -0,0 +1,179 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import gc +import gluoncv +import mxnet as mx +import numpy as np + +from mxnet import gluon +from time import time + +from mxnet.gluon.data.vision import transforms + + +def get_classif_model(model_name, use_tensorrt, ctx=mx.gpu(0), batch_size=128): + mx.contrib.tensorrt.set_use_tensorrt(use_tensorrt) + h, w = 32, 32 + net = gluoncv.model_zoo.get_model(model_name, pretrained=True) + data = mx.sym.var('data') + + if use_tensorrt: + out = net(data) + softmax = mx.sym.SoftmaxOutput(out, name='softmax') + all_params = dict([(k, v.data()) for k, v in net.collect_params().items()]) + executor = mx.contrib.tensorrt.tensorrt_bind(softmax, ctx=ctx, all_params=all_params, + data=(batch_size, 3, h, w), + softmax_label=(batch_size,), grad_req='null', + force_rebind=True) + else: + # Convert the Gluon model to its symbolic form + net.hybridize() + net.forward(mx.ndarray.zeros((batch_size, 3, h, w))) + net.export(model_name) + symbol, arg_params, aux_params = mx.model.load_checkpoint(model_name, 0) + executor = symbol.simple_bind(ctx=ctx, data=(batch_size, 3, h, w), + softmax_label=(batch_size,)) + executor.copy_params_from(arg_params, aux_params) + return executor + + +def cifar10_infer(model_name, use_tensorrt, num_workers, ctx=mx.gpu(0), batch_size=128): + executor = get_classif_model(model_name, use_tensorrt, ctx, batch_size) + + num_ex = 10000 + all_preds = np.zeros([num_ex, 10]) + + all_label_test = np.zeros(num_ex) + + transform_test = transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010]) + ]) + + data_loader = lambda: gluon.data.DataLoader( + gluon.data.vision.CIFAR10(train=False).transform_first(transform_test), + batch_size=batch_size, shuffle=False, num_workers=num_workers) + + val_data = data_loader() + + for idx, (data, label) in enumerate(val_data): + # Skip last batch if it's undersized. + if data.shape[0] < batch_size: + continue + offset = idx * batch_size + all_label_test[offset:offset + batch_size] = label.asnumpy() + + # warm-up, but don't use result + executor.forward(is_train=False, data=data) + executor.outputs[0].wait_to_read() + + gc.collect() + val_data = data_loader() + example_ct = 0 + start = time() + + for idx, (data, label) in enumerate(val_data): + # Skip last batch if it's undersized.
+ if data.shape[0] < batch_size: + continue + executor.forward(is_train=False, data=data) + preds = executor.outputs[0].asnumpy() + offset = idx * batch_size + all_preds[offset:offset + batch_size, :] = preds[:batch_size] + example_ct += batch_size + + all_preds = np.argmax(all_preds, axis=1) + matches = (all_preds[:example_ct] == all_label_test[:example_ct]).sum() + duration = time() - start + + return duration, 100.0 * matches / example_ct + + +def run_experiment_for(model_name, batch_size, num_workers): + print("\n===========================================") + print("Model: %s" % model_name) + print("===========================================") + print("*** Running inference using pure MXNet ***\n") + mx_duration, mx_pct = cifar10_infer(model_name=model_name, batch_size=batch_size, + num_workers=num_workers, use_tensorrt=False) + print("\nMXNet: time elapsed: %.3fs, accuracy: %.2f%%" % (mx_duration, mx_pct)) + print("\n*** Running inference using MXNet + TensorRT ***\n") + trt_duration, trt_pct = cifar10_infer(model_name=model_name, batch_size=batch_size, + num_workers=num_workers, use_tensorrt=True) + print("TensorRT: time elapsed: %.3fs, accuracy: %.2f%%" % (trt_duration, trt_pct)) + speedup = mx_duration / trt_duration + print("TensorRT speed-up (not counting compilation): %.2fx" % speedup) + + acc_diff = abs(mx_pct - trt_pct) + print("Absolute accuracy difference: %f" % acc_diff) + return speedup, acc_diff + + +def test_tensorrt_on_cifar_resnets(batch_size=32, tolerance=0.1, num_workers=1): + original_trt_value = mx.contrib.tensorrt.get_use_tensorrt() + try: + models = [ + 'cifar_resnet20_v1', + 'cifar_resnet56_v1', + 'cifar_resnet110_v1', + 'cifar_resnet20_v2', + 'cifar_resnet56_v2', + 'cifar_resnet110_v2', + 'cifar_wideresnet16_10', + 'cifar_wideresnet28_10', + 'cifar_wideresnet40_8', + 'cifar_resnext29_16x64d' + ] + + num_models = len(models) + + speedups = np.zeros(num_models, dtype=np.float32) + acc_diffs = np.zeros(num_models, dtype=np.float32) + + test_start = time() + + for idx, model in enumerate(models): + speedup, acc_diff = run_experiment_for(model, batch_size, num_workers) + speedups[idx] = speedup + acc_diffs[idx] = acc_diff + assert acc_diff < tolerance, "Accuracy difference between MXNet and TensorRT > %.2f%% for model %s" % ( + tolerance, model) + + print("Perf and correctness checks run on the following models:") + print(models) + mean_speedup = np.mean(speedups) + std_speedup = np.std(speedups) + print("\nSpeedups:") + print(speedups) + print("Speedup range: [%.2f, %.2f]" % (np.min(speedups), np.max(speedups))) + print("Mean speedup: %.2f" % mean_speedup) + print("St. dev. of speedups: %.2f" % std_speedup) + print("\nAcc. differences: %s" % str(acc_diffs)) + + test_duration = time() - test_start + + print("Test duration: %.2f seconds" % test_duration) + finally: + mx.contrib.tensorrt.set_use_tensorrt(original_trt_value) + + +if __name__ == '__main__': + import nose + + nose.runmodule() diff --git a/tests/python/tensorrt/test_cycle.py b/tests/python/tensorrt/test_cycle.py new file mode 100644 index 000000000000..25f515a106a6 --- /dev/null +++ b/tests/python/tensorrt/test_cycle.py @@ -0,0 +1,69 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership.
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import mxnet as mx +from common import * + + +def detect_cycle_from(sym, visited, stack): + visited.add(sym.handle.value) + stack.add(sym.handle.value) + for s in sym.get_children(): + if s.handle.value not in visited: + if detect_cycle_from(s, visited, stack): + return True + elif s.handle.value in stack: + return True + stack.remove(sym.handle.value) + return False + + +def has_no_cycle(sym): + visited = set() + stack = set() + all_nodes = sym.get_internals() + for s in all_nodes: + if s.handle.value not in visited: + if detect_cycle_from(s, visited, stack): + return False + return True + + +def test_simple_cycle(): + inp = mx.sym.Variable('input', shape=[1,10]) + A = mx.sym.FullyConnected(data=inp, num_hidden=10, no_bias=False, name='A') + B = mx.sym.FullyConnected(data=A, num_hidden=10, no_bias=False, name='B') + D = mx.sym.sin(data=A, name='D') + C = mx.sym.elemwise_add(lhs=B, rhs=D, name='C') + arg_params = { + 'I_weight': mx.nd.zeros([10,10]), + 'I_bias': mx.nd.zeros([10]), + 'A_weight': mx.nd.zeros([10,10]), + 'A_bias': mx.nd.zeros([10]), + 'B_weight': mx.nd.zeros([10,10]), + 'B_bias': mx.nd.zeros([10]), + } + + executor = C.simple_bind(ctx=mx.gpu(0), data=(1,10), softmax_label=(1,), + shared_buffer=arg_params, grad_req='null', force_rebind=True) + optimized_graph = mx.contrib.tensorrt.get_optimized_symbol(executor) + assert has_no_cycle(optimized_graph), "The graph optimized by TRT contains a cycle" + + +if __name__ == '__main__': + import nose + nose.runmodule() diff --git a/tests/python/tensorrt/test_tensorrt_lenet5.py b/tests/python/tensorrt/test_tensorrt_lenet5.py new file mode 100644 index 000000000000..258686428a45 --- /dev/null +++ b/tests/python/tensorrt/test_tensorrt_lenet5.py @@ -0,0 +1,108 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License.
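has_no_cycle and detect_cycle_from in test_cycle.py above implement the standard DFS back-edge check: visited holds nodes whose subtrees are fully explored, stack holds the nodes on the current DFS path, and an edge into a node still on that path signals a cycle. A self-contained sketch of the same idea on a plain adjacency dict instead of mx.sym handles (names are illustrative):

def _has_cycle(graph):
    visited, stack = set(), set()
    def visit(node):
        visited.add(node)
        stack.add(node)               # nodes on the current DFS path
        for child in graph[node]:
            if child not in visited:
                if visit(child):      # recurse into the child, not the parent
                    return True
            elif child in stack:      # back-edge onto the current path => cycle
                return True
        stack.remove(node)
        return False
    return any(visit(n) for n in graph if n not in visited)

assert not _has_cycle({'a': ['b', 'd'], 'b': ['c'], 'd': ['c'], 'c': []})  # diamond, acyclic
assert _has_cycle({'a': ['b'], 'b': ['a']})                                # two-node cycle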
+ +import os +import numpy as np +import mxnet as mx +from common import * +from lenet5_common import get_iters + + +def run_inference(sym, arg_params, aux_params, mnist, all_test_labels, batch_size, use_tensorrt): + """Run inference with either MXNet or TensorRT""" + mx.contrib.tensorrt.set_use_tensorrt(use_tensorrt) + + data_size = (batch_size,) + mnist['test_data'].shape[1:] + if use_tensorrt: + all_params = merge_dicts(arg_params, aux_params) + executor = mx.contrib.tensorrt.tensorrt_bind(sym, ctx=mx.gpu(0), all_params=all_params, + data=data_size, + softmax_label=(batch_size,), + grad_req='null', + force_rebind=True) + else: + executor = sym.simple_bind(ctx=mx.gpu(0), + data=data_size, + softmax_label=(batch_size,), + grad_req='null', + force_rebind=True) + executor.copy_params_from(arg_params, aux_params) + + # MNIST test set: 10,000 examples, 10 classes (consistent with all_test_labels) + num_ex = 10000 + all_preds = np.zeros([num_ex, 10]) + test_iter = mx.io.NDArrayIter(mnist['test_data'], mnist['test_label'], batch_size) + + example_ct = 0 + + for idx, dbatch in enumerate(test_iter): + executor.arg_dict["data"][:] = dbatch.data[0] + executor.forward(is_train=False) + offset = idx*batch_size + extent = batch_size if num_ex - offset > batch_size else num_ex - offset + all_preds[offset:offset+extent, :] = executor.outputs[0].asnumpy()[:extent] + example_ct += extent + + all_preds = np.argmax(all_preds, axis=1) + matches = (all_preds[:example_ct] == all_test_labels[:example_ct]).sum() + + percentage = 100.0 * matches / example_ct + + return percentage + + +def test_tensorrt_inference(): + """Run LeNet-5 inference comparison between MXNet and TensorRT.""" + original_trt_value = mx.contrib.tensorrt.get_use_tensorrt() + try: + check_tensorrt_installation() + mnist = mx.test_utils.get_mnist() + num_epochs = 10 + batch_size = 128 + model_name = 'lenet5' + model_dir = os.getenv("LENET_MODEL_DIR", "/tmp") + model_file = '%s/%s-symbol.json' % (model_dir, model_name) + params_file = '%s/%s-%04d.params' % (model_dir, model_name, num_epochs) + + _, _, _, all_test_labels = get_iters(mnist, batch_size) + + # Load serialized MXNet model (model-symbol.json + model-epoch.params) + sym, arg_params, aux_params = mx.model.load_checkpoint(model_name, num_epochs) + + print("LeNet-5 test") + print("Running inference in MXNet") + mx_pct = run_inference(sym, arg_params, aux_params, mnist, all_test_labels, + batch_size=batch_size, use_tensorrt=False) + + print("Running inference in MXNet-TensorRT") + trt_pct = run_inference(sym, arg_params, aux_params, mnist, all_test_labels, + batch_size=batch_size, use_tensorrt=True) + + print("MXNet accuracy: %f" % mx_pct) + print("MXNet-TensorRT accuracy: %f" % trt_pct) + + assert abs(mx_pct - trt_pct) < 1e-2, \ + """Diff. between MXNet & TensorRT accuracy too high: + MXNet = %f, TensorRT = %f""" % (mx_pct, trt_pct) + finally: + mx.contrib.tensorrt.set_use_tensorrt(original_trt_value) + + +if __name__ == '__main__': + import nose + nose.runmodule() diff --git a/tests/python/tensorrt/test_training_warning.py b/tests/python/tensorrt/test_training_warning.py new file mode 100644 index 000000000000..fdac859aef6f --- /dev/null +++ b/tests/python/tensorrt/test_training_warning.py @@ -0,0 +1,70 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership.
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import gluoncv +import mxnet as mx + +from tests.python.unittest.common import assertRaises + + +def test_training_without_trt(): + run_resnet(is_train=True, use_tensorrt=False) + + +def test_inference_without_trt(): + run_resnet(is_train=False, use_tensorrt=False) + + +def test_training_with_trt(): + assertRaises(RuntimeError, run_resnet, is_train=True, use_tensorrt=True) + + +def test_inference_with_trt(): + run_resnet(is_train=False, use_tensorrt=True) + + +def run_resnet(is_train, use_tensorrt): + original_trt_value = mx.contrib.tensorrt.get_use_tensorrt() + try: + mx.contrib.tensorrt.set_use_tensorrt(use_tensorrt) + ctx = mx.gpu(0) + batch_size = 1 + h = 32 + w = 32 + model_name = 'cifar_resnet20_v1' + resnet = gluoncv.model_zoo.get_model(model_name, pretrained=True) + data = mx.sym.var('data') + out = resnet(data) + softmax = mx.sym.SoftmaxOutput(out, name='softmax') + if is_train: + grad_req = 'write' + else: + grad_req = 'null' + if use_tensorrt: + all_params = dict([(k, v.data()) for k, v in resnet.collect_params().items()]) + mx.contrib.tensorrt.tensorrt_bind(softmax, ctx=ctx, all_params=all_params, + data=(batch_size, 3, h, w), softmax_label=(batch_size,), + force_rebind=True, grad_req=grad_req) + else: + softmax.simple_bind(ctx=ctx, data=(batch_size, 3, h, w), softmax_label=(batch_size,), + force_rebind=True, grad_req=grad_req) + finally: + mx.contrib.tensorrt.set_use_tensorrt(original_trt_value) + + +if __name__ == '__main__': + import nose + nose.runmodule() diff --git a/tests/python/unittest/test_contrib_control_flow.py b/tests/python/unittest/test_contrib_control_flow.py index f1188b53d814..a4b794c95951 100644 --- a/tests/python/unittest/test_contrib_control_flow.py +++ b/tests/python/unittest/test_contrib_control_flow.py @@ -1664,6 +1664,107 @@ def test_foreach_rnn(): check_foreach_rnn(cell_type, num_states) +@with_seed() +def test_cut_subgraph_foreach(): + class TestLayer(gluon.HybridBlock): + def __init__(self, prefix=None, params=None): + super(TestLayer, self).__init__(prefix=prefix, params=params) + + def hybrid_forward(self, F, inputs, states): + def step1(data, states): + return data + 1, states + out1, states1 = F.contrib.foreach(step1, inputs, states) + out2, states2 = F.contrib.foreach(step1, out1, states) + def step2(data, states): + return data + states[0], states1 + out, states = F.contrib.foreach(step2, out2, states) + return out + + data = mx.nd.normal(loc=0, scale=1, shape=(5, 10)) + states = mx.nd.normal(loc=0, scale=1, shape=(10)) + layer = TestLayer() + layer.initialize(ctx=default_context()) + res1 = layer(data, [states]) + + with mx.autograd.record(): + res1 = layer(data, [states]) + + layer = TestLayer() + layer.initialize(ctx=default_context()) + layer.hybridize() + res2 = layer(data, [states]) + + with mx.autograd.record(): + res2 = layer(data, [states]) + assert_almost_equal(res1.asnumpy(), res2.asnumpy(), rtol=0.001, atol=0.0001) + + +@with_seed() +def 
test_cut_subgraph_while_loop(): + class TestLayer(gluon.HybridBlock): + def __init__(self, prefix=None, params=None): + super(TestLayer, self).__init__(prefix=prefix, params=params) + def hybrid_forward(self, F, data): + out1, data1 = F.contrib.while_loop( + cond=lambda i: i <= 5, + func=lambda i: (None, (i + 1, )), + loop_vars=(data, ), + max_iterations=10, + ) + out2, data2 = F.contrib.while_loop( + cond=lambda i: data1[0], + func=lambda i: (None, (i + 1, )), + loop_vars=data1[0], + max_iterations=10, + ) + return data2[0] + data = mx.nd.normal(loc=0, scale=1, shape=(1, )) + layer = TestLayer() + layer.initialize(ctx=default_context()) + res1 = layer(data) + with mx.autograd.record(): + res1 = layer(data) + layer = TestLayer() + layer.initialize(ctx=default_context()) + layer.hybridize() + res2 = layer(data) + with mx.autograd.record(): + res2 = layer(data) + assert_almost_equal(res1.asnumpy(), res2.asnumpy(), rtol=0.001, atol=0.0001) + + +@with_seed() +def test_cut_subgraph_cond(): + class TestLayer(gluon.HybridBlock): + def __init__(self, prefix=None, params=None): + super(TestLayer, self).__init__(prefix=prefix, params=params) + def hybrid_forward(self, F, data): + (data1, ) = F.contrib.cond( + data > 0.5, + then_func=lambda: data * 2, + else_func=lambda: data * 3, + ) + (data2, ) = F.contrib.cond( + data1 > 0.5, + then_func=lambda: data1 * 2, + else_func=lambda: data1 * 3, + ) + return data2 + data = mx.nd.normal(loc=0, scale=1, shape=(1, )) + layer = TestLayer() + layer.initialize(ctx=default_context()) + res1 = layer(data) + with mx.autograd.record(): + res1 = layer(data) + layer = TestLayer() + layer.initialize(ctx=default_context()) + layer.hybridize() + res2 = layer(data) + with mx.autograd.record(): + res2 = layer(data) + assert_almost_equal(res1.asnumpy(), res2.asnumpy(), rtol=0.001, atol=0.0001) + + if __name__ == '__main__': import nose nose.runmodule() diff --git a/tests/python/unittest/test_executor.py b/tests/python/unittest/test_executor.py index 3117f6646481..2bc696fd4e43 100644 --- a/tests/python/unittest/test_executor.py +++ b/tests/python/unittest/test_executor.py @@ -72,7 +72,7 @@ def check_bind_with_uniform(uf, gf, dim, sf=None, lshape=None, rshape=None): assert_almost_equal(rhs_grad.asnumpy(), rhs_grad2, rtol=1e-5, atol=1e-5) -@with_seed(0) +@with_seed() def test_bind(): def check_bind(disable_bulk_exec): if disable_bulk_exec: @@ -97,11 +97,11 @@ def check_bind(disable_bulk_exec): dim) check_bind_with_uniform(lambda x, y: np.maximum(x, y), - lambda g, x, y: (g * (x>y), g * (y>x)), + lambda g, x, y: (g * (x>=y), g * (y>x)), dim, sf=mx.symbol.maximum) check_bind_with_uniform(lambda x, y: np.minimum(x, y), - lambda g, x, y: (g * (x 1: - max_data_len = max([seq.shape[0] for seq in inputs]) - max_labels_len = 0 if not labels else max([seq.shape[0] for seq in labels]) - else: - max_data_len = inputs[0].shape[0] - max_labels_len = 0 if not labels else labels[0].shape[0] - - x_lens = [item.shape[0] for item in inputs] - y_lens = [item.shape[0] for item in labels] - - for i, seq in enumerate(inputs): - pad_len = max_data_len - seq.shape[0] - inputs[i] = np.pad(seq, ((0, pad_len), (0, 0)), 'constant', constant_values=0) - labels[i] = np.pad(labels[i], (0, max_labels_len - labels[i].shape[0]), - 'constant', constant_values=-1) - - inputs = np.asarray(inputs, dtype=np.float32) - if labels is not None: - labels = np.asarray(labels, dtype=np.float32) - inputs = inputs.transpose((1, 0, 2)) - labels = labels.transpose((1, 0)) - - return (nd.array(inputs, dtype=inputs.dtype, 
ctx=context.Context('cpu_shared', 0)), - nd.array(x_lens, ctx=context.Context('cpu_shared', 0))) \ - if labels is None else ( - nd.array(inputs, dtype=inputs.dtype, ctx=context.Context('cpu_shared', 0)), - nd.array(x_lens, ctx=context.Context('cpu_shared', 0)), - nd.array(labels, dtype=labels.dtype, ctx=context.Context('cpu_shared', 0)), - nd.array(y_lens, ctx=context.Context('cpu_shared', 0))) + if isinstance(data, (tuple, list)): + return list(data) + if isinstance(data, mx.nd.NDArray): + return [data] + return data +def _batchify(data): + """ + Collate data into batch. Use shared memory for stacking. + :param data: a list of arrays, with layout of 'NTC'. + :return: x and x's unpadded lengths if no labels are supplied; otherwise + x, x's unpadded lengths, y and y's unpadded lengths. + """ - # This test is pointless on Windows because Windows doesn't fork - if platform.system() != 'Windows': - data = Dummy(True) - loader = DataLoader(data, batch_size=40, batchify_fn=data.batchify, num_workers=2) - for epoch in range(1): - for i, data in enumerate(loader): - if i % 100 == 0: - print(data) - print('{}:{}'.format(epoch, i)) - - data = Dummy(True) - loader = DataLoader(data, batch_size=40, batchify_fn=data.batchify_list, num_workers=2) - for epoch in range(1): - for i, data in enumerate(loader): - if i % 100 == 0: - print(data) - print('{}:{}'.format(epoch, i)) + # input layout is NTC + keys, inputs, labels = [item[0] for item in data], [item[1] for item in data], \ + [item[2] for item in data] + + if len(data) > 1: + max_data_len = max([seq.shape[0] for seq in inputs]) + max_labels_len = 0 if not labels else max([seq.shape[0] for seq in labels]) + else: + max_data_len = inputs[0].shape[0] + max_labels_len = 0 if not labels else labels[0].shape[0] + + x_lens = [item.shape[0] for item in inputs] + y_lens = [item.shape[0] for item in labels] + + for i, seq in enumerate(inputs): + pad_len = max_data_len - seq.shape[0] + inputs[i] = np.pad(seq, ((0, pad_len), (0, 0)), 'constant', constant_values=0) + labels[i] = np.pad(labels[i], (0, max_labels_len - labels[i].shape[0]), + 'constant', constant_values=-1) + + inputs = np.asarray(inputs, dtype=np.float32) + if labels is not None: + labels = np.asarray(labels, dtype=np.float32) + inputs = inputs.transpose((1, 0, 2)) + labels = labels.transpose((1, 0)) + + return (nd.array(inputs, dtype=inputs.dtype, ctx=context.Context('cpu_shared', 0)), + nd.array(x_lens, ctx=context.Context('cpu_shared', 0))) \ + if labels is None else ( + nd.array(inputs, dtype=inputs.dtype, ctx=context.Context('cpu_shared', 0)), + nd.array(x_lens, ctx=context.Context('cpu_shared', 0)), + nd.array(labels, dtype=labels.dtype, ctx=context.Context('cpu_shared', 0)), + nd.array(y_lens, ctx=context.Context('cpu_shared', 0))) + +@with_seed() +def test_multi_worker_forked_data_loader(): + data = _Dummy(False) + loader = DataLoader(data, batch_size=40, batchify_fn=_batchify, num_workers=2) + for epoch in range(1): + for i, data in enumerate(loader): + pass + + data = _Dummy(True) + loader = DataLoader(data, batch_size=40, batchify_fn=_batchify_list, num_workers=2) + for epoch in range(1): + for i, data in enumerate(loader): + pass if __name__ == '__main__': import nose diff --git a/tests/python/unittest/test_gluon_trainer.py b/tests/python/unittest/test_gluon_trainer.py index 2a34400d60ab..72c01acb2652 100644 --- a/tests/python/unittest/test_gluon_trainer.py +++ b/tests/python/unittest/test_gluon_trainer.py @@ -113,6 +113,24 @@ def test_trainer_save_load(): # check if parameter dict is
diff --git a/tests/python/unittest/test_gluon_trainer.py b/tests/python/unittest/test_gluon_trainer.py
index 2a34400d60ab..72c01acb2652 100644
--- a/tests/python/unittest/test_gluon_trainer.py
+++ b/tests/python/unittest/test_gluon_trainer.py
@@ -113,6 +113,24 @@ def test_trainer_save_load():
     # check if parameter dict is correctly associated with optimizer after load_state
     assert trainer._kvstore._updater.optimizer._get_lr(0) == 0.2
 
+@with_seed()
+def test_trainer_sparse_save_load():
+    x = gluon.Parameter('x', shape=(10, 1), lr_mult=1.0, stype='row_sparse')
+    x.initialize(ctx=[mx.cpu(0)], init='zeros')
+    trainer = gluon.Trainer([x], 'sgd', {'learning_rate': 0.1})
+    all_rows = mx.nd.arange(0, 10, ctx=mx.cpu(0))
+    with mx.autograd.record():
+        for w in x.list_row_sparse_data(all_rows):
+            y = w * 1
+            y.backward()
+    trainer.step(1)
+    assert trainer._kvstore._updater.optimizer._get_lr(0) == 0.1
+    trainer.save_states('test_trainer_sparse_save_load.states')
+    trainer.load_states('test_trainer_sparse_save_load.states')
+    x.lr_mult = 2.0
+    # check if parameter dict is correctly associated with optimizer after load_state
+    assert trainer._kvstore._updater.optimizer._get_lr(0) == 0.2
+
 @with_seed()
 def test_trainer_multi_layer_init():
     class Net(gluon.Block):
@@ -158,23 +176,6 @@ def check_init(ctxes):
     check_init([mx.cpu(1), mx.cpu(2)])
     check_init([mx.cpu(1)])
 
-@with_seed()
-def test_trainer_save_load():
-    x = gluon.Parameter('x', shape=(10,), lr_mult=1.0)
-    x.initialize(ctx=[mx.cpu(0), mx.cpu(1)], init='zeros')
-    trainer = gluon.Trainer([x], 'sgd', {'learning_rate': 0.1})
-    with mx.autograd.record():
-        for w in x.list_data():
-            y = w + 1
-            y.backward()
-    trainer.step(1)
-    assert trainer._kvstore._updater.optimizer._get_lr(0) == 0.1
-    trainer.save_states('test_trainer_save_load.states')
-    trainer.load_states('test_trainer_save_load.states')
-    x.lr_mult = 2.0
-    # check if parameter dict is correctly associated with optimizer after load_state
-    assert trainer._kvstore._updater.optimizer._get_lr(0) == 0.2
-
 @with_seed()
 def test_trainer_reset_kv():
     def check_trainer_reset_kv(kv):
diff --git a/tests/python/unittest/test_io.py b/tests/python/unittest/test_io.py
index 4dfa69cc1050..ddb06f9052bb 100644
--- a/tests/python/unittest/test_io.py
+++ b/tests/python/unittest/test_io.py
@@ -309,6 +309,7 @@ def test_DataBatch():
         'DataBatch: data shapes: \[\(2L?, 3L?\), \(7L?, 8L?\)\] label shapes: \[\(4L?, 5L?\)\]',
         str(batch)))
 
+@unittest.skip("Broken test: https://github.com/apache/incubator-mxnet/issues/12139")
 def test_CSVIter():
     def check_CSVIter_synthetic(dtype='float32'):
         cwd = os.getcwd()
diff --git a/tests/python/unittest/test_loss.py b/tests/python/unittest/test_loss.py
index 1da6244a4906..24cc747a3087 100644
--- a/tests/python/unittest/test_loss.py
+++ b/tests/python/unittest/test_loss.py
@@ -129,7 +129,7 @@ def test_logistic_loss_equal_bce():
     assert_almost_equal(loss_binary(data, label).asnumpy(), loss_bce(data, label).asnumpy())
     assert_almost_equal(loss_signed(data, 2 * label - 1).asnumpy(), loss_bce(data, label).asnumpy())
 
-@with_seed(1234)
+@with_seed()
 def test_kl_loss():
     N = 20
     data = mx.random.uniform(-1, 1, shape=(N, 10))
@@ -274,7 +274,7 @@ def test_saveload():
             eval_metric=mx.metric.Loss())
     assert mod.score(data_iter, eval_metric=mx.metric.Loss())[0][1] < 0.05
 
-@with_seed(1234)
+@with_seed()
 def test_huber_loss():
     N = 20
     data = mx.random.uniform(-1, 1, shape=(N, 10))
@@ -328,7 +328,7 @@ def test_squared_hinge_loss():
     assert mod.score(data_iter, eval_metric=mx.metric.Loss())[0][1] < 0.05
 
-@with_seed(1234)
+@with_seed()
 def test_triplet_loss():
     N = 20
     data = mx.random.uniform(-1, 1, shape=(N, 10))
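Several hunks in this patch replace fixed seeds such as `@with_seed(1234)` with `@with_seed()`, so the tests no longer pass only on one lucky RNG stream. A minimal sketch of the idea behind the decorator (the real one lives in tests/python/unittest/common.py and also seeds MXNet's own RNG; this is a simplified assumption-laden version, not the actual implementation):

```python
import functools
import logging
import random

import numpy as np

def with_seed(seed=None):
    """Run a test under a fresh random seed, logging it so failures can be replayed."""
    def decorator(test):
        @functools.wraps(test)
        def wrapper(*args, **kwargs):
            this_seed = seed if seed is not None else random.randint(0, 2**31)
            np.random.seed(this_seed)
            logging.info('%s is run with seed %d', test.__name__, this_seed)
            return test(*args, **kwargs)
        return wrapper
    return decorator
```
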
diff --git a/tests/python/unittest/test_module.py b/tests/python/unittest/test_module.py
index a21527a5a4ad..5e60989489f6 100644
--- a/tests/python/unittest/test_module.py
+++ b/tests/python/unittest/test_module.py
@@ -772,6 +772,8 @@ def test_forward_reshape():
                              for_training=False, force_rebind=True)
     assert mod.predict(pred_dataiter).shape == tuple([10, num_class])
 
+@with_seed()
+def test_forward_types():
     #Test forward with other data batch API
     Batch = namedtuple('Batch', ['data'])
     data = mx.sym.Variable('data')
@@ -786,6 +788,18 @@ def test_forward_reshape():
     mod.forward(Batch(data2))
     assert mod.get_outputs()[0].shape == (3, 5)
 
+    #Test forward with other NDArray and np.ndarray inputs
+    data = mx.sym.Variable('data')
+    out = data * 2
+    mod = mx.mod.Module(symbol=out, label_names=None)
+    mod.bind(data_shapes=[('data', (1, 10))])
+    mod.init_params()
+    data1 = mx.nd.ones((1, 10))
+    assert mod.predict(data1).shape == (1, 10)
+    data2 = np.ones((1, 10))
+    assert mod.predict(data2).shape == (1, 10)
+
+
 
 if __name__ == '__main__':
     import nose
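
The new `test_forward_types` exercises `Module` with a namedtuple batch and with raw NDArray/numpy inputs. A short usage sketch of the same three input styles (that `predict` converts raw arrays internally is inferred from the test, not verified here):

```python
from collections import namedtuple

import mxnet as mx
import numpy as np

Batch = namedtuple('Batch', ['data'])
x = mx.sym.Variable('data')
mod = mx.mod.Module(symbol=x * 2, label_names=None)
mod.bind(data_shapes=[('data', (1, 10))], for_training=False)
mod.init_params()

mod.forward(Batch([mx.nd.ones((1, 10))]))        # namedtuple batch API
print(mod.get_outputs()[0].shape)                # (1, 10)
print(mod.predict(mx.nd.ones((1, 10))).shape)    # raw NDArray input
print(mod.predict(np.ones((1, 10))).shape)       # raw numpy input
```
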
diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py
index 90e85d123d59..f1aec12ccc3e 100644
--- a/tests/python/unittest/test_operator.py
+++ b/tests/python/unittest/test_operator.py
@@ -30,7 +30,7 @@ from common import setup_module, with_seed, teardown, assert_raises_cudnn_disabled, assertRaises
 import unittest
 
-def check_rnn_consistency(cell1, cell2, T, N, I, H, grad_req):
+def check_rnn_consistency(cell1, cell2, T, N, I, H, grad_req, rtol=1e-2, atol=1e-4):
     dshape = (N, T, I)
     data = mx.sym.Variable('data')
 
@@ -53,18 +53,18 @@ def check_rnn_consistency(cell1, cell2, T, N, I, H, grad_req):
     # check inference
     mod1.forward(batch, is_train=False)
     mod2.forward(batch, is_train=False)
-    assert_allclose(mod1.get_outputs()[0].asnumpy(), mod2.get_outputs()[0].asnumpy(), rtol=1e-2, atol=1e-4)
+    assert_allclose(mod1.get_outputs()[0].asnumpy(), mod2.get_outputs()[0].asnumpy(), rtol=rtol, atol=atol)
 
     # check training
     mod1.forward(batch, is_train=True)
     mod2.forward(batch, is_train=True)
-    assert_allclose(mod1.get_outputs()[0].asnumpy(), mod2.get_outputs()[0].asnumpy(), rtol=1e-2, atol=1e-4)
+    assert_allclose(mod1.get_outputs()[0].asnumpy(), mod2.get_outputs()[0].asnumpy(), rtol=rtol, atol=atol)
 
     dy = mx.random.uniform(shape=mod1.get_outputs()[0].shape)
     mod1.backward(out_grads=[dy])
     mod2.backward(out_grads=[dy])
     if grad_req != 'null':
-        assert_allclose(mod1.get_input_grads()[0].asnumpy(), mod2.get_input_grads()[0].asnumpy(), rtol=1e-2, atol=1e-4)
+        assert_allclose(mod1.get_input_grads()[0].asnumpy(), mod2.get_input_grads()[0].asnumpy(), rtol=rtol, atol=atol)
     else:
         assert(mod1.get_input_grads()[0] == None)
         assert(mod2.get_input_grads()[0] == None)
@@ -195,9 +195,8 @@ def test_rnnrelu_sym():
     check_rnn_consistency(fused, stack, T, N, I, H, 'add')
     check_rnn_consistency(fused, stack, T, N, I, H, 'null')
 
-
-@unittest.skip("test fails intermittently. temporarily disabled till it gets fixed. tracked at https://github.com/apache/incubator-mxnet/issues/11410")
 @with_seed()
+@assert_raises_cudnn_disabled()
 def test_rnnrelu_bidirectional():
     T, N, I, H = 5, 20, 200, 200
 
@@ -214,9 +213,9 @@ def test_rnnrelu_bidirectional():
                 mx.rnn.RNNCell(H, activation='relu', prefix='r1_'),
                 output_prefix='bi_rnnrelu_1_'))
 
-    check_rnn_consistency(fused, stack, T, N, I, H, 'write')
-    check_rnn_consistency(fused, stack, T, N, I, H, 'add')
-    check_rnn_consistency(fused, stack, T, N, I, H, 'null')
+    check_rnn_consistency(fused, stack, T, N, I, H, 'write', rtol=1e-2, atol=1e-2)
+    check_rnn_consistency(fused, stack, T, N, I, H, 'add', rtol=1e-2, atol=1e-2)
+    check_rnn_consistency(fused, stack, T, N, I, H, 'null', rtol=1e-2, atol=1e-2)
 
 @with_seed()
 def test_lstm_dropout():
@@ -292,17 +291,17 @@ def check_elementwise_sum_with_shape(shape, n):
     exec1.forward(is_train=True)
     out1 = exec1.outputs[0].asnumpy()
     out = sum(a.asnumpy() for a in arr)
-    assert_almost_equal(out, out1)
+    assert_almost_equal(out, out1, rtol=1e-5, atol=1e-5)
 
     out_grad = mx.nd.empty(shape)
     out_grad[:] = np.random.uniform(-10, 10, shape)
     # backward
     exec1.backward([out_grad])
     for a in arr_grad:
-        assert_almost_equal(a.asnumpy(), out_grad.asnumpy())
+        assert_almost_equal(a.asnumpy(), out_grad.asnumpy(), rtol=1e-5, atol=1e-5)
 
 
-@with_seed(0)
+@with_seed()
 def test_elementwise_sum():
     nrepeat = 2
     maxdim = 4
@@ -819,6 +818,37 @@ def fprelu_grad(x, y, gamma):
     check_symbolic_backward(y, [xa, gam_full], [np.ones(shape), np.ones(gam_full.shape)],
                             [g_xa_full, g_gam_full], rtol=rtol, atol=atol, dtype=dtype)
 
+@with_seed()
+def test_selu():
+    alpha = 1.6732632423543772848170429916717
+    lamb = 1.0507009873554804934193349852946
+    def fselu(x):
+        neg_indices = x < 0
+        out = x.copy()
+        out[neg_indices] = alpha * np.expm1(out[neg_indices])
+        return out * lamb
+    def fselu_grad(grad, x, y):
+        neg_indices = x < 0
+        out = np.ones(x.shape).astype(x.dtype)
+        out[neg_indices] = y[neg_indices] / lamb + alpha  # selu'(x<0) = lamb*alpha*exp(x) = (y/lamb + alpha)*lamb
+        return out * lamb
+
+    shape = (3, 4)
+    x = mx.sym.Variable("x")
+    y = mx.sym.LeakyReLU(data=x, act_type="selu")
+    for dtype in [np.float16, np.float32, np.float64]:
+        xa = np.random.uniform(low=-0.1,high=0.1,size=shape).astype(dtype)
+        eps, rtol, atol = (7.5e-4, 1e-1, 1e-2) if dtype is np.float16 else (1e-4, 1e-2, 1e-4)
+        if dtype is np.float16:
+            xa /= 10.0
+        xa[abs(xa) < eps] = 0.01
+        ya = fselu(xa)
+        ga = fselu_grad(np.ones(shape).astype(dtype), xa, ya)
+        check_numeric_gradient(y, [xa], numeric_eps=eps, rtol=rtol, atol=atol, dtype=dtype)
+        check_symbolic_forward(y, [xa], [ya], rtol=rtol, atol=atol, dtype=dtype)
+        check_symbolic_backward(y, [xa], [np.ones(shape)], [ga], rtol=rtol, atol=atol, dtype=dtype)
+
+
 @with_seed()
 def test_sigmoid():
     def fsigmoid(a):
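
A quick standalone cross-check (not part of the patch) of the reference SELU gradient used in `test_selu`: for x < 0, selu(x) = lamb * alpha * (exp(x) - 1), so selu'(x) = lamb * alpha * exp(x), which equals (y / lamb + alpha) * lamb when y is the already-scaled output:

```python
import numpy as np

alpha = 1.6732632423543772848170429916717
lamb = 1.0507009873554804934193349852946

x = -0.05
y = lamb * alpha * np.expm1(x)                 # selu(x) for x < 0
analytic = lamb * (y / lamb + alpha)           # = lamb * alpha * exp(x)

h = 1e-6                                       # central finite difference
numeric = (lamb * alpha * np.expm1(x + h) - lamb * alpha * np.expm1(x - h)) / (2 * h)
assert abs(analytic - numeric) < 1e-6
```
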
@@ -2131,6 +2161,58 @@ def test_reshape_new(src_shape, shape_args, reverse, dst_shape):
         assert_allclose(exe.grad_arrays[0].asnumpy(), out_grad_npy.reshape((5, 4, 3, 7)))
 
 
+@with_seed()
+def test_reshape_like():
+    def test_reshape_like_new(lhs_shape, rhs_shape, lbeg, lend, rbeg, rend, dst_shape):
+        lhs = mx.sym.Variable("lhs")
+        rhs = mx.sym.Variable("rhs")
+        net = mx.sym.reshape_like(lhs, rhs, lhs_begin=lbeg, lhs_end=lend, rhs_begin=rbeg, rhs_end=rend)
+        js = net.tojson()
+        net = mx.sym.load_json(js)
+        _, output_shape, __ = net.infer_shape(lhs=lhs_shape, rhs=rhs_shape)
+
+        assert output_shape[0] == dst_shape, \
+            'LHS Shape = %s, RHS Shape = %s, lhs_begin = %s, lhs_end = %s, rhs_begin = %s, rhs_end = %s'\
+            %(str(lhs_shape), str(rhs_shape), str(lbeg), str(lend), str(rbeg), str(rend))
+
+        lhs_npy = np.random.rand(*lhs_shape)
+        rhs_npy = np.random.rand(*rhs_shape)
+        grad_npy = np.random.rand(*dst_shape)
+
+        exe = net.simple_bind(default_context(), lhs=lhs_shape, rhs=rhs_shape)
+        exe.arg_dict['lhs'][:] = lhs_npy
+        exe.arg_dict['rhs'][:] = rhs_npy
+        exe.forward(is_train=True)
+        assert np.square(exe.outputs[0].asnumpy() - lhs_npy.reshape(dst_shape)).mean() < 1E-7, \
+            'LHS Shape = %s, RHS Shape = %s, lhs_begin = %s, lhs_end = %s, rhs_begin = %s, rhs_end = %s'\
+            %(str(lhs_shape), str(rhs_shape), str(lbeg), str(lend), str(rbeg), str(rend))
+        exe.backward(out_grads=mx.nd.array(grad_npy))
+        assert np.square(exe.grad_dict['lhs'].asnumpy() - grad_npy.reshape(lhs_shape)).mean() < 1E-7, \
+            'LHS Shape = %s, RHS Shape = %s, lhs_begin = %s, lhs_end = %s, rhs_begin = %s, rhs_end = %s'\
+            %(str(lhs_shape), str(rhs_shape), str(lbeg), str(lend), str(rbeg), str(rend))
+
+    # Test new api (using begin/end slices)
+    test_cases = [
+        [(30,), (15,2,4), 0, None, 0, 2, (15,2)],
+        [(30,), (15,2,4), None, 1, None, 2, (15,2)],
+        [(30,7), (15,2,4), 0, 1, 0, 2, (15,2,7)],
+        [(3,5), (1,15,4), 0, 2, 1, 2, (15,)],
+        [(3,5), (1,15,4), 0, None, 1, -1, (15,)],
+        [(30,12), (4,2,2,3), -1, None, 1, None, (30,2,2,3)],
+        [(1,1,7,3,1,1), (81,1,1,21), 1, -1, 1, None, (1,1,1,21,1)]
+    ]
+    for test_case in test_cases:
+        test_reshape_like_new(*test_case)
+
+    # Test old api
+    lhs = mx.sym.Variable("lhs")
+    rhs = mx.sym.Variable("rhs")
+    net = mx.sym.reshape_like(lhs, rhs)
+    js = net.tojson()
+    net = mx.sym.load_json(js)
+    _, output_shape, __ = net.infer_shape(lhs=(40, 30), rhs=(30,20,2))
+    assert(output_shape[0] == (30,20,2))
+
+
 @with_seed()
 def test_reduce():
     sample_num = 500
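
A standalone illustration of the `reshape_like` slice semantics the new test exercises: the dims `lhs[lhs_begin:lhs_end]` are replaced by `rhs[rhs_begin:rhs_end]`, and the element counts of the two slices must match (the helper below is hypothetical, shape arithmetic only):

```python
import numpy as np

def reshape_like_shape(lhs, rhs, lbeg, lend, rbeg, rend):
    """Compute the output shape the way reshape_like's begin/end arguments imply."""
    lbeg, lend = slice(lbeg, lend).indices(len(lhs))[:2]   # normalize None/negative indices
    rbeg, rend = slice(rbeg, rend).indices(len(rhs))[:2]
    assert np.prod(lhs[lbeg:lend]) == np.prod(rhs[rbeg:rend]), 'slice sizes must agree'
    return tuple(lhs[:lbeg]) + tuple(rhs[rbeg:rend]) + tuple(lhs[lend:])

print(reshape_like_shape((30, 7), (15, 2, 4), 0, 1, 0, 2))   # (15, 2, 7)
print(reshape_like_shape((3, 5), (1, 15, 4), 0, 2, 1, 2))    # (15,)
```
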
@@ -3107,7 +3190,7 @@ def l2norm(input_data, axis=0, keepdims=True):
     for order in [1, 2]:
         for dtype in [np.float16, np.float32, np.float64]:
             in_data = np.random.uniform(-1, 1, in_shape).astype(dtype)
-            in_data[abs(in_data) < epsilon] = epsilon
+            in_data[abs(in_data) < epsilon] = 2 * epsilon
             for i in range(in_data_dim):
                 norm_sym = mx.symbol.norm(data=data, ord=order, axis=i, keepdims=True)
                 npy_out = l1norm(in_data, i) if order is 1 else l2norm(in_data, i)
@@ -3121,20 +3204,22 @@ def l2norm(input_data, axis=0, keepdims=True):
                                         atol=1e-2 if dtype is np.float16 else 1e-5, ctx=ctx)
                 # Disable numeric gradient https://github.com/apache/incubator-mxnet/issues/11509
                 # # check gradient
-                # check_numeric_gradient(norm_sym, [in_data], numeric_eps=epsilon, rtol=1e-2, atol=1e-3)
-                # if i < in_data_dim-1:
-                #     norm_sym = mx.symbol.norm(data=data, ord=order, axis=(i, i+1), keepdims=True)
-                #     npy_out = l1norm(in_data, (i, i+1)) if order is 1 else l2norm(in_data, (i, i+1))
-                #     npy_out_backward = np.sign(in_data) if order is 1 else in_data/npy_out
-                #     check_symbolic_forward(norm_sym, [in_data], [npy_out],
-                #                            rtol=1e-2 if dtype is np.float16 else 1e-5,
-                #                            atol=1e-2 if dtype is np.float16 else 1e-5, ctx=ctx)
-                #     check_symbolic_backward(norm_sym, [in_data], [np.ones(npy_out.shape)],
-                #                             [npy_out_backward],
-                #                             rtol=1e-2 if dtype is np.float16 else 1e-5,
-                #                             atol=1e-2 if dtype is np.float16 else 1e-5, ctx=ctx)
-                #     # check gradient
-                #     check_numeric_gradient(norm_sym, [in_data], numeric_eps=epsilon, rtol=1e-2, atol=1e-3)
+                # if dtype is not np.float16:
+                #     check_numeric_gradient(norm_sym, [in_data], numeric_eps=epsilon, rtol=1e-1, atol=1e-3)
+                if i < in_data_dim-1:
+                    norm_sym = mx.symbol.norm(data=data, ord=order, axis=(i, i+1), keepdims=True)
+                    npy_out = l1norm(in_data, (i, i+1)) if order is 1 else l2norm(in_data, (i, i+1))
+                    npy_out_backward = np.sign(in_data) if order is 1 else in_data/npy_out
+                    check_symbolic_forward(norm_sym, [in_data], [npy_out],
+                                           rtol=1e-2 if dtype is np.float16 else 1e-5,
+                                           atol=1e-2 if dtype is np.float16 else 1e-5, ctx=ctx)
+                    check_symbolic_backward(norm_sym, [in_data], [np.ones(npy_out.shape)],
+                                            [npy_out_backward],
+                                            rtol=1e-2 if dtype is np.float16 else 1e-5,
+                                            atol=1e-2 if dtype is np.float16 else 1e-5, ctx=ctx)
+                # # check gradient
+                # if dtype is not np.float16:
+                #     check_numeric_gradient(norm_sym, [in_data], numeric_eps=epsilon, rtol=1e-1, atol=1e-3)
 
 
 def test_layer_norm():
@@ -3727,6 +3812,31 @@ def check_output_n_grad(data_shape, idx_shape, axis, mode):
         exe.backward([mx.nd.array(grad_out)])
         assert_almost_equal(exe.grad_dict['a'].asnumpy(), grad_in)
 
+    def check_autograd_req():
+        row_len = 2
+        col_len = 8
+        shape = (row_len, col_len)
+        sc = mx.nd.random.uniform(-1.0, 1.0, shape=shape, dtype="float32")
+        sc.attach_grad()
+        i = mx.nd.array([0], dtype="int64")
+        j = mx.nd.array([0], dtype="int64")
+        with mx.autograd.record(train_mode=True):
+            xs = []
+            for _ in range(row_len):
+                x_i = []
+                for _ in range(col_len):
+                    x_ij = sc.take(i).squeeze(axis=0).take(j).squeeze(axis=0)
+                    x_i.append(x_ij)
+                    j = j + 1
+                i = i + 1
+                j = j - col_len  # reset j
+                xs.append(mx.nd.stack(*x_i))
+            x = mx.nd.stack(*xs)
+            x = x.sum()
+
+        x.backward()
+        assert_almost_equal(np.ones(sc.grad.shape), sc.grad.asnumpy())
+
     for mode in ['clip', 'wrap']:
         for data_ndim in range(1, 5):
             for idx_ndim in range(1, 4):
@@ -3739,6 +3849,8 @@ def check_output_n_grad(data_shape, idx_shape, axis, mode):
                 idx_shape += (np.random.randint(low=1, high=5), )
                 check_output_n_grad(data_shape, idx_shape, axis, mode)
 
+    check_autograd_req()
+
 
 @with_seed()
 def test_grid_generator():
@@ -5951,8 +6063,7 @@ def finite_diff_binary_op(
 # - Forward: Comparison to NumPy (several dtype)
 # - Backward: Comparison to NumPy (several dtype)
 # - Finite difference tests (only dtype = float64)
-# Seed set because the test is not robust enough to operate on random data
-@with_seed(192837465)
+@with_seed()
 def test_binary_math_operators():
     shape=(9, 10)
     dtype_l = [np.float64, np.float32, np.float16]
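
On the `2 * epsilon` change in the norm test: a central difference with step `eps` evaluated at a point within `eps` of zero straddles the kink of |x|, so the numeric L1 gradient disagrees with the analytic `sign(x)`. A small standalone demonstration:

```python
import numpy as np

eps = 1e-4
f = lambda x: np.abs(x).sum()   # the L1 norm being gradient-checked

for x0 in (0.5 * eps, 2 * eps):
    # central difference straddles the kink at 0 when |x0| < eps
    numeric = (f(np.array([x0 + eps])) - f(np.array([x0 - eps]))) / (2 * eps)
    print('x0 = %g: numeric = %.2f, analytic = 1.00' % (x0, numeric))
# prints 0.50 for x0 = 0.5*eps and 1.00 for x0 = 2*eps
```
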
diff --git a/tests/python/unittest/test_optimizer.py b/tests/python/unittest/test_optimizer.py
index fdf7d279d9cc..449cdb423466 100644
--- a/tests/python/unittest/test_optimizer.py
+++ b/tests/python/unittest/test_optimizer.py
@@ -442,7 +442,7 @@ def update(self, index, weight, grad, state):
                 tmp = weight32.astype(weight.dtype)
                 tmp.copyto(weight)
 
-@with_seed(0)
+@with_seed()
 def test_nag():
     opt1 = PyNAG
     opt2 = mx.optimizer.NAG
@@ -512,7 +512,7 @@ def update(self, index, weight, grad, state):
             prev_v[:] = v_t
             prev_z[:] = z_t
 
-@with_seed(0)
+@with_seed()
 def test_ftml():
     opt1 = PyFTML
     opt2 = mx.optimizer.FTML
@@ -534,7 +534,7 @@ def test_ftml():
                     kwarg.update(cg_option)
                     kwarg.update(rg_option)
                     kwarg.update(wd_option)
-                    compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype)
+                    compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype, rtol=1e-3, atol=1e-4)
 
 # ADAM
 
@@ -702,7 +702,7 @@ def update(self, index, weight, grad, state):
         else:
             weight[:] = (1 - lr*(wd+self.wd_lh))*weight - lr*mx.nd.sign(grad)
 
-@with_seed(0)
+@with_seed()
 def test_signum():
     opt1 = PySignum
     opt2 = mx.optimizer.Signum
@@ -943,7 +943,7 @@ def test_ftrl():
         compare_optimizer(opt1(lazy_update=True, **kwarg), opt2(**kwarg), shape,
                           np.float32, w_stype='row_sparse', g_stype='row_sparse')
 
-@with_seed(1234)
+@with_seed()
 def test_nadam():
 
     def get_net(num_hidden, flatten=True):
@@ -965,10 +965,10 @@ def get_net(num_hidden, flatten=True):
     loss = Loss(output, l)
     loss = mx.sym.make_loss(loss)
     mod = mx.mod.Module(loss, data_names=('data',), label_names=('label',))
-    mod.fit(data_iter, num_epoch=60, optimizer_params={'learning_rate': 0.0005, 'wd': 0.0005},
+    mod.fit(data_iter, num_epoch=60, optimizer_params={'learning_rate': 0.001, 'wd': 0.0005},
             initializer=mx.init.Xavier(magnitude=2), eval_metric=mx.metric.Loss(),
             optimizer='nadam')
-    assert mod.score(data_iter, eval_metric=mx.metric.Loss())[0][1] < 0.1
+    assert mod.score(data_iter, eval_metric=mx.metric.Loss())[0][1] < 0.11
 
 # AdaGrad
 class PyAdaGrad(mx.optimizer.Optimizer):
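
These optimizer tests rely on `compare_optimizer` to check a Python reference implementation against the built-in one under the tolerances adjusted above. A hedged sketch of the comparison idea (hypothetical helper, single update step only, using the public `Optimizer.create_state`/`update` API):

```python
import mxnet as mx
import numpy as np

def compare_one_step(opt1, opt2, shape, rtol=1e-3, atol=1e-4):
    """Run one update with each optimizer from identical weight/grad and compare."""
    w1 = mx.nd.random.uniform(shape=shape)
    g1 = mx.nd.random.uniform(shape=shape)
    w2, g2 = w1.copy(), g1.copy()
    s1, s2 = opt1.create_state(0, w1), opt2.create_state(0, w2)
    opt1.update(0, w1, g1, s1)
    opt2.update(0, w2, g2, s2)
    np.testing.assert_allclose(w1.asnumpy(), w2.asnumpy(), rtol=rtol, atol=atol)

compare_one_step(mx.optimizer.SGD(learning_rate=0.1),
                 mx.optimizer.SGD(learning_rate=0.1), (5, 5))
```
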
diff --git a/tests/python/unittest/test_predictor.py b/tests/python/unittest/test_predictor.py
new file mode 100644
index 000000000000..fc2fbf600cbc
--- /dev/null
+++ b/tests/python/unittest/test_predictor.py
@@ -0,0 +1,87 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from __future__ import print_function
+import sys, os
+curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
+sys.path.append(os.path.join(curr_path, "../../../amalgamation/python/"))
+from mxnet_predict import Predictor, load_ndarray_file
+
+import numpy as np
+import mxnet as mx
+import mxnet.ndarray as nd
+from mxnet import gluon
+from mxnet.test_utils import assert_almost_equal
+from common import setup_module, with_seed, teardown
+
+@with_seed()
+def test_predictor():
+    prefix = 'test_predictor_simple_dense'
+    symbol_file = "%s-symbol.json" % prefix
+    param_file = "%s-0000.params" % prefix
+
+    # two inputs with different batch sizes
+    input1 = np.random.uniform(size=(1,3))
+    input2 = np.random.uniform(size=(3,3))
+
+    # define a simple model
+    block = gluon.nn.HybridSequential()
+    block.add(gluon.nn.Dense(7))
+    block.add(gluon.nn.Dense(3))
+    block.hybridize()
+    block.initialize()
+    out1 = block.forward(nd.array(input1))
+    out2 = block.forward(nd.array(input2))
+    block.export(prefix)
+
+    # create a predictor
+    predictor = Predictor(open(symbol_file, "r").read(),
+                          open(param_file, "rb").read(),
+                          {'data':input1.shape})
+
+    # forward and get output
+    predictor.forward(data=input1)
+    predictor_out1 = predictor.get_output(0)
+    assert_almost_equal(out1.asnumpy(), predictor_out1, rtol=1e-5, atol=1e-6)
+
+    # reshape
+    predictor.reshape({'data':input2.shape})
+    predictor.forward(data=input2)
+    predictor_out2 = predictor.get_output(0)
+    assert_almost_equal(out2.asnumpy(), predictor_out2, rtol=1e-5, atol=1e-6)
+
+    # destroy the predictor
+    del predictor
+
+@with_seed()
+def test_load_ndarray():
+    nd_file = 'test_predictor_load_ndarray.params'
+    a = nd.random.uniform(shape=(7, 3))
+    b = nd.random.uniform(shape=(7,))
+    nd_data = {'a':a, 'b':b}
+    nd.save(nd_file, nd_data)
+
+    # test load_ndarray_file
+    nd_load = load_ndarray_file(open(nd_file, "rb").read())
+    assert(set(nd_data.keys()) == set(nd_load.keys()))
+    for k in nd_data.keys():
+        assert_almost_equal(nd_data[k].asnumpy(), nd_load[k], rtol=1e-5, atol=1e-6)
+
+
+if __name__ == '__main__':
+    import nose
+    nose.runmodule()
diff --git a/tests/python/unittest/test_random.py b/tests/python/unittest/test_random.py
index 43e9608934e3..575fcdd3b4c1 100644
--- a/tests/python/unittest/test_random.py
+++ b/tests/python/unittest/test_random.py
@@ -447,20 +447,20 @@ def test_uniform_generator():
             verify_generator(generator=generator_mx_same_seed, buckets=buckets, probs=probs)
 
 @with_seed()
-@unittest.skip('Flaky test, tracked in: https://github.com/apache/incubator-mxnet/issues/9856')
 def test_gamma_generator():
+    success_rate = 0.05
     ctx = mx.context.current_context()
     for dtype in ['float16', 'float32', 'float64']:
         for kappa, theta in [(0.5, 1.0), (1.0, 5.0)]:
             print("ctx=%s, dtype=%s, Shape=%g, Scale=%g:" % (ctx, dtype, kappa, theta))
             buckets, probs = gen_buckets_probs_with_ppf(lambda x: ss.gamma.ppf(x, a=kappa, loc=0, scale=theta), 5)
             generator_mx = lambda x: mx.nd.random.gamma(kappa, theta, shape=x, ctx=ctx, dtype=dtype).asnumpy()
-            verify_generator(generator=generator_mx, buckets=buckets, probs=probs)
+            verify_generator(generator=generator_mx, buckets=buckets, probs=probs, success_rate=success_rate)
             generator_mx_same_seed = \
                 lambda x: np.concatenate(
                     [mx.nd.random.gamma(kappa, theta, shape=x // 10, ctx=ctx, dtype=dtype).asnumpy()
                      for _ in range(10)])
-            verify_generator(generator=generator_mx_same_seed, buckets=buckets, probs=probs)
+            verify_generator(generator=generator_mx_same_seed, buckets=buckets, probs=probs, success_rate=success_rate)
 
 @with_seed()
 def test_exponential_generator():
@@ -625,6 +625,23 @@ def check_data(a, b):
         for j in range(i+1, num_seeds):
             check_data(data[i],data[j])
 
+@with_seed()
+def test_unique_zipfian_generator():
+    ctx = mx.context.current_context()
+    if ctx.device_type == 'cpu':
+        num_sampled = 8192
+        range_max = 793472
+        batch_size = 4
+        op = mx.nd._internal._sample_unique_zipfian
+        classes, num_trials = op(range_max, shape=(batch_size, num_sampled))
+        for i in range(batch_size):
+            num_trial = num_trials[i].asscalar()
+            # test uniqueness
+            assert np.unique(classes[i].asnumpy()).size == num_sampled
+            # test num trials; reference count obtained from the PyTorch implementation
+            assert num_trial > 14500
+            assert num_trial < 17000
+
 @with_seed()
 def test_zipfian_generator():
     # dummy true classes
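
`verify_generator` in these random tests is a bucketed chi-square goodness-of-fit check, and `success_rate` lowers the fraction of trials that must pass. A standalone sketch of the underlying idea (hypothetical helper; the real implementation lives in mxnet.test_utils):

```python
import numpy as np
import scipy.stats as ss

def chi_square_ok(samples, edges, probs, alpha=0.01):
    """Bucket the samples and accept if a chi-square test does not reject at level alpha."""
    counts, _ = np.histogram(samples, bins=edges)
    expected = np.asarray(probs) * len(samples)
    _, p_value = ss.chisquare(counts, expected)
    return p_value > alpha

# Five equal-probability buckets for a gamma(0.5, 1.0) distribution.
samples = np.random.gamma(shape=0.5, scale=1.0, size=100000)
edges = np.concatenate(([0], ss.gamma.ppf(np.linspace(0.2, 0.8, 4), a=0.5), [np.inf]))
print(chi_square_ok(samples, edges, [0.2] * 5))
```
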
diff --git a/tests/tutorials/test_sanity_tutorials.py b/tests/tutorials/test_sanity_tutorials.py
index f87e98e92126..96b60144134a 100644
--- a/tests/tutorials/test_sanity_tutorials.py
+++ b/tests/tutorials/test_sanity_tutorials.py
@@ -24,8 +24,15 @@
 # automated test suite.
 # Rules to be in the whitelist:
 # - not a python tutorial
-whitelist = ['c++/basics.md',
+whitelist = ['basic/index.md',
+             'c++/basics.md',
+             'c++/index.md',
+             'embedded/index.md',
              'embedded/wine_detector.md',
+             'gluon/index.md',
+             'nlp/index.md',
+             'onnx/index.md',
+             'python/index.md',
              'r/CallbackFunction.md',
              'r/charRnnModel.md',
              'r/classifyRealImageWithPretrainedModel.md',
@@ -37,9 +44,13 @@
              'r/ndarray.md',
              'r/symbol.md',
              'scala/char_lstm.md',
-             'scala/mnist.md',
              'scala/index.md',
-             'scala/mxnet_scala_on_intellij.md']
+             'scala/mnist.md',
+             'scala/mxnet_scala_on_intellij.md',
+             'sparse/index.md',
+             'speech_recognition/index.md',
+             'unsupervised_learning/index.md',
+             'vision/index.md']
 whitelist_set = set(whitelist)
 
 def test_tutorial_downloadable():
diff --git a/tests/utils/notebook_test/__init__.py b/tests/utils/notebook_test/__init__.py
index 2cdb6134a604..25e96ab0fc55 100644
--- a/tests/utils/notebook_test/__init__.py
+++ b/tests/utils/notebook_test/__init__.py
@@ -32,6 +32,9 @@
 IPYTHON_VERSION = 4 # Pin to ipython version 4.
 TIME_OUT = 10*60 # Maximum 10 mins/test. Reaching timeout causes test failure.
+RETRIES = 8
+KERNEL_ERROR_MSG = 'Kernel died before replying to kernel_info'
+
 
 def run_notebook(notebook, notebook_dir, kernel=None, no_cache=False, temp_dir='tmp_notebook'):
     """Run tutorial Jupyter notebook to catch any execution error.
@@ -72,15 +75,28 @@ def run_notebook(notebook, notebook_dir, kernel=None, no_cache=False, temp_dir='
             os.makedirs(working_dir)
     try:
         notebook = nbformat.read(notebook_path + '.ipynb', as_version=IPYTHON_VERSION)
-        # Adding a small delay to allow time for sockets to be freed
-        # stop-gap measure to battle the 1000ms linger of socket hard coded
-        # in the kernel API code
-        time.sleep(1.1)
         if kernel is not None:
            eprocessor = ExecutePreprocessor(timeout=TIME_OUT, kernel_name=kernel)
        else:
            eprocessor = ExecutePreprocessor(timeout=TIME_OUT)
-        nb, _ = eprocessor.preprocess(notebook, {'metadata': {'path': working_dir}})
+
+        # There is a low (< 1%) chance that starting a notebook executor will fail due to
+        # the kernel taking too long to start, a port collision, etc.
+        for i in range(RETRIES):
+            try:
+                nb, _ = eprocessor.preprocess(notebook, {'metadata': {'path': working_dir}})
+            except RuntimeError as rte:
+                # We check if the exception has to do with the Jupyter kernel failing to start. If
+                # not, we rethrow to prevent the notebook from failing RETRIES times. It is not
+                # ideal to inspect the exception message, but necessary for the retry logic, as the
+                # Jupyter client raises a generic RuntimeError that can be confused with other runtime errors.
+                if str(rte) != KERNEL_ERROR_MSG:
+                    raise rte
+
+                logging.info("Error starting preprocessor: {}. Attempt {}/{}".format(str(rte), i+1, RETRIES))
+                time.sleep(1)
+                continue
+            break
     except Exception as err:
         err_msg = str(err)
         errors.append(err_msg)