
Fixing issue #17840 (#18526)

Open · wants to merge 27 commits into master
46 changes: 46 additions & 0 deletions ci/docker/runtime_functions.sh
@@ -719,6 +719,7 @@ build_ubuntu_gpu_mkldnn() {
CC=gcc-7 CXX=g++-7 cmake \
-DCMAKE_BUILD_TYPE="RelWithDebInfo" \
-DUSE_MKL_IF_AVAILABLE=OFF \
-DUSE_TVM_OP=ON \
-DUSE_CUDA=ON \
-DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA_ARCH" \
-DUSE_CPP_PACKAGE=ON \
@@ -732,6 +733,7 @@ build_ubuntu_gpu_mkldnn_nocudnn() {
CC=gcc-7 CXX=g++-7 cmake \
-DCMAKE_BUILD_TYPE="RelWithDebInfo" \
-DUSE_MKL_IF_AVAILABLE=OFF \
-DUSE_TVM_OP=ON \
-DUSE_CUDA=ON \
-DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA_ARCH" \
-DUSE_CUDNN=OFF \
@@ -746,6 +748,7 @@ build_ubuntu_gpu_cuda101_cudnn7() {
CC=gcc-7 CXX=g++-7 cmake \
-DCMAKE_BUILD_TYPE="RelWithDebInfo" \
-DUSE_MKL_IF_AVAILABLE=OFF \
-DUSE_TVM_OP=ON \
-DUSE_CUDA=ON \
-DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA_ARCH" \
-DUSE_CUDNN=ON \
@@ -785,6 +788,7 @@ build_ubuntu_gpu_cuda101_cudnn7_make() {
USE_CUDA=1 \
USE_CUDA_PATH=/usr/local/cuda \
USE_CUDNN=1 \
USE_TVM_OP=1 \
USE_CPP_PACKAGE=1 \
USE_DIST_KVSTORE=1 \
CUDA_ARCH="$CI_CUDA_COMPUTE_CAPABILITIES" \
@@ -804,6 +808,7 @@ build_ubuntu_gpu_cuda101_cudnn7_mkldnn_cpp_test() {
USE_CUDA=1 \
USE_CUDA_PATH=/usr/local/cuda \
USE_CUDNN=1 \
USE_TVM_OP=0 \
USE_CPP_PACKAGE=1 \
USE_DIST_KVSTORE=1 \
CUDA_ARCH="$CI_CUDA_COMPUTE_CAPABILITIES" \
@@ -813,6 +818,23 @@ build_ubuntu_gpu_cuda101_cudnn7_mkldnn_cpp_test() {
make cython PYTHON=python3
}

build_ubuntu_gpu_cuda101_cudnn7_no_tvm_op() {
set -ex
cd /work/build
CC=gcc-7 CXX=g++-7 cmake \
-DCMAKE_BUILD_TYPE="RelWithDebInfo" \
-DUSE_MKL_IF_AVAILABLE=OFF \
-DUSE_TVM_OP=OFF \
-DUSE_CUDA=ON \
-DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA_ARCH" \
-DUSE_CUDNN=ON \
-DUSE_MKLDNN=OFF \
-DBUILD_CYTHON_MODULES=ON \
-DUSE_DIST_KVSTORE=ON \
-G Ninja /work/mxnet
ninja
}

build_ubuntu_amalgamation() {
set -ex
# Amalgamation cannot be run with -j nproc
@@ -843,6 +865,7 @@ build_ubuntu_gpu_cmake() {
-DUSE_SIGNAL_HANDLER=ON \
-DUSE_CUDA=ON \
-DUSE_CUDNN=ON \
-DUSE_TVM_OP=ON \
-DUSE_MKL_IF_AVAILABLE=OFF \
-DUSE_MKLML_MKL=OFF \
-DUSE_MKLDNN=OFF \
@@ -863,6 +886,7 @@ build_ubuntu_gpu_cmake_no_rtc() {
-DUSE_SIGNAL_HANDLER=ON \
-DUSE_CUDA=ON \
-DUSE_CUDNN=ON \
-DUSE_TVM_OP=ON \
-DUSE_MKL_IF_AVAILABLE=OFF \
-DUSE_MKLML_MKL=OFF \
-DUSE_MKLDNN=ON \
@@ -877,6 +901,27 @@ build_ubuntu_gpu_cmake_no_rtc() {
ninja
}

build_ubuntu_gpu_cmake_no_tvm_op() {
set -ex
cd /work/build
CC=gcc-7 CXX=g++-7 cmake \
-DUSE_SIGNAL_HANDLER=ON \
-DUSE_CUDA=ON \
-DUSE_CUDNN=ON \
-DUSE_TVM_OP=OFF \
-DUSE_MKL_IF_AVAILABLE=OFF \
-DUSE_MKLML_MKL=OFF \
-DUSE_MKLDNN=OFF \
-DUSE_DIST_KVSTORE=ON \
-DCMAKE_BUILD_TYPE=Release \
-DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA_ARCH" \
-DBUILD_CYTHON_MODULES=1 \
-G Ninja \
/work/mxnet

ninja
}

build_ubuntu_cpu_large_tensor() {
set -ex
cd /work/build
@@ -899,6 +944,7 @@ build_ubuntu_gpu_large_tensor() {
-DUSE_SIGNAL_HANDLER=ON \
-DUSE_CUDA=ON \
-DUSE_CUDNN=ON \
-DUSE_TVM_OP=ON \
-DUSE_MKL_IF_AVAILABLE=OFF \
-DUSE_MKLML_MKL=OFF \
-DUSE_MKLDNN=OFF \
44 changes: 44 additions & 0 deletions ci/jenkins/Jenkins_steps.groovy
@@ -310,6 +310,20 @@ def compile_unix_full_gpu_mkldnn_cpp_test(lib_name) {
}]
}

def compile_unix_full_gpu_no_tvm_op(lib_name) {
return ['GPU: CUDA10.1+cuDNN7 TVM_OP OFF': {
node(NODE_LINUX_CPU) {
ws('workspace/build-gpu-no-tvm-op') {
timeout(time: max_time, unit: 'MINUTES') {
utils.init_git()
utils.docker_run('ubuntu_build_cuda', 'build_ubuntu_gpu_cuda101_cudnn7_no_tvm_op', false)
utils.pack_lib(lib_name, mx_lib_cpp_examples_no_tvm_op)
}
}
}
}]
}

def compile_unix_cmake_gpu(lib_name) {
return ['GPU: CMake': {
node(NODE_LINUX_CPU) {
@@ -324,6 +338,20 @@ def compile_unix_cmake_gpu(lib_name) {
}]
}

def compile_unix_cmake_gpu_no_tvm_op(lib_name) {
return ['GPU: CMake TVM_OP OFF': {
node(NODE_LINUX_CPU) {
ws('workspace/build-cmake-gpu-no-tvm-op') {
timeout(time: max_time, unit: 'MINUTES') {
utils.init_git()
utils.docker_run('ubuntu_gpu_cu101', 'build_ubuntu_gpu_cmake_no_tvm_op', false)
utils.pack_lib(lib_name, mx_cmake_lib_no_tvm_op)
}
}
}
}]
}

def compile_unix_cmake_gpu_no_rtc(lib_name) {
return ['GPU: CMake CUDA RTC OFF': {
node(NODE_LINUX_CPU) {
@@ -338,6 +366,22 @@ def compile_unix_cmake_gpu_no_rtc(lib_name) {
}]
}

def test_unix_python3_gpu_no_tvm_op() {
return ['Python3: GPU TVM_OP OFF': {
node(NODE_LINUX_GPU) {
ws('workspace/ut-python3-gpu-no-tvm-op') {
try {
utils.unpack_and_init('gpu_no_tvm_op', mx_lib_cpp_examples_no_tvm_op)
python3_gpu_ut_cython('ubuntu_gpu_cu101')
utils.publish_test_coverage()
} finally {
utils.collect_test_results_unix('tests_gpu.xml', 'tests_python3_gpu.xml')
}
}
}
}]
}

def compile_unix_tensorrt_gpu(lib_name) {
return ['TensorRT': {
node(NODE_LINUX_CPU) {
4 changes: 4 additions & 0 deletions ci/jenkins/Jenkinsfile_unix_gpu
@@ -42,6 +42,8 @@ core_logic: {
custom_steps.compile_unix_cmake_gpu('cmake_gpu'),
custom_steps.compile_unix_tensorrt_gpu('tensorrt'),
custom_steps.compile_unix_int64_gpu('gpu_int64'),
custom_steps.compile_unix_full_gpu_no_tvm_op('gpu_no_tvm_op'),
custom_steps.compile_unix_cmake_gpu_no_tvm_op('cmake_gpu_no_tvm_op'),
custom_steps.compile_unix_cmake_gpu_no_rtc('gpu_no_rtc'),
custom_steps.compile_unix_full_gpu_mkldnn_cpp_test('gpu_mkldnn_cpp_test_make')
])
@@ -60,6 +62,8 @@ core_logic: {
// TODO(szha): fix and reenable the hanging issue. tracked in #18098
// custom_steps.test_unix_distributed_kvstore_gpu('gpu'),
custom_steps.test_unix_byteps_gpu('gpu'),
custom_steps.test_unix_python3_gpu_no_tvm_op(),
custom_steps.test_unix_capi_cpp_package('gpu_mkldnn_cpp_test_make'),
])
}
,
2 changes: 2 additions & 0 deletions contrib/tvmop/compile.py
@@ -152,6 +152,8 @@ def get_cuda_arch(arch):
# we create libtvmop.o first, which gives us a chance to link tvm_runtime together with libtvmop
# to allow mxnet to find external helper functions in libtvm_runtime
func_binary.save(arguments.target_path + "/libtvmop.o")
if len(func_binary.imported_modules):
func_binary.imported_modules[0].save(arguments.target_path + "/libtvmop.cubin")
ld_path = arguments.target_path if arguments.ld_path is None else arguments.ld_path
create_shared(arguments.target_path + "/libtvmop.so",
arguments.target_path + "/libtvmop.o",
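
For context, a minimal sketch of the host/device split this change persists. It assumes the TVM runtime `Module` API behaves as in the PR's TVM version (`save` writes a module to disk; for CUDA builds, compiled device code lives in `imported_modules`); the helper name is illustrative, not from the PR:

```python
# Sketch: persist both halves of a module built by tvm.build (as in
# compile.py). `func_binary` is the built module, `target_path` the
# output directory. Hypothetical helper, for illustration only.
def save_tvmop_artifacts(func_binary, target_path):
    # Host-side object file: wrappers that set up and launch the kernels.
    func_binary.save(target_path + "/libtvmop.o")
    # For CUDA builds the compiled device code is an imported module;
    # saving it as libtvmop.cubin lets MXLoadTVMOp re-import it at runtime.
    if len(func_binary.imported_modules):
        func_binary.imported_modules[0].save(target_path + "/libtvmop.cubin")
```

The load side (MXLoadTVMOp in c_api.cc below) then stitches the two files back together.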
10 changes: 9 additions & 1 deletion src/c_api/c_api.cc
@@ -1363,7 +1363,15 @@ int MXGetVersion(int *out) {
#if MXNET_USE_TVM_OP
int MXLoadTVMOp(const char *libpath) {
API_BEGIN();
tvm::runtime::TVMOpModule::Get()->Load(libpath);
tvm::runtime::TVMOpModule *global_module = tvm::runtime::TVMOpModule::Get();
global_module->Load(libpath);
#if MXNET_USE_CUDA
std::string libpathstr(libpath);
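// "libtvmop.so" is 11 characters; stripping it from the end of libpath
// leaves the directory prefix (see the review thread below).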
std::string cubinpath = libpathstr.substr(0, libpathstr.size() - 11) + "libtvmop.cubin";
Member:
would be better to pass libpath as dir, and do libpath + "libtvmop.so" as well to keep consistency.

@jinboci (Contributor, Author) on Jun 12, 2020:
I rewrote it in a more elegant way :)
Yes, but MXLoadTVMOp is called at
https://github.com/apache/incubator-mxnet/blob/1bf881f381f91b157a26d9beddcaa8f4960cc038/python/mxnet/tvmop.py#L31-L32
where _LIB_TVM_OP is returned from
https://github.com/apache/incubator-mxnet/blob/1bf881f381f91b157a26d9beddcaa8f4960cc038/python/mxnet/libinfo.py#L25
and _LIB_TVM_OP[0] is the path of libtvmop.so. We may need to modify find_lib_path or write a new function to get the directory where libtvmop.so is located.
tvm::runtime::TVMOpModule cubin_module;
cubin_module.Load(cubinpath);
global_module->Import(cubin_module);
#endif
API_END();
}
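
The alternative the thread converges on, sketched on the Python side: compute the directory once from the known libtvmop.so path instead of trimming a fixed-length suffix in C++. The list argument mirrors _LIB_TVM_OP from the linked tvmop.py; treat the helper and its signature as hypothetical:

```python
import os

# Hypothetical helper: given _LIB_TVM_OP (whose first entry is the full
# path of libtvmop.so, per the linked libinfo.py), return the paths of
# both artifacts that MXLoadTVMOp needs.
def tvmop_artifact_paths(lib_tvm_op):
    lib_dir = os.path.dirname(lib_tvm_op[0])
    return (os.path.join(lib_dir, "libtvmop.so"),
            os.path.join(lib_dir, "libtvmop.cubin"))
```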

6 changes: 6 additions & 0 deletions src/operator/tvmop/op_module.cc
@@ -46,6 +46,12 @@ void TVMOpModule::Load(const std::string &filepath) {
*module_ptr_ = module;
}

void TVMOpModule::Import(const TVMOpModule& module) {
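// Importing registers the device-code module (e.g. the CUDA cubin) under
// the host module, so packed functions can resolve their kernels at call time.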
CHECK(module_ptr_ != nullptr) << "module_ptr_ is not initialized.";
std::lock_guard<std::mutex> lock(mutex_);
module_ptr_->Import(*(module.module_ptr_));
}

PackedFunc GetFunction(const std::shared_ptr<Module> &module,
const std::string &op_name,
const std::vector<mxnet::TBlob> &args) {
2 changes: 2 additions & 0 deletions src/operator/tvmop/op_module.h
@@ -44,6 +44,8 @@ class TVMOpModule {
// Load TVM operators binary
void Load(const std::string& filepath);

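// Import another module's contents (e.g. device code) into this module.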
void Import(const TVMOpModule& module);

void Call(const std::string& func_name,
const mxnet::OpContext& ctx,
const std::vector<mxnet::TBlob>& args) const;