[Opt] remove cudaHostAllocWriteCombined to avoid exception on some …

…scenarios.
NVIDIA-Merlin · May 25, 2023 · f1aa393 · f1aa393
1 parent dad5c7a
commit f1aa393
Show file tree

Hide file tree

Showing 3 changed files with 30 additions and 12 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -55,15 +55,24 @@ foreach(cuda_arch ${sm})
   list(APPEND cuda_arch_list ${cuda_arch})
   message(STATUS "Assign GPU architecture (sm=${cuda_arch})")
 endforeach()
+
 list(LENGTH cuda_arch_list cuda_arch_list_length)
 if(cuda_arch_list_length EQUAL 0)
   list(APPEND cuda_arch_list "80")
   message(STATUS "Assign default GPU architecture sm=80")
 endif()
+
+if (CMAKE_BUILD_TYPE STREQUAL "Debug")
+  add_compile_definitions(CUDA_ERROR_CHECK)
+  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo")
+endif()
+
 foreach(cuda_arch ${cuda_arch_list})
   set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode arch=compute_${cuda_arch},code=sm_${cuda_arch}")
 endforeach()
 
+message(CMAKE_CUDA_FLAGS="${CMAKE_CUDA_FLAGS}")
+
 include_directories(
   ${PROJECT_SOURCE_DIR}/include
   ${PROJECT_SOURCE_DIR}/tests/googletest/googletest/include

diff --git a/include/merlin/core_kernels.cuh b/include/merlin/core_kernels.cuh
@@ -150,13 +150,20 @@ void initialize_buckets(Table<K, V, M>** table, const size_t start,
       (*table)->remaining_hbm_for_vectors -= slice_real_size;
     } else {
       (*table)->is_pure_hbm = false;
-      CUDA_CHECK(
-          cudaMallocHost(&((*table)->slices[i]), slice_real_size,
-                         cudaHostAllocMapped | cudaHostAllocWriteCombined));
+      CUDA_CHECK(cudaMallocHost(&((*table)->slices[i]), slice_real_size,
+                                cudaHostAllocMapped));
     }
     for (int j = 0; j < num_of_buckets_in_one_slice; j++) {
-      (*table)->buckets[start + num_of_allocated_buckets + j].vectors =
-          (*table)->slices[i] + j * (*table)->bucket_max_size * (*table)->dim;
+      if ((*table)->is_pure_hbm) {
+        (*table)->buckets[start + num_of_allocated_buckets + j].vectors =
+            (*table)->slices[i] + j * (*table)->bucket_max_size * (*table)->dim;
+      } else {
+        V* h_ptr =
+            (*table)->slices[i] + j * (*table)->bucket_max_size * (*table)->dim;
+        CUDA_CHECK(cudaHostGetDevicePointer(
+            &((*table)->buckets[start + num_of_allocated_buckets + j].vectors),
+            h_ptr, 0));
+      }
     }
     num_of_allocated_buckets += num_of_buckets_in_one_slice;
   }
@@ -201,7 +208,9 @@ size_t get_slice_size(Table<K, V, M>** table) {
   const size_t max_table_size = (*table)->max_size * sizeof(V) * (*table)->dim;
   size_t slice_size = 0;
 
-  if (max_table_size >= GB(16)) {
+  if (max_table_size >= GB(128)) {
+    slice_size = GB(16);
+  } else if (max_table_size >= GB(16)) {
     slice_size = GB(2);
   } else if (max_table_size >= GB(2)) {
     slice_size = MB(128);

diff --git a/include/merlin_hashtable.cuh b/include/merlin_hashtable.cuh
@@ -115,6 +115,12 @@ using EraseIfPredict = bool (*)(
     const M& threshold  ///< The threshold to compare with the `meta` argument.
 );
 
+#if THRUST_VERSION >= 101600
+static constexpr auto& thrust_par = thrust::cuda::par_nosync;
+#else
+static constexpr auto& thrust_par = thrust::cuda::par;
+#endif
+
 /**
  * A HierarchicalKV hash table is a concurrent and hierarchical hash table that
  * is powered by GPUs and can use HBM and host memory as storage for key-value
@@ -153,12 +159,6 @@ class HashTable {
   using DeviceMemoryPool = MemoryPool<DeviceAllocator<char>>;
   using HostMemoryPool = MemoryPool<HostAllocator<char>>;
 
-#if THRUST_VERSION >= 101600
-  static constexpr auto thrust_par = thrust::cuda::par_nosync;
-#else
-  static constexpr auto thrust_par = thrust::cuda::par;
-#endif
-
  public:
   /**
    * @brief Default constructor for the hash table class.