Skip to content

Commit

Permalink
[Opt] remove cudaHostAllocWriteCombined to avoid exception on some …
Browse files Browse the repository at this point in the history
…scenarios.
  • Loading branch information
rhdong committed May 25, 2023
1 parent dad5c7a commit f1aa393
Show file tree
Hide file tree
Showing 3 changed files with 30 additions and 12 deletions.
9 changes: 9 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -55,15 +55,24 @@ foreach(cuda_arch ${sm})
list(APPEND cuda_arch_list ${cuda_arch})
message(STATUS "Assign GPU architecture (sm=${cuda_arch})")
endforeach()

list(LENGTH cuda_arch_list cuda_arch_list_length)
if(cuda_arch_list_length EQUAL 0)
list(APPEND cuda_arch_list "80")
message(STATUS "Assign default GPU architecture sm=80")
endif()

if (CMAKE_BUILD_TYPE STREQUAL "Debug")
add_compile_definitions(CUDA_ERROR_CHECK)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo")
endif()

foreach(cuda_arch ${cuda_arch_list})
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode arch=compute_${cuda_arch},code=sm_${cuda_arch}")
endforeach()

message(CMAKE_CUDA_FLAGS="${CMAKE_CUDA_FLAGS}")

include_directories(
${PROJECT_SOURCE_DIR}/include
${PROJECT_SOURCE_DIR}/tests/googletest/googletest/include
Expand Down
21 changes: 15 additions & 6 deletions include/merlin/core_kernels.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -150,13 +150,20 @@ void initialize_buckets(Table<K, V, M>** table, const size_t start,
(*table)->remaining_hbm_for_vectors -= slice_real_size;
} else {
(*table)->is_pure_hbm = false;
CUDA_CHECK(
cudaMallocHost(&((*table)->slices[i]), slice_real_size,
cudaHostAllocMapped | cudaHostAllocWriteCombined));
CUDA_CHECK(cudaMallocHost(&((*table)->slices[i]), slice_real_size,
cudaHostAllocMapped));
}
for (int j = 0; j < num_of_buckets_in_one_slice; j++) {
(*table)->buckets[start + num_of_allocated_buckets + j].vectors =
(*table)->slices[i] + j * (*table)->bucket_max_size * (*table)->dim;
if ((*table)->is_pure_hbm) {
(*table)->buckets[start + num_of_allocated_buckets + j].vectors =
(*table)->slices[i] + j * (*table)->bucket_max_size * (*table)->dim;
} else {
V* h_ptr =
(*table)->slices[i] + j * (*table)->bucket_max_size * (*table)->dim;
CUDA_CHECK(cudaHostGetDevicePointer(
&((*table)->buckets[start + num_of_allocated_buckets + j].vectors),
h_ptr, 0));
}
}
num_of_allocated_buckets += num_of_buckets_in_one_slice;
}
Expand Down Expand Up @@ -201,7 +208,9 @@ size_t get_slice_size(Table<K, V, M>** table) {
const size_t max_table_size = (*table)->max_size * sizeof(V) * (*table)->dim;
size_t slice_size = 0;

if (max_table_size >= GB(16)) {
if (max_table_size >= GB(128)) {
slice_size = GB(16);
} else if (max_table_size >= GB(16)) {
slice_size = GB(2);
} else if (max_table_size >= GB(2)) {
slice_size = MB(128);
Expand Down
12 changes: 6 additions & 6 deletions include/merlin_hashtable.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,12 @@ using EraseIfPredict = bool (*)(
const M& threshold ///< The threshold to compare with the `meta` argument.
);

#if THRUST_VERSION >= 101600
static constexpr auto& thrust_par = thrust::cuda::par_nosync;
#else
static constexpr auto& thrust_par = thrust::cuda::par;
#endif

/**
* A HierarchicalKV hash table is a concurrent and hierarchical hash table that
* is powered by GPUs and can use HBM and host memory as storage for key-value
Expand Down Expand Up @@ -153,12 +159,6 @@ class HashTable {
using DeviceMemoryPool = MemoryPool<DeviceAllocator<char>>;
using HostMemoryPool = MemoryPool<HostAllocator<char>>;

#if THRUST_VERSION >= 101600
static constexpr auto thrust_par = thrust::cuda::par_nosync;
#else
static constexpr auto thrust_par = thrust::cuda::par;
#endif

public:
/**
* @brief Default constructor for the hash table class.
Expand Down

0 comments on commit f1aa393

Please sign in to comment.