From 438d312257fa5bc6b57b8c2b1191e6e61d04a307 Mon Sep 17 00:00:00 2001 From: Rong Ou Date: Wed, 16 Mar 2022 13:01:45 -0700 Subject: [PATCH] Disable opportunistic reuse in async mr when cuda driver < 11.5 (#993) In https://github.com/NVIDIA/spark-rapids/issues/4710 we found issues with the async pool that may cause memory errors with older drivers. This was confirmed with the CUDA team. For driver versions < 11.5, we'll disable `cudaMemPoolReuseAllowOpportunistic`. @abellina Authors: - Rong Ou (https://github.com/rongou) Approvers: - Alessandro Bellina (https://github.com/abellina) - Jake Hemstad (https://github.com/jrhemstad) - Mark Harris (https://github.com/harrism) - Leo Fang (https://github.com/leofang) URL: https://github.com/rapidsai/rmm/pull/993 --- include/rmm/mr/device/cuda_async_memory_resource.hpp | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/include/rmm/mr/device/cuda_async_memory_resource.hpp b/include/rmm/mr/device/cuda_async_memory_resource.hpp index 5c0525131..f73497a5e 100644 --- a/include/rmm/mr/device/cuda_async_memory_resource.hpp +++ b/include/rmm/mr/device/cuda_async_memory_resource.hpp @@ -73,6 +73,18 @@ class cuda_async_memory_resource final : public device_memory_resource { pool_props.location.id = rmm::detail::current_device().value(); RMM_CUDA_TRY(cudaMemPoolCreate(&cuda_pool_handle_, &pool_props)); + // CUDA drivers before 11.5 have known incompatibilities with the async allocator. + // We'll disable `cudaMemPoolReuseAllowOpportunistic` if cuda driver < 11.5. + // See https://github.com/NVIDIA/spark-rapids/issues/4710. 
+ int driver_version{}; + RMM_CUDA_TRY(cudaDriverGetVersion(&driver_version)); + constexpr auto min_async_version{11050}; + if (driver_version < min_async_version) { + int disabled{0}; + RMM_CUDA_TRY( + cudaMemPoolSetAttribute(cuda_pool_handle_, cudaMemPoolReuseAllowOpportunistic, &disabled)); + } + auto const [free, total] = rmm::detail::available_device_memory(); // Need an l-value to take address to pass to cudaMemPoolSetAttribute