Avoid potential race conditions in device_scalar/device_uvector setters (#725)

There is a subtle problem with `rmm::device_uvector::set_element_async` and `rmm::device_scalar::set_value`. It's common to pass a literal (e.g. 0) to these functions, but they accept the parameter by reference, so the temporary created to hold the literal can be destroyed before the asynchronous `cudaMemcpyAsync` actually reads it, resulting in a use-after-free (a sketch of the hazard follows the list below). This PR:

1. Disallows passing literals to these functions by deleting the rvalue reference (`&&`) overload. This is a breaking API change.
2. In `device_scalar`, adds an optimization for `bool` types to always use `cudaMemsetAsync`. 
3. In `device_scalar`, adds a new method `set_value_zero` for the common case of setting the value to zero. This also uses `cudaMemsetAsync` (see the usage sketch at the end of this description).
4. Improves documentation.
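
For illustration, here is a minimal sketch of the hazard and of the safe pattern (not code from this PR; the function and variable names are assumptions):

```cpp
#include <rmm/cuda_stream.hpp>
#include <rmm/device_scalar.hpp>

#include <cstdint>

void hazard_example()
{
  rmm::cuda_stream stream;
  rmm::device_scalar<int32_t> s{stream};

  // Before this change the following compiled: the temporary int32_t holding
  // 42 could be destroyed before the cudaMemcpyAsync enqueued on `stream`
  // read it -- a use after free. The deleted `&&` overload now rejects it:
  // s.set_value(42, stream);  // error: use of deleted function

  // Safe pattern: a named object that outlives the asynchronous copy.
  int32_t v{42};
  s.set_value(v, stream);
  stream.synchronize();  // only now may `v` be modified or destroyed
}
```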

These changes will require PRs to fix up some uses in cudf, cuspatial, and cugraph.
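
A hedged sketch of how the new and changed setters are used (the RMM types and methods are those in the diff below; everything else is illustrative):

```cpp
#include <rmm/cuda_stream.hpp>
#include <rmm/device_scalar.hpp>
#include <rmm/device_uvector.hpp>

#include <cstdint>

void setters_example()
{
  rmm::cuda_stream stream;

  // Fundamental types gain set_value_zero: a single cudaMemsetAsync.
  rmm::device_scalar<int64_t> counter{stream};
  counter.set_value_zero(stream);

  // bool always lowers to cudaMemsetAsync; the value is read when the memset
  // is enqueued, so no host object has to outlive the call.
  rmm::device_scalar<bool> flag{stream};
  bool value{true};
  flag.set_value(value, stream);

  // device_uvector::set_element_async likewise requires a named l-value now.
  rmm::device_uvector<int32_t> vec(8, stream);
  int32_t element{7};
  vec.set_element_async(0, element, stream);

  stream.synchronize();  // host objects may be reused after this returns
}
```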

Authors:
  - Mark Harris (@harrism)

Approvers:
  - Rong Ou (@rongou)
  - Conor Hoekstra (@codereport)

URL: #725
harrism authored Mar 18, 2021
1 parent dc3ca44 commit dce33aa
Showing 4 changed files with 145 additions and 34 deletions.
113 changes: 94 additions & 19 deletions include/rmm/device_scalar.hpp
@@ -52,7 +52,7 @@ class device_scalar {
* @param mr Optional, resource with which to allocate.
*/
explicit device_scalar(
cuda_stream_view const &stream,
cuda_stream_view stream,
rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource())
: buffer{sizeof(T), stream, mr}
{
@@ -76,7 +76,7 @@
*/
explicit device_scalar(
T const &initial_value,
cuda_stream_view const &stream = cuda_stream_view{},
cuda_stream_view stream = cuda_stream_view{},
rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource())
: buffer{sizeof(T), stream, mr}
{
@@ -96,7 +96,7 @@
* @param mr The resource to use for allocating the new `device_scalar`
*/
device_scalar(device_scalar const &other,
cuda_stream_view const &stream = {},
cuda_stream_view stream = {},
rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource())
: buffer{other.buffer, stream, mr}
{
@@ -118,7 +118,7 @@
* @return T The value of the scalar.
* @param stream CUDA stream on which to perform the copy and synchronize.
*/
T value(cuda_stream_view const &stream = cuda_stream_view{}) const
T value(cuda_stream_view stream = cuda_stream_view{}) const
{
T host_value{};
_memcpy(&host_value, buffer.data(), stream);
@@ -129,6 +129,9 @@
/**
* @brief Sets the value of the `device_scalar` to the given `host_value`.
*
* This specialization for fundamental types is optimized to use `cudaMemsetAsync` when
* `host_value` is zero.
*
* @note If the stream specified to this function is different from the stream specified
* to the constructor, then appropriate dependencies must be inserted between the streams
* (e.g. using `cudaStreamWaitEvent()` or `cudaStreamSynchronize()`) before and after calling
@@ -138,8 +141,9 @@
* referenced by `host_value` should not be destroyed or modified until `stream` has been
* synchronized. Otherwise, behavior is undefined.
*
* @note: This function incurs a host to device memcpy and should be used sparingly.
* @note: This function incurs a host to device memcpy or device memset and should be used
* sparingly.
*
* Example:
* \code{cpp}
* rmm::device_scalar<int32_t> s;
@@ -155,17 +159,16 @@
* \endcode
*
* @throws `rmm::cuda_error` if copying `host_value` to device memory fails.
* @throws `rmm::cuda_error` if synchronizing `stream` fails.
*
* @param host_value The host value which will be copied to device
* @param stream CUDA stream on which to perform the copy
*/
template <typename Placeholder = void>
auto set_value(T const &host_value, cuda_stream_view const &stream = cuda_stream_view{})
-> std::enable_if_t<std::is_fundamental<T>::value, Placeholder>
template <typename U = T>
auto set_value(U const &host_value, cuda_stream_view stream = cuda_stream_view{})
-> std::enable_if_t<std::is_fundamental<U>::value && not std::is_same<U, bool>::value, void>
{
if (host_value == T{0}) {
RMM_CUDA_TRY(cudaMemsetAsync(buffer.data(), 0, sizeof(T), stream.value()));
if (host_value == U{0}) {
set_value_zero(stream);
} else {
_memcpy(buffer.data(), &host_value, stream);
}
@@ -174,6 +177,49 @@
/**
* @brief Sets the value of the `device_scalar` to the given `host_value`.
*
* This specialization for `bool` is optimized to always use `cudaMemsetAsync`.
*
* @note If the stream specified to this function is different from the stream specified
* to the constructor, then appropriate dependencies must be inserted between the streams
* (e.g. using `cudaStreamWaitEvent()` or `cudaStreamSynchronize()`) before and after calling
* this function, otherwise there may be a race condition.
*
* This function does not synchronize `stream` before returning. `host_value` is passed by value
* so a host-side copy may be performed before calling a device memset.
*
* @note: This function incurs a device memset.
*
* Example:
* \code{cpp}
* rmm::device_scalar<bool> s;
*
* bool v{true};
*
* // Copies `true` to device storage on `stream`. Does _not_ synchronize
* s.set_value(v, stream);
* ...
* cudaStreamSynchronize(stream);
* // Synchronization is required before `v` can be modified
* v = false;
* \endcode
*
* @throws `rmm::cuda_error` if the device memset fails.
*
* @param host_value The host value which the scalar will be set to (true or false)
* @param stream CUDA stream on which to perform the device memset
*/
template <typename U = T>
auto set_value(U const &host_value, cuda_stream_view stream = cuda_stream_view{})
-> std::enable_if_t<std::is_same<U, bool>::value, void>
{
RMM_CUDA_TRY(cudaMemsetAsync(buffer.data(), host_value, sizeof(bool), stream.value()));
}

/**
* @brief Sets the value of the `device_scalar` to the given `host_value`.
*
* Specialization for non-fundamental types.
*
* @note If the stream specified to this function is different from the stream specified
* to the constructor, then appropriate dependencies must be inserted between the streams
* (e.g. using `cudaStreamWaitEvent()` or `cudaStreamSynchronize()`) before and after calling
@@ -187,16 +233,16 @@
* Example:
* \code{cpp}
* rmm::device_scalar<int32_t> s;
* rmm::device_scalar<my_type> s;
*
* int v{42};
* my_type v{42, "text"};
*
* // Copies 42 to device storage on `stream`. Does _not_ synchronize
* s.set_value(v, stream);
* ...
* cudaStreamSynchronize(stream);
* // Synchronization is required before `v` can be modified
* v = 13;
* v.value = 21;
* \endcode
*
* @throws `rmm::cuda_error` if copying `host_value` to device memory fails
@@ -205,13 +251,42 @@
* @param host_value The host value which will be copied to device
* @param stream CUDA stream on which to perform the copy
*/
template <typename Placeholder = void>
auto set_value(T const &host_value, cuda_stream_view const &stream = cuda_stream_view{})
-> std::enable_if_t<not std::is_fundamental<T>::value, Placeholder>
template <typename U = T>
auto set_value(T const &host_value, cuda_stream_view stream = cuda_stream_view{})
-> std::enable_if_t<not std::is_fundamental<U>::value, void>
{
_memcpy(buffer.data(), &host_value, stream);
}

// Disallow passing literals to set_value to avoid race conditions where the memory holding the
// literal can be freed before the async memcpy / memset executes.
void set_value(T &&host_value, cuda_stream_view stream = cuda_stream_view{}) = delete;

/**
* @brief Sets the value of the `device_scalar` to zero.
*
* Only supported for fundamental types.
*
* @note If the stream specified to this function is different from the stream specified
* to the constructor, then appropriate dependencies must be inserted between the streams
* (e.g. using `cudaStreamWaitEvent()` or `cudaStreamSynchronize()`) before and after calling
* this function, otherwise there may be a race condition.
*
* This function does not synchronize `stream` before returning.
*
* @note: This function incurs a device memset and should be used sparingly.
*
* @throws `rmm::cuda_error` if the device memset fails.
*
* @param stream CUDA stream on which to perform the device memset
*/
template <typename U = T>
auto set_value_zero(cuda_stream_view stream = cuda_stream_view{})
-> std::enable_if_t<std::is_fundamental<U>::value, void>
{
RMM_CUDA_TRY(cudaMemsetAsync(buffer.data(), 0, sizeof(U), stream.value()));
}

/**
* @brief Returns pointer to object in device memory.
*
@@ -241,7 +316,7 @@
private:
rmm::device_buffer buffer{sizeof(T)};

inline void _memcpy(void *dst, const void *src, cuda_stream_view const &stream) const
inline void _memcpy(void *dst, const void *src, cuda_stream_view stream) const
{
RMM_CUDA_TRY(cudaMemcpyAsync(dst, src, sizeof(T), cudaMemcpyDefault, stream.value()));
}
26 changes: 19 additions & 7 deletions include/rmm/device_uvector.hpp
@@ -172,8 +172,8 @@ class device_uvector {
* Because this function synchronizes the stream `s`, it is safe to destroy or modify the object
* referenced by `v` after this function has returned.
*
* @note: This function incurs a host to device memcpy and should be used sparingly.
* @note: This function synchronizes `stream`.
* @note This function incurs a host to device memcpy and should be used sparingly.
* @note This function synchronizes `stream`.
*
* Example:
* \code{cpp}
@@ -211,7 +211,11 @@
* referenced by `v` should not be destroyed or modified until `stream` has been synchronized.
* Otherwise, behavior is undefined.
*
* @note: This function incurs a host to device memcpy and should be used sparingly.
* @note This function incurs a host to device memcpy and should be used sparingly.
*
* @note Calling this function with a literal or other r-value reference for `v` is disallowed
* to prevent the implementation from asynchronously copying from a literal or other implicit
* temporary after it is deleted or goes out of scope.
*
* Example:
* \code{cpp}
@@ -241,11 +245,17 @@
cudaMemcpyAsync(element_ptr(element_index), &v, sizeof(v), cudaMemcpyDefault, s.value()));
}

// We delete the r-value reference overload to prevent asynchronously copying from a literal or
// implicit temporary value after it is deleted or goes out of scope.
void set_element_async(std::size_t element_index,
value_type const&& v,
cuda_stream_view s) = delete;

/**
* @brief Returns the specified element from device memory
*
* @note: This function incurs a device to host memcpy and should be used sparingly.
* @note: This function synchronizes `stream`.
* @note This function incurs a device to host memcpy and should be used sparingly.
* @note This function synchronizes `stream`.
*
* @throws rmm::out_of_range exception if `element_index >= size()`
*
@@ -267,7 +277,8 @@
/**
* @brief Returns the first element.
*
* @note: This function incurs a device to host memcpy and should be used sparingly.
* @note This function incurs a device-to-host memcpy and should be used sparingly.
* @note This function synchronizes `stream`.
*
* @throws rmm::out_of_range exception if the vector is empty.
*
@@ -279,7 +290,8 @@
/**
* @brief Returns the last element.
*
* @note: This function incurs a device to host memcpy and should be used sparingly.
* @note This function incurs a device-to-host memcpy and should be used sparingly.
* @note This function synchronizes `stream`.
*
* @throws rmm::out_of_range exception if the vector is empty.
*
35 changes: 29 additions & 6 deletions tests/device_scalar_tests.cpp
@@ -25,20 +25,43 @@
#include <chrono>
#include <cstddef>
#include <random>
#include <type_traits>

template <typename T>
struct DeviceScalarTest : public ::testing::Test {
T value{};
rmm::cuda_stream stream{};
rmm::mr::device_memory_resource* mr{rmm::mr::get_current_device_resource()};
T value{};
std::default_random_engine generator{};
std::uniform_int_distribution<T> distribution{std::numeric_limits<T>::lowest(),
std::numeric_limits<T>::max()};

DeviceScalarTest() { value = distribution(generator); }
DeviceScalarTest() { value = random_value(); }

template <typename U = T, std::enable_if_t<std::is_same<U, bool>::value, bool> = true>
U random_value()
{
static std::bernoulli_distribution distribution{};
return distribution(generator);
}

template <
typename U = T,
std::enable_if_t<(std::is_integral<U>::value && not std::is_same<U, bool>::value), bool> = true>
U random_value()
{
static std::uniform_int_distribution<U> distribution{std::numeric_limits<T>::lowest(),
std::numeric_limits<T>::max()};
return distribution(generator);
}

template <typename U = T, std::enable_if_t<std::is_floating_point<U>::value, bool> = true>
U random_value()
{
static std::normal_distribution<U> distribution{100, 20};
return distribution(generator);
}
};

using Types = ::testing::Types<int8_t, int16_t, int32_t, int64_t>;
using Types = ::testing::Types<bool, int8_t, int16_t, int32_t, int64_t, float, double>;

TYPED_TEST_CASE(DeviceScalarTest, Types);

@@ -88,7 +111,7 @@ TYPED_TEST(DeviceScalarTest, SetValue)
rmm::device_scalar<TypeParam> scalar{this->value, this->stream, this->mr};
EXPECT_NE(nullptr, scalar.data());

auto expected = this->distribution(this->generator);
auto expected = this->random_value();

scalar.set_value(expected, this->stream);
EXPECT_EQ(expected, scalar.value(this->stream));
5 changes: 3 additions & 2 deletions tests/device_uvector_tests.cpp
Expand Up @@ -174,8 +174,9 @@ TYPED_TEST(TypedUVectorTest, GetSetElementAsync)
auto size = 12345;
rmm::device_uvector<TypeParam> uv(size, this->stream());
for (std::size_t i = 0; i < uv.size(); ++i) {
uv.set_element_async(i, i, this->stream());
EXPECT_EQ(static_cast<TypeParam>(i), uv.element(i, this->stream()));
auto init = static_cast<TypeParam>(i);
uv.set_element_async(i, init, this->stream());
EXPECT_EQ(init, uv.element(i, this->stream()));
}
}

