Avoid potential race conditions in device_scalar/device_uvector setters (#725)

There is a subtle problem with `rmm::device_uvector::set_element_async` and `rmm::device_scalar::set_value`. It's common to pass a literal (e.g. 0) to these functions, but they accept the parameter by reference, so the temporary created to hold the literal can be destroyed before the asynchronous `cudaMemcpyAsync` actually reads it, resulting in a use-after-free (a sketch of the hazard follows the list below). This PR:

1. Disallows passing literals to these functions by deleting the rvalue reference (`&&`) overload. This is a breaking API change.
2. In `device_scalar`, adds an optimization for `bool` types to always use `cudaMemsetAsync`. 
3. In `device_scalar`, adds a new method `set_value_zero` for the common case of setting the value to zero. This also uses `cudaMemsetAsync` (see the usage sketch at the end of this description).
4. Improves documentation.
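
For illustration, here is a minimal sketch of the hazard and of the safe pattern (not code from this PR; the function and variable names are assumptions):

```cpp
#include <rmm/cuda_stream.hpp>
#include <rmm/device_scalar.hpp>

#include <cstdint>

void hazard_example()
{
  rmm::cuda_stream stream;
  rmm::device_scalar<int32_t> s{stream};

  // Before this change the following compiled: the temporary int32_t holding
  // 42 could be destroyed before the cudaMemcpyAsync enqueued on `stream`
  // read it -- a use after free. The deleted `&&` overload now rejects it:
  // s.set_value(42, stream);  // error: use of deleted function

  // Safe pattern: a named object that outlives the asynchronous copy.
  int32_t v{42};
  s.set_value(v, stream);
  stream.synchronize();  // only now may `v` be modified or destroyed
}
```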

These changes will require PRs to fix up some uses in cudf, cuspatial, and cugraph.
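
A hedged sketch of how the new and changed setters are used (the RMM types and methods are those in the diff below; everything else is illustrative):

```cpp
#include <rmm/cuda_stream.hpp>
#include <rmm/device_scalar.hpp>
#include <rmm/device_uvector.hpp>

#include <cstdint>

void setters_example()
{
  rmm::cuda_stream stream;

  // Fundamental types gain set_value_zero: a single cudaMemsetAsync.
  rmm::device_scalar<int64_t> counter{stream};
  counter.set_value_zero(stream);

  // bool always lowers to cudaMemsetAsync; the value is read when the memset
  // is enqueued, so no host object has to outlive the call.
  rmm::device_scalar<bool> flag{stream};
  bool value{true};
  flag.set_value(value, stream);

  // device_uvector::set_element_async likewise requires a named l-value now.
  rmm::device_uvector<int32_t> vec(8, stream);
  int32_t element{7};
  vec.set_element_async(0, element, stream);

  stream.synchronize();  // host objects may be reused after this returns
}
```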

Authors:
  - Mark Harris (@harrism)

Approvers:
  - Rong Ou (@rongou)
  - Conor Hoekstra (@codereport)

URL: #725
harrism authored Mar 18, 2021
1 parent dc3ca44 commit dce33aa
Showing 4 changed files with 145 additions and 34 deletions.
113 changes: 94 additions & 19 deletions include/rmm/device_scalar.hpp
@@ -52,7 +52,7 @@ class device_scalar {
* @param mr Optional, resource with which to allocate.
*/
explicit device_scalar(
cuda_stream_view const &stream,
cuda_stream_view stream,
rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource())
: buffer{sizeof(T), stream, mr}
{
@@ -76,7 +76,7 @@
*/
explicit device_scalar(
T const &initial_value,
cuda_stream_view const &stream = cuda_stream_view{},
cuda_stream_view stream = cuda_stream_view{},
rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource())
: buffer{sizeof(T), stream, mr}
{
@@ -96,7 +96,7 @@
* @param mr The resource to use for allocating the new `device_scalar`
*/
device_scalar(device_scalar const &other,
cuda_stream_view const &stream = {},
cuda_stream_view stream = {},
rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource())
: buffer{other.buffer, stream, mr}
{
@@ -118,7 +118,7 @@
* @return T The value of the scalar.
* @param stream CUDA stream on which to perform the copy and synchronize.
*/
T value(cuda_stream_view const &stream = cuda_stream_view{}) const
T value(cuda_stream_view stream = cuda_stream_view{}) const
{
T host_value{};
_memcpy(&host_value, buffer.data(), stream);
@@ -129,6 +129,9 @@
/**
* @brief Sets the value of the `device_scalar` to the given `host_value`.
*
* This specialization for fundamental types is optimized to use `cudaMemsetAsync` when
* `host_value` is zero.
*
* @note If the stream specified to this function is different from the stream specified
* to the constructor, then appropriate dependencies must be inserted between the streams
* (e.g. using `cudaStreamWaitEvent()` or `cudaStreamSynchronize()`) before and after calling
@@ -138,8 +141,9 @@
* referenced by `host_value` should not be destroyed or modified until `stream` has been
* synchronized. Otherwise, behavior is undefined.
*
* @note: This function incurs a host to device memcpy and should be used sparingly.
* @note: This function incurs a host to device memcpy or device memset and should be used
* sparingly.
*
* Example:
* \code{cpp}
* rmm::device_scalar<int32_t> s;
@@ -155,17 +159,16 @@
* \endcode
*
* @throws `rmm::cuda_error` if copying `host_value` to device memory fails.
* @throws `rmm::cuda_error` if synchronizing `stream` fails.
*
* @param host_value The host value which will be copied to device
* @param stream CUDA stream on which to perform the copy
*/
template <typename Placeholder = void>
auto set_value(T const &host_value, cuda_stream_view const &stream = cuda_stream_view{})
-> std::enable_if_t<std::is_fundamental<T>::value, Placeholder>
template <typename U = T>
auto set_value(U const &host_value, cuda_stream_view stream = cuda_stream_view{})
-> std::enable_if_t<std::is_fundamental<U>::value && not std::is_same<U, bool>::value, void>
{
if (host_value == T{0}) {
RMM_CUDA_TRY(cudaMemsetAsync(buffer.data(), 0, sizeof(T), stream.value()));
if (host_value == U{0}) {
set_value_zero(stream);
} else {
_memcpy(buffer.data(), &host_value, stream);
}
@@ -174,6 +177,49 @@
/**
* @brief Sets the value of the `device_scalar` to the given `host_value`.
*
* This specialization for `bool` is optimized to always use `cudaMemsetAsync`.
*
* @note If the stream specified to this function is different from the stream specified
* to the constructor, then appropriate dependencies must be inserted between the streams
* (e.g. using `cudaStreamWaitEvent()` or `cudaStreamSynchronize()`) before and after calling
* this function, otherwise there may be a race condition.
*
* This function does not synchronize `stream` before returning. `host_value` is passed by value
* so a host-side copy may be performed before calling a device memset.
*
* @note: This function incurs a device memset.
*
* Example:
* \code{cpp}
* rmm::device_scalar<bool> s;
*
* bool v{true};
*
* // Copies `true` to device storage on `stream`. Does _not_ synchronize
* s.set_value(v, stream);
* ...
* cudaStreamSynchronize(stream);
* // Synchronization is required before `v` can be modified
* v = false;
* \endcode
*
* @throws `rmm::cuda_error` if the device memset fails.
*
* @param host_value The host value which the scalar will be set to (true or false)
* @param stream CUDA stream on which to perform the device memset
*/
template <typename U = T>
auto set_value(U const &host_value, cuda_stream_view stream = cuda_stream_view{})
-> std::enable_if_t<std::is_same<U, bool>::value, void>
{
RMM_CUDA_TRY(cudaMemsetAsync(buffer.data(), host_value, sizeof(bool), stream.value()));
}

/**
* @brief Sets the value of the `device_scalar` to the given `host_value`.
*
* Specialization for non-fundamental types.
*
* @note If the stream specified to this function is different from the stream specified
* to the constructor, then appropriate dependencies must be inserted between the streams
* (e.g. using `cudaStreamWaitEvent()` or `cudaStreamSynchronize()`) before and after calling
@@ -187,16 +233,16 @@
* Example:
* \code{cpp}
* rmm::device_scalar<int32_t> s;
* rmm::device_scalar<my_type> s;
*
* int v{42};
* my_type v{42, "text"};
*
* // Copies 42 to device storage on `stream`. Does _not_ synchronize
* s.set_value(v, stream);
* ...
* cudaStreamSynchronize(stream);
* // Synchronization is required before `v` can be modified
* v = 13;
* v.value = 21;
* \endcode
*
* @throws `rmm::cuda_error` if copying `host_value` to device memory fails
@@ -205,13 +251,42 @@
* @param host_value The host value which will be copied to device
* @param stream CUDA stream on which to perform the copy
*/
template <typename Placeholder = void>
auto set_value(T const &host_value, cuda_stream_view const &stream = cuda_stream_view{})
-> std::enable_if_t<not std::is_fundamental<T>::value, Placeholder>
template <typename U = T>
auto set_value(T const &host_value, cuda_stream_view stream = cuda_stream_view{})
-> std::enable_if_t<not std::is_fundamental<U>::value, void>
{
_memcpy(buffer.data(), &host_value, stream);
}

// Disallow passing literals to set_value to avoid race conditions where the memory holding the
// literal can be freed before the async memcpy / memset executes.
void set_value(T &&host_value, cuda_stream_view stream = cuda_stream_view{}) = delete;

/**
* @brief Sets the value of the `device_scalar` to zero.
*
* Only supported for fundamental types.
*
* @note If the stream specified to this function is different from the stream specified
* to the constructor, then appropriate dependencies must be inserted between the streams
* (e.g. using `cudaStreamWaitEvent()` or `cudaStreamSynchronize()`) before and after calling
* this function, otherwise there may be a race condition.
*
* This function does not synchronize `stream` before returning.
*
* @note: This function incurs a device memset and should be used sparingly.
*
* @throws `rmm::cuda_error` if the device memset fails.
*
* @param stream CUDA stream on which to perform the device memset
*/
template <typename U = T>
auto set_value_zero(cuda_stream_view stream = cuda_stream_view{})
-> std::enable_if_t<std::is_fundamental<U>::value, void>
{
RMM_CUDA_TRY(cudaMemsetAsync(buffer.data(), 0, sizeof(U), stream.value()));
}

/**
* @brief Returns pointer to object in device memory.
*
@@ -241,7 +316,7 @@
private:
rmm::device_buffer buffer{sizeof(T)};

inline void _memcpy(void *dst, const void *src, cuda_stream_view const &stream) const
inline void _memcpy(void *dst, const void *src, cuda_stream_view stream) const
{
RMM_CUDA_TRY(cudaMemcpyAsync(dst, src, sizeof(T), cudaMemcpyDefault, stream.value()));
}
26 changes: 19 additions & 7 deletions include/rmm/device_uvector.hpp
@@ -172,8 +172,8 @@ class device_uvector {
* Because this function synchronizes the stream `s`, it is safe to destroy or modify the object
* referenced by `v` after this function has returned.
*
* @note: This function incurs a host to device memcpy and should be used sparingly.
* @note: This function synchronizes `stream`.
* @note This function incurs a host to device memcpy and should be used sparingly.
* @note This function synchronizes `stream`.
*
* Example:
* \code{cpp}
@@ -211,7 +211,11 @@
* referenced by `v` should not be destroyed or modified until `stream` has been synchronized.
* Otherwise, behavior is undefined.
*
* @note: This function incurs a host to device memcpy and should be used sparingly.
* @note This function incurs a host to device memcpy and should be used sparingly.
*
* @note Calling this function with a literal or other r-value reference for `v` is disallowed
* to prevent the implementation from asynchronously copying from a literal or other implicit
* temporary after it is deleted or goes out of scope.
*
* Example:
* \code{cpp}
@@ -241,11 +245,17 @@
cudaMemcpyAsync(element_ptr(element_index), &v, sizeof(v), cudaMemcpyDefault, s.value()));
}

// We delete the r-value reference overload to prevent asynchronously copying from a literal or
// implicit temporary value after it is deleted or goes out of scope.
void set_element_async(std::size_t element_index,
value_type const&& v,
cuda_stream_view s) = delete;

/**
* @brief Returns the specified element from device memory
*
* @note: This function incurs a device to host memcpy and should be used sparingly.
* @note: This function synchronizes `stream`.
* @note This function incurs a device to host memcpy and should be used sparingly.
* @note This function synchronizes `stream`.
*
* @throws rmm::out_of_range exception if `element_index >= size()`
*
@@ -267,7 +277,8 @@
/**
* @brief Returns the first element.
*
* @note: This function incurs a device to host memcpy and should be used sparingly.
* @note This function incurs a device-to-host memcpy and should be used sparingly.
* @note This function synchronizes `stream`.
*
* @throws rmm::out_of_range exception if the vector is empty.
*
@@ -279,7 +290,8 @@
/**
* @brief Returns the last element.
*
* @note: This function incurs a device to host memcpy and should be used sparingly.
* @note This function incurs a device-to-host memcpy and should be used sparingly.
* @note This function synchronizes `stream`.
*
* @throws rmm::out_of_range exception if the vector is empty.
*
35 changes: 29 additions & 6 deletions tests/device_scalar_tests.cpp
@@ -25,20 +25,43 @@
#include <chrono>
#include <cstddef>
#include <random>
#include <type_traits>

template <typename T>
struct DeviceScalarTest : public ::testing::Test {
T value{};
rmm::cuda_stream stream{};
rmm::mr::device_memory_resource* mr{rmm::mr::get_current_device_resource()};
T value{};
std::default_random_engine generator{};
std::uniform_int_distribution<T> distribution{std::numeric_limits<T>::lowest(),
std::numeric_limits<T>::max()};

DeviceScalarTest() { value = distribution(generator); }
DeviceScalarTest() { value = random_value(); }

template <typename U = T, std::enable_if_t<std::is_same<U, bool>::value, bool> = true>
U random_value()
{
static std::bernoulli_distribution distribution{};
return distribution(generator);
}

template <
typename U = T,
std::enable_if_t<(std::is_integral<U>::value && not std::is_same<U, bool>::value), bool> = true>
U random_value()
{
static std::uniform_int_distribution<U> distribution{std::numeric_limits<T>::lowest(),
std::numeric_limits<T>::max()};
return distribution(generator);
}

template <typename U = T, std::enable_if_t<std::is_floating_point<U>::value, bool> = true>
U random_value()
{
static std::normal_distribution<U> distribution{100, 20};
return distribution(generator);
}
};

using Types = ::testing::Types<int8_t, int16_t, int32_t, int64_t>;
using Types = ::testing::Types<bool, int8_t, int16_t, int32_t, int64_t, float, double>;

TYPED_TEST_CASE(DeviceScalarTest, Types);

@@ -88,7 +111,7 @@ TYPED_TEST(DeviceScalarTest, SetValue)
rmm::device_scalar<TypeParam> scalar{this->value, this->stream, this->mr};
EXPECT_NE(nullptr, scalar.data());

auto expected = this->distribution(this->generator);
auto expected = this->random_value();

scalar.set_value(expected, this->stream);
EXPECT_EQ(expected, scalar.value(this->stream));
5 changes: 3 additions & 2 deletions tests/device_uvector_tests.cpp
Expand Up @@ -174,8 +174,9 @@ TYPED_TEST(TypedUVectorTest, GetSetElementAsync)
auto size = 12345;
rmm::device_uvector<TypeParam> uv(size, this->stream());
for (std::size_t i = 0; i < uv.size(); ++i) {
uv.set_element_async(i, i, this->stream());
EXPECT_EQ(static_cast<TypeParam>(i), uv.element(i, this->stream()));
auto init = static_cast<TypeParam>(i);
uv.set_element_async(i, init, this->stream());
EXPECT_EQ(init, uv.element(i, this->stream()));
}
}

