From 7d1ddff91f538fd3d99b102319b928af38aa5b73 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Fri, 6 Sep 2024 10:12:35 +0200 Subject: [PATCH 01/11] Implement `cuda::uninitialized_async_buffer` This uninitialized buffer provides a stream ordered allocation of N elements of type T utilitzing a cuda::mr::async_resource to allocate the storage. The buffer takes care of alignment and deallocation of the storage. The user is required to ensure that the lifetime of the memory resource exceeds the lifetime of the buffer. --- .../uninitialized_async_buffer.cuh | 225 ++++++++++++++++++ .../async_memory_resource.cuh | 3 + cudax/include/cuda/experimental/buffer.cuh | 5 +- cudax/test/CMakeLists.txt | 1 + .../containers/uninitialized_async_buffer.cu | 155 ++++++++++++ docs/cudax/container.rst | 3 +- 6 files changed, 389 insertions(+), 3 deletions(-) create mode 100644 cudax/include/cuda/experimental/__container/uninitialized_async_buffer.cuh create mode 100644 cudax/test/containers/uninitialized_async_buffer.cu diff --git a/cudax/include/cuda/experimental/__container/uninitialized_async_buffer.cuh b/cudax/include/cuda/experimental/__container/uninitialized_async_buffer.cuh new file mode 100644 index 00000000000..cd7339feac3 --- /dev/null +++ b/cudax/include/cuda/experimental/__container/uninitialized_async_buffer.cuh @@ -0,0 +1,225 @@ +//===----------------------------------------------------------------------===// +// +// Part of the CUDA Toolkit, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef __CUDAX__CONTAINERS_UNINITIALIZED_ASYNC_BUFFER_H +#define __CUDAX__CONTAINERS_UNINITIALIZED_ASYNC_BUFFER_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include +#include + +#include + +#if _CCCL_STD_VER >= 2014 && !defined(_CCCL_COMPILER_MSVC_2017) \ + && defined(LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE) + +//! @file +//! The \c uninitialized_async_buffer class provides a typed buffer allocated in stream-order from a given memory +//! resource. +namespace cuda::experimental +{ + +//! @rst +//! .. _cudax-containers-uninitialized-async-buffer: +//! +//! Uninitialized stream-ordered type-safe memory storage +//! ------------------------------------------------------ +//! +//! ``uninitialized_async_buffer`` provides a typed buffer allocated in stream order from a given :ref:`async memory +//! resource `. It handles alignment and release of the allocation. +//! The memory is uninitialized, so that a user needs to ensure elements are properly constructed. +//! +//! In addition to being type safe, ``uninitialized_async_buffer`` also takes a set of :ref:`properties +//! ` to ensure that e.g. execution space constraints are checked +//! at compile time. However, only stateless properties can be forwarded. To use a stateful property, +//! implement :ref:`get_property(const uninitialized_async_buffer&, Property) +//! `. +//! +//! .. warning:: +//! +//! ``uninitialized_async_buffer`` stores a reference to the provided memory `memory resource +//! `. It is the user's resposibility to ensure the lifetime of +//! the resource exceeds the lifetime of the buffer. +//! +//! .. warning:: +//! +//! ``uninitialized_async_buffer`` uses `stream-ordered allocation +//! `__. It is the user's +//! resposibility to ensure the lifetime of both the provided async resource and the stream exceed the lifetime of +//! the buffer. +//! +//! @endrst +//! @tparam _T the type to be stored in the buffer +//! @tparam _Properties... The properties the allocated memory satisfies +template +class uninitialized_async_buffer +{ +private: + ::cuda::experimental::mr::async_any_resource<_Properties...> __mr_; + ::cuda::stream_ref __stream_ = {}; + size_t __count_ = 0; + void* __buf_ = nullptr; + + //! @brief Determines the allocation size given the alignment and size of `T` + _CCCL_NODISCARD static constexpr size_t __get_allocation_size(const size_t __count) noexcept + { + constexpr size_t __alignment = alignof(_Tp); + return (__count * sizeof(_Tp) + (__alignment - 1)) & ~(__alignment - 1); + } + + //! @brief Determines the properly aligned start of the buffer given the alignment and size of `T` + _CCCL_NODISCARD constexpr _Tp* __get_data() const noexcept + { + constexpr size_t __alignment = alignof(_Tp); + size_t __space = __get_allocation_size(__count_); + void* __ptr = __buf_; + return reinterpret_cast<_Tp*>(_CUDA_VSTD::align(__alignment, __count_ * sizeof(_Tp), __ptr, __space)); + } + +public: + using value_type = _Tp; + using reference = _Tp&; + using pointer = _Tp*; + using size_type = size_t; + + //! @brief Constructs a \c uninitialized_async_buffer, allocating sufficient storage for \p __count elements through + //! \p __mr + //! @param __mr The async memory resource to allocate the buffer with. + //! @param __stream The cuda stream used for stream ordered allocation. + //! @param __count The desired size of the buffer. + //! @note Depending on the alignment requirements of `T` the size of the underlying allocation might be larger + //! than `count * sizeof(T)`. Only allocates memory when \p __count > 0 + uninitialized_async_buffer(::cuda::experimental::mr::async_any_resource<_Properties...> __mr, + const ::cuda::stream_ref __stream, + const size_t __count) + : __mr_(_CUDA_VSTD::move(__mr)) + , __stream_(__stream) + , __count_(__count) + , __buf_(__count_ == 0 ? nullptr : __mr_.allocate_async(__get_allocation_size(__count_), __stream_)) + {} + + uninitialized_async_buffer(const uninitialized_async_buffer&) = delete; + uninitialized_async_buffer& operator=(const uninitialized_async_buffer&) = delete; + + //! @brief Move construction + //! @param __other Another \c uninitialized_async_buffer + uninitialized_async_buffer(uninitialized_async_buffer&& __other) noexcept + : __mr_(_CUDA_VSTD::move(__other.__mr_)) + , __stream_(_CUDA_VSTD::exchange(__other.__stream_, {})) + , __count_(_CUDA_VSTD::exchange(__other.__count_, 0)) + , __buf_(_CUDA_VSTD::exchange(__other.__buf_, nullptr)) + {} + + //! @brief Move assignment + //! @param __other Another \c uninitialized_async_buffer + uninitialized_async_buffer& operator=(uninitialized_async_buffer&& __other) noexcept + { + if (this == _CUDA_VSTD::addressof(__other)) + { + return *this; + } + + if (__buf_) + { + __mr_.deallocate_async(__buf_, __get_allocation_size(__count_), __stream_); + } + __mr_ = __other.__mr_; + __stream_ = _CUDA_VSTD::exchange(__other.__stream_, {}); + __count_ = _CUDA_VSTD::exchange(__other.__count_, 0); + __buf_ = _CUDA_VSTD::exchange(__other.__buf_, nullptr); + return *this; + } + //! @brief Destroys an \c uninitialized_async_buffer and deallocates the buffer stream ordered on the stream that was + //! used to create the buffer + //! @warning The destructor does not destroy any objects that may or may not reside within the buffer. It is the + //! user's responsibility to ensure that all objects within the buffer have been properly destroyed. + ~uninitialized_async_buffer() + { + if (__buf_) + { + __mr_.deallocate_async(__buf_, __get_allocation_size(__count_), __stream_); + } + } + + //! @brief Returns an aligned pointer to the buffer + _CCCL_NODISCARD constexpr pointer begin() const noexcept + { + return __get_data(); + } + + //! @brief Returns an aligned pointer to the element following the last element of the buffer. + //! This element acts as a placeholder; attempting to access it results in undefined behavior. + _CCCL_NODISCARD constexpr pointer end() const noexcept + { + return __get_data() + __count_; + } + + //! @brief Returns an aligned pointer to the buffer + _CCCL_NODISCARD constexpr pointer data() const noexcept + { + return __get_data(); + } + + //! @brief Returns the size of the buffer + _CCCL_NODISCARD constexpr size_t size() const noexcept + { + return __count_; + } + + //! @brief Returns a \c async_resource_ref of the resource used to allocate the buffer + _CCCL_NODISCARD _CUDA_VMR::resource_ref<_Properties...> resource() const noexcept + { + return _CUDA_VMR::resource_ref<_Properties...>{const_cast(this)->__mr_}; + } + + //! @brief Returns the stream used to allocate + _CCCL_NODISCARD constexpr ::cuda::stream_ref get_stream() const noexcept + { + return __stream_; + } + + //! @brief Swaps the contents with those of another \c uninitialized_async_buffer + //! @param __other The other \c uninitialized_async_buffer. + constexpr void swap(uninitialized_async_buffer& __other) noexcept + { + _CUDA_VSTD::swap(__mr_, __other.__mr_); + _CUDA_VSTD::swap(__count_, __other.__count_); + _CUDA_VSTD::swap(__buf_, __other.__buf_); + } + +# ifndef DOXYGEN_SHOULD_SKIP_THIS // friend functions are currently broken + //! @brief Forwards the passed properties + _LIBCUDACXX_TEMPLATE(class _Property) + _LIBCUDACXX_REQUIRES((!property_with_value<_Property>) _LIBCUDACXX_AND _CUDA_VSTD::_One_of<_Property, _Properties...>) + friend constexpr void get_property(const uninitialized_async_buffer&, _Property) noexcept {} +# endif // DOXYGEN_SHOULD_SKIP_THIS +}; + +template +using uninitialized_async_device_buffer = uninitialized_async_buffer<_Tp, _CUDA_VMR::device_accessible>; + +} // namespace cuda::experimental + +#endif // _CCCL_STD_VER >= 2014 && !_CCCL_COMPILER_MSVC_2017 && LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE + +#endif //__CUDAX__CONTAINERS_UNINITIALIZED_ASYNC_BUFFER_H diff --git a/cudax/include/cuda/experimental/__memory_resource/async_memory_resource.cuh b/cudax/include/cuda/experimental/__memory_resource/async_memory_resource.cuh index fb2326dfab2..551519549be 100644 --- a/cudax/include/cuda/experimental/__memory_resource/async_memory_resource.cuh +++ b/cudax/include/cuda/experimental/__memory_resource/async_memory_resource.cuh @@ -44,6 +44,9 @@ # if _CCCL_STD_VER >= 2014 +//! @file +//! The \c async_memory_pool class provides an asyncronous memory resource that allocates stream-ordered memory on +//! device namespace cuda::experimental::mr { diff --git a/cudax/include/cuda/experimental/buffer.cuh b/cudax/include/cuda/experimental/buffer.cuh index 93dc454d39c..061b884e29d 100644 --- a/cudax/include/cuda/experimental/buffer.cuh +++ b/cudax/include/cuda/experimental/buffer.cuh @@ -7,8 +7,8 @@ // //===----------------------------------------------------------------------===// -#ifndef __CUDAX_BUFFER -#define __CUDAX_BUFFER +#ifndef __CUDAX_BUFFER__ +#define __CUDAX_BUFFER__ #include @@ -20,6 +20,7 @@ # pragma system_header #endif // no system header +#include #include #endif // __CUDAX_BUFFER diff --git a/cudax/test/CMakeLists.txt b/cudax/test/CMakeLists.txt index d4ec714e07c..a50ab0b1ce4 100644 --- a/cudax/test/CMakeLists.txt +++ b/cudax/test/CMakeLists.txt @@ -88,6 +88,7 @@ foreach(cn_target IN LISTS cudax_TARGETS) cudax_add_catch2_test(test_target containers ${cn_target} containers/uninitialized_buffer.cu + containers/uninitialized_async_buffer.cu ) cudax_add_catch2_test(test_target memory_resource ${cn_target} diff --git a/cudax/test/containers/uninitialized_async_buffer.cu b/cudax/test/containers/uninitialized_async_buffer.cu new file mode 100644 index 00000000000..0fd51acc629 --- /dev/null +++ b/cudax/test/containers/uninitialized_async_buffer.cu @@ -0,0 +1,155 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +struct do_not_construct +{ + do_not_construct() + { + CHECK(false); + } +}; + +struct my_property +{ + using value_type = int; +}; +constexpr int get_property(const cuda::experimental::uninitialized_async_buffer&, my_property) +{ + return 42; +} + +TEMPLATE_TEST_CASE( + "uninitialized_async_buffer", "[container]", char, short, int, long, long long, float, double, do_not_construct) +{ + using uninitialized_async_buffer = cuda::experimental::uninitialized_async_buffer; + static_assert(!cuda::std::is_default_constructible::value, ""); + static_assert(!cuda::std::is_copy_constructible::value, ""); + static_assert(!cuda::std::is_copy_assignable::value, ""); + + cuda::experimental::mr::async_memory_resource resource{}; + cuda::experimental::stream stream{}; + + SECTION("construction") + { + { + uninitialized_async_buffer from_stream_count{resource, stream, 42}; + CHECK(from_stream_count.data() != nullptr); + CHECK(from_stream_count.size() == 42); + } + + { + uninitialized_async_buffer input{resource, stream, 42}; + const TestType* ptr = input.data(); + + uninitialized_async_buffer from_rvalue{cuda::std::move(input)}; + CHECK(from_rvalue.data() == ptr); + CHECK(from_rvalue.size() == 42); + CHECK(from_rvalue.get_stream() == stream); + + // Ensure that we properly reset the input buffer + CHECK(input.data() == nullptr); + CHECK(input.size() == 0); + CHECK(input.get_stream() == cuda::stream_ref{}); + } + } + + SECTION("assignment") + { + static_assert(!cuda::std::is_copy_assignable::value, ""); + + { + cuda::experimental::stream other_stream{}; + uninitialized_async_buffer input{resource, other_stream, 42}; + const TestType* ptr = input.data(); + + uninitialized_async_buffer assign_rvalue{resource, stream, 1337}; + assign_rvalue = cuda::std::move(input); + CHECK(assign_rvalue.data() == ptr); + CHECK(assign_rvalue.size() == 42); + CHECK(assign_rvalue.get_stream() == other_stream); + + // Ensure that we properly reset the input buffer + CHECK(input.data() == nullptr); + CHECK(input.size() == 0); + CHECK(input.get_stream() == cuda::stream_ref{}); + } + + { // Ensure self move assignment doesnt do anything + uninitialized_async_buffer buf{resource, stream, 42}; + const auto* old_ptr = buf.data(); + + buf = cuda::std::move(buf); + CHECK(buf.data() == old_ptr); + CHECK(buf.get_stream() == stream); + CHECK(buf.size() == 42); + } + } + + SECTION("access") + { + uninitialized_async_buffer buf{resource, stream, 42}; + CHECK(buf.data() != nullptr); + CHECK(buf.size() == 42); + CHECK(buf.begin() == buf.data()); + CHECK(buf.end() == buf.begin() + buf.size()); + CHECK(buf.get_stream() == stream); + + CHECK(cuda::std::as_const(buf).data() != nullptr); + CHECK(cuda::std::as_const(buf).size() == 42); + CHECK(cuda::std::as_const(buf).begin() == buf.data()); + CHECK(cuda::std::as_const(buf).end() == buf.begin() + buf.size()); + CHECK(cuda::std::as_const(buf).get_stream() == stream); + } + + SECTION("properties") + { + static_assert(cuda::has_property, + cuda::mr::device_accessible>, + ""); + static_assert(cuda::has_property, my_property>, + ""); + } + + SECTION("convertion to span") + { + uninitialized_async_buffer buf{resource, stream, 42}; + const cuda::std::span as_span{buf}; + CHECK(as_span.data() == buf.data()); + CHECK(as_span.size() == 42); + } + + SECTION("Actually use memory") + { + if constexpr (!cuda::std::is_same_v) + { + uninitialized_async_buffer buf{resource, stream, 42}; + stream.wait(); + thrust::fill(thrust::device, buf.begin(), buf.end(), TestType{2}); + const auto res = thrust::reduce(thrust::device, buf.begin(), buf.end(), TestType{0}, thrust::plus()); + CHECK(res == TestType{84}); + } + } +} diff --git a/docs/cudax/container.rst b/docs/cudax/container.rst index 66a2ec6ad8a..968b85f56d5 100644 --- a/docs/cudax/container.rst +++ b/docs/cudax/container.rst @@ -8,6 +8,7 @@ Containers library :maxdepth: 1 ${repo_docs_api_path}/class*uninitialized__buffer* + ${repo_docs_api_path}/class*uninitialized__async__buffer* The headers of the container library provide facilities to store elements on the heap. They are heavily inspired by the C++ `containers library `__ but deviate from the standard provided ones due to different requirements from @@ -21,5 +22,5 @@ annotations are checked by the type system. :header-rows: 0 * - :ref:` ` - - Facilities providing uninitialized *heterogeneous* storage satisfying a set of properties + - Facilities providing uninitialized *heterogeneous* potentially stream ordered storage satisfying a set of properties - cudax 2.7.0 / CCCL 2.7.0 From e92010393953978a02d446d9153e2ad851a05b48 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Wed, 11 Sep 2024 08:53:34 +0200 Subject: [PATCH 02/11] Apply documentation fixes Co-authored-by: Mark Harris <783069+harrism@users.noreply.github.com> --- .../__container/uninitialized_async_buffer.cuh | 10 +++++----- .../__memory_resource/async_memory_resource.cuh | 3 +-- cudax/test/containers/uninitialized_async_buffer.cu | 2 +- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/cudax/include/cuda/experimental/__container/uninitialized_async_buffer.cuh b/cudax/include/cuda/experimental/__container/uninitialized_async_buffer.cuh index cd7339feac3..387b8be8233 100644 --- a/cudax/include/cuda/experimental/__container/uninitialized_async_buffer.cuh +++ b/cudax/include/cuda/experimental/__container/uninitialized_async_buffer.cuh @@ -102,10 +102,10 @@ public: using pointer = _Tp*; using size_type = size_t; - //! @brief Constructs a \c uninitialized_async_buffer, allocating sufficient storage for \p __count elements through + //! @brief Constructs an \c uninitialized_async_buffer, allocating sufficient storage for \p __count elements using //! \p __mr //! @param __mr The async memory resource to allocate the buffer with. - //! @param __stream The cuda stream used for stream ordered allocation. + //! @param __stream The CUDA stream used for stream-ordered allocation. //! @param __count The desired size of the buffer. //! @note Depending on the alignment requirements of `T` the size of the underlying allocation might be larger //! than `count * sizeof(T)`. Only allocates memory when \p __count > 0 @@ -149,8 +149,8 @@ public: __buf_ = _CUDA_VSTD::exchange(__other.__buf_, nullptr); return *this; } - //! @brief Destroys an \c uninitialized_async_buffer and deallocates the buffer stream ordered on the stream that was - //! used to create the buffer + //! @brief Destroys an \c uninitialized_async_buffer and deallocates the buffer in stream order on the stream that was + //! used to create the buffer. //! @warning The destructor does not destroy any objects that may or may not reside within the buffer. It is the //! user's responsibility to ensure that all objects within the buffer have been properly destroyed. ~uninitialized_async_buffer() @@ -186,7 +186,7 @@ public: return __count_; } - //! @brief Returns a \c async_resource_ref of the resource used to allocate the buffer + //! @brief Returns an \c async_resource_ref of the resource used to allocate the buffer _CCCL_NODISCARD _CUDA_VMR::resource_ref<_Properties...> resource() const noexcept { return _CUDA_VMR::resource_ref<_Properties...>{const_cast(this)->__mr_}; diff --git a/cudax/include/cuda/experimental/__memory_resource/async_memory_resource.cuh b/cudax/include/cuda/experimental/__memory_resource/async_memory_resource.cuh index 551519549be..225be26e632 100644 --- a/cudax/include/cuda/experimental/__memory_resource/async_memory_resource.cuh +++ b/cudax/include/cuda/experimental/__memory_resource/async_memory_resource.cuh @@ -45,8 +45,7 @@ # if _CCCL_STD_VER >= 2014 //! @file -//! The \c async_memory_pool class provides an asyncronous memory resource that allocates stream-ordered memory on -//! device +//! The \c async_memory_pool class provides an asynchronous memory resource that allocates device memory in stream order. namespace cuda::experimental::mr { diff --git a/cudax/test/containers/uninitialized_async_buffer.cu b/cudax/test/containers/uninitialized_async_buffer.cu index 0fd51acc629..9f48de13991 100644 --- a/cudax/test/containers/uninitialized_async_buffer.cu +++ b/cudax/test/containers/uninitialized_async_buffer.cu @@ -133,7 +133,7 @@ TEMPLATE_TEST_CASE( ""); } - SECTION("convertion to span") + SECTION("conversion to span") { uninitialized_async_buffer buf{resource, stream, 42}; const cuda::std::span as_span{buf}; From a87771922435f60306493e776751a3b26e66285c Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Fri, 6 Sep 2024 10:12:35 +0200 Subject: [PATCH 03/11] Add a way of changing the passed in stream and integrate into cuda::launch --- .../uninitialized_async_buffer.cuh | 48 ++++++++++++++++--- .../__container/uninitialized_buffer.cuh | 18 +++---- 2 files changed, 49 insertions(+), 17 deletions(-) diff --git a/cudax/include/cuda/experimental/__container/uninitialized_async_buffer.cuh b/cudax/include/cuda/experimental/__container/uninitialized_async_buffer.cuh index 387b8be8233..ac8d747f1f6 100644 --- a/cudax/include/cuda/experimental/__container/uninitialized_async_buffer.cuh +++ b/cudax/include/cuda/experimental/__container/uninitialized_async_buffer.cuh @@ -24,11 +24,14 @@ #include #include #include +#include #include +#include #include +#include #include -#include +#include #if _CCCL_STD_VER >= 2014 && !defined(_CCCL_COMPILER_MSVC_2017) \ && defined(LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE) @@ -93,7 +96,28 @@ private: constexpr size_t __alignment = alignof(_Tp); size_t __space = __get_allocation_size(__count_); void* __ptr = __buf_; - return reinterpret_cast<_Tp*>(_CUDA_VSTD::align(__alignment, __count_ * sizeof(_Tp), __ptr, __space)); + return _CUDA_VSTD::launder( + reinterpret_cast<_Tp*>(_CUDA_VSTD::align(__alignment, __count_ * sizeof(_Tp), __ptr, __space))); + } + + //! @brief Causes the buffer to be treated as a span when passed to cudax::launch. + //! @pre The buffer must have the cuda::mr::device_accessible property. + _CCCL_NODISCARD_FRIEND _CUDA_VSTD::span<_Tp> + __cudax_launch_transform(::cuda::stream_ref, uninitialized_async_buffer& __self) noexcept + { + static_assert(_CUDA_VSTD::_One_of<_CUDA_VMR::device_accessible, _Properties...>, + "The buffer must be device accessible to be passed to `launch`"); + return {__self.__get_data(), __self.size()}; + } + + //! @brief Causes the buffer to be treated as a span when passed to cudax::launch + //! @pre The buffer must have the cuda::mr::device_accessible property. + _CCCL_NODISCARD_FRIEND _CUDA_VSTD::span + __cudax_launch_transform(::cuda::stream_ref, const uninitialized_async_buffer& __self) noexcept + { + static_assert(_CUDA_VSTD::_One_of<_CUDA_VMR::device_accessible, _Properties...>, + "The buffer must be device accessible to be passed to `launch`"); + return {__self.__get_data(), __self.size()}; } public: @@ -186,18 +210,30 @@ public: return __count_; } - //! @brief Returns an \c async_resource_ref of the resource used to allocate the buffer - _CCCL_NODISCARD _CUDA_VMR::resource_ref<_Properties...> resource() const noexcept + //! @rst + //! Returns an :ref:`asnyc_resource_ref ` to the resource used + //! to allocate the buffer + //! @endrst + _CCCL_NODISCARD _CUDA_VMR::async_resource_ref<_Properties...> resource() const noexcept { - return _CUDA_VMR::resource_ref<_Properties...>{const_cast(this)->__mr_}; + return _CUDA_VMR::async_resource_ref<_Properties...>{const_cast(this)->__mr_}; } - //! @brief Returns the stream used to allocate + //! @brief Returns the stored stream _CCCL_NODISCARD constexpr ::cuda::stream_ref get_stream() const noexcept { return __stream_; } + //! @brief Replaces the stored stream + //! @param __stream the new stream + //! @note Synchronizes with the old stream + _CCCL_NODISCARD constexpr void get_stream(::cuda::stream_ref __stream) + { + __stream_.wait(); + __stream_ = __stream; + } + //! @brief Swaps the contents with those of another \c uninitialized_async_buffer //! @param __other The other \c uninitialized_async_buffer. constexpr void swap(uninitialized_async_buffer& __other) noexcept diff --git a/cudax/include/cuda/experimental/__container/uninitialized_buffer.cuh b/cudax/include/cuda/experimental/__container/uninitialized_buffer.cuh index 817b9782af9..97bdaa2af9f 100644 --- a/cudax/include/cuda/experimental/__container/uninitialized_buffer.cuh +++ b/cudax/include/cuda/experimental/__container/uninitialized_buffer.cuh @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -128,12 +129,9 @@ public: //! @param __other Another \c uninitialized_buffer uninitialized_buffer(uninitialized_buffer&& __other) noexcept : __mr_(_CUDA_VSTD::move(__other.__mr_)) - , __count_(__other.__count_) - , __buf_(__other.__buf_) - { - __other.__count_ = 0; - __other.__buf_ = nullptr; - } + , __count_(_CUDA_VSTD::exchange(__other.__count_, 0)) + , __buf_(_CUDA_VSTD::exchange(__other.__buf_, nullptr)) + {} //! @brief Move assignment //! @param __other Another \c uninitialized_buffer @@ -148,11 +146,9 @@ public: { __mr_.deallocate(__buf_, __get_allocation_size(__count_)); } - __mr_ = _CUDA_VSTD::move(__other.__mr_); - __count_ = __other.__count_; - __buf_ = __other.__buf_; - __other.__count_ = 0; - __other.__buf_ = nullptr; + __mr_ = _CUDA_VSTD::move(__other.__mr_); + __count_ = _CUDA_VSTD::exchange(__other.__count_, 0); + __buf_ = _CUDA_VSTD::exchange(__other.__buf_, nullptr); return *this; } From 315a86a0f5694d36ed70252ee7e01c7a3d243e56 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Mon, 16 Sep 2024 14:00:12 +0200 Subject: [PATCH 04/11] Drop warning about lifetimes --- .../experimental/__container/uninitialized_async_buffer.cuh | 6 ------ 1 file changed, 6 deletions(-) diff --git a/cudax/include/cuda/experimental/__container/uninitialized_async_buffer.cuh b/cudax/include/cuda/experimental/__container/uninitialized_async_buffer.cuh index ac8d747f1f6..11efb19e212 100644 --- a/cudax/include/cuda/experimental/__container/uninitialized_async_buffer.cuh +++ b/cudax/include/cuda/experimental/__container/uninitialized_async_buffer.cuh @@ -60,12 +60,6 @@ namespace cuda::experimental //! //! .. warning:: //! -//! ``uninitialized_async_buffer`` stores a reference to the provided memory `memory resource -//! `. It is the user's resposibility to ensure the lifetime of -//! the resource exceeds the lifetime of the buffer. -//! -//! .. warning:: -//! //! ``uninitialized_async_buffer`` uses `stream-ordered allocation //! `__. It is the user's //! resposibility to ensure the lifetime of both the provided async resource and the stream exceed the lifetime of From 005881aaf5ca9aa7b1c5da623cd0e7ac1c95660e Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Mon, 16 Sep 2024 14:02:22 +0200 Subject: [PATCH 05/11] Ensure we only synchronize if the streams differ in `set_stream` --- .../__container/uninitialized_async_buffer.cuh | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/cudax/include/cuda/experimental/__container/uninitialized_async_buffer.cuh b/cudax/include/cuda/experimental/__container/uninitialized_async_buffer.cuh index 11efb19e212..a153e591fb5 100644 --- a/cudax/include/cuda/experimental/__container/uninitialized_async_buffer.cuh +++ b/cudax/include/cuda/experimental/__container/uninitialized_async_buffer.cuh @@ -221,11 +221,14 @@ public: //! @brief Replaces the stored stream //! @param __stream the new stream - //! @note Synchronizes with the old stream - _CCCL_NODISCARD constexpr void get_stream(::cuda::stream_ref __stream) + //! @note Always synchronizes with the old stream + constexpr void set_stream(::cuda::stream_ref __stream) { - __stream_.wait(); - __stream_ = __stream; + if (__stream != __stream_) + { + __stream_.wait(); + __stream_ = __stream; + } } //! @brief Swaps the contents with those of another \c uninitialized_async_buffer From 036ab7fae7177b90d0526651c901027fa1cfde81 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Tue, 17 Sep 2024 09:18:52 +0200 Subject: [PATCH 06/11] Rename function to `change_stream` and adopt review comments --- .../__container/uninitialized_async_buffer.cuh | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/cudax/include/cuda/experimental/__container/uninitialized_async_buffer.cuh b/cudax/include/cuda/experimental/__container/uninitialized_async_buffer.cuh index a153e591fb5..6d51222a292 100644 --- a/cudax/include/cuda/experimental/__container/uninitialized_async_buffer.cuh +++ b/cudax/include/cuda/experimental/__container/uninitialized_async_buffer.cuh @@ -220,15 +220,12 @@ public: } //! @brief Replaces the stored stream - //! @param __stream the new stream + //! @param __new_stream the new stream //! @note Always synchronizes with the old stream - constexpr void set_stream(::cuda::stream_ref __stream) + constexpr void change_stream(::cuda::stream_ref __new_stream) { - if (__stream != __stream_) - { - __stream_.wait(); - __stream_ = __stream; - } + __stream_.wait(__new_stream); + __stream_ = __new_stream; } //! @brief Swaps the contents with those of another \c uninitialized_async_buffer From 8779ce6b22662b361f781d4857febb1bb8b3004c Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Tue, 17 Sep 2024 09:21:19 +0200 Subject: [PATCH 07/11] Move towards `get_resource` --- .../uninitialized_async_buffer.cuh | 2 +- .../__container/uninitialized_buffer.cuh | 2 +- .../containers/uninitialized_async_buffer.cu | 66 ++++++++++--------- cudax/test/containers/uninitialized_buffer.cu | 6 +- 4 files changed, 39 insertions(+), 37 deletions(-) diff --git a/cudax/include/cuda/experimental/__container/uninitialized_async_buffer.cuh b/cudax/include/cuda/experimental/__container/uninitialized_async_buffer.cuh index 6d51222a292..0048342c6e3 100644 --- a/cudax/include/cuda/experimental/__container/uninitialized_async_buffer.cuh +++ b/cudax/include/cuda/experimental/__container/uninitialized_async_buffer.cuh @@ -208,7 +208,7 @@ public: //! Returns an :ref:`asnyc_resource_ref ` to the resource used //! to allocate the buffer //! @endrst - _CCCL_NODISCARD _CUDA_VMR::async_resource_ref<_Properties...> resource() const noexcept + _CCCL_NODISCARD _CUDA_VMR::async_resource_ref<_Properties...> get_resource() const noexcept { return _CUDA_VMR::async_resource_ref<_Properties...>{const_cast(this)->__mr_}; } diff --git a/cudax/include/cuda/experimental/__container/uninitialized_buffer.cuh b/cudax/include/cuda/experimental/__container/uninitialized_buffer.cuh index 97bdaa2af9f..9c88df1d95c 100644 --- a/cudax/include/cuda/experimental/__container/uninitialized_buffer.cuh +++ b/cudax/include/cuda/experimental/__container/uninitialized_buffer.cuh @@ -192,7 +192,7 @@ public: //! allocate the buffer //! @endrst _CCCL_EXEC_CHECK_DISABLE - _CCCL_NODISCARD _CCCL_HOST_DEVICE _CUDA_VMR::resource_ref<_Properties...> resource() const noexcept + _CCCL_NODISCARD _CCCL_HOST_DEVICE _CUDA_VMR::resource_ref<_Properties...> get_resource() const noexcept { return _CUDA_VMR::resource_ref<_Properties...>{const_cast(this)->__mr_}; } diff --git a/cudax/test/containers/uninitialized_async_buffer.cu b/cudax/test/containers/uninitialized_async_buffer.cu index 9f48de13991..0de390ae184 100644 --- a/cudax/test/containers/uninitialized_async_buffer.cu +++ b/cudax/test/containers/uninitialized_async_buffer.cu @@ -28,7 +28,7 @@ struct do_not_construct { do_not_construct() { - CHECK(false); + CUDAX_CHECK(false); } }; @@ -56,8 +56,8 @@ TEMPLATE_TEST_CASE( { { uninitialized_async_buffer from_stream_count{resource, stream, 42}; - CHECK(from_stream_count.data() != nullptr); - CHECK(from_stream_count.size() == 42); + CUDAX_CHECK(from_stream_count.data() != nullptr); + CUDAX_CHECK(from_stream_count.size() == 42); } { @@ -65,14 +65,14 @@ TEMPLATE_TEST_CASE( const TestType* ptr = input.data(); uninitialized_async_buffer from_rvalue{cuda::std::move(input)}; - CHECK(from_rvalue.data() == ptr); - CHECK(from_rvalue.size() == 42); - CHECK(from_rvalue.get_stream() == stream); + CUDAX_CHECK(from_rvalue.data() == ptr); + CUDAX_CHECK(from_rvalue.size() == 42); + CUDAX_CHECK(from_rvalue.get_stream() == stream); // Ensure that we properly reset the input buffer - CHECK(input.data() == nullptr); - CHECK(input.size() == 0); - CHECK(input.get_stream() == cuda::stream_ref{}); + CUDAX_CHECK(input.data() == nullptr); + CUDAX_CHECK(input.size() == 0); + CUDAX_CHECK(input.get_stream() == cuda::stream_ref{}); } } @@ -87,14 +87,14 @@ TEMPLATE_TEST_CASE( uninitialized_async_buffer assign_rvalue{resource, stream, 1337}; assign_rvalue = cuda::std::move(input); - CHECK(assign_rvalue.data() == ptr); - CHECK(assign_rvalue.size() == 42); - CHECK(assign_rvalue.get_stream() == other_stream); + CUDAX_CHECK(assign_rvalue.data() == ptr); + CUDAX_CHECK(assign_rvalue.size() == 42); + CUDAX_CHECK(assign_rvalue.get_stream() == other_stream); // Ensure that we properly reset the input buffer - CHECK(input.data() == nullptr); - CHECK(input.size() == 0); - CHECK(input.get_stream() == cuda::stream_ref{}); + CUDAX_CHECK(input.data() == nullptr); + CUDAX_CHECK(input.size() == 0); + CUDAX_CHECK(input.get_stream() == cuda::stream_ref{}); } { // Ensure self move assignment doesnt do anything @@ -102,26 +102,28 @@ TEMPLATE_TEST_CASE( const auto* old_ptr = buf.data(); buf = cuda::std::move(buf); - CHECK(buf.data() == old_ptr); - CHECK(buf.get_stream() == stream); - CHECK(buf.size() == 42); + CUDAX_CHECK(buf.data() == old_ptr); + CUDAX_CHECK(buf.get_stream() == stream); + CUDAX_CHECK(buf.size() == 42); } } SECTION("access") { uninitialized_async_buffer buf{resource, stream, 42}; - CHECK(buf.data() != nullptr); - CHECK(buf.size() == 42); - CHECK(buf.begin() == buf.data()); - CHECK(buf.end() == buf.begin() + buf.size()); - CHECK(buf.get_stream() == stream); - - CHECK(cuda::std::as_const(buf).data() != nullptr); - CHECK(cuda::std::as_const(buf).size() == 42); - CHECK(cuda::std::as_const(buf).begin() == buf.data()); - CHECK(cuda::std::as_const(buf).end() == buf.begin() + buf.size()); - CHECK(cuda::std::as_const(buf).get_stream() == stream); + CUDAX_CHECK(buf.data() != nullptr); + CUDAX_CHECK(buf.size() == 42); + CUDAX_CHECK(buf.begin() == buf.data()); + CUDAX_CHECK(buf.end() == buf.begin() + buf.size()); + CUDAX_CHECK(buf.get_stream() == stream); + CUDAX_CHECK(buf.get_resource() == resource); + + CUDAX_CHECK(cuda::std::as_const(buf).data() != nullptr); + CUDAX_CHECK(cuda::std::as_const(buf).size() == 42); + CUDAX_CHECK(cuda::std::as_const(buf).begin() == buf.data()); + CUDAX_CHECK(cuda::std::as_const(buf).end() == buf.begin() + buf.size()); + CUDAX_CHECK(cuda::std::as_const(buf).get_stream() == stream); + CUDAX_CHECK(cuda::std::as_const(buf).get_resource() == resource); } SECTION("properties") @@ -137,8 +139,8 @@ TEMPLATE_TEST_CASE( { uninitialized_async_buffer buf{resource, stream, 42}; const cuda::std::span as_span{buf}; - CHECK(as_span.data() == buf.data()); - CHECK(as_span.size() == 42); + CUDAX_CHECK(as_span.data() == buf.data()); + CUDAX_CHECK(as_span.size() == 42); } SECTION("Actually use memory") @@ -149,7 +151,7 @@ TEMPLATE_TEST_CASE( stream.wait(); thrust::fill(thrust::device, buf.begin(), buf.end(), TestType{2}); const auto res = thrust::reduce(thrust::device, buf.begin(), buf.end(), TestType{0}, thrust::plus()); - CHECK(res == TestType{84}); + CUDAX_CHECK(res == TestType{84}); } } } diff --git a/cudax/test/containers/uninitialized_buffer.cu b/cudax/test/containers/uninitialized_buffer.cu index 0386588fea8..a206f978edd 100644 --- a/cudax/test/containers/uninitialized_buffer.cu +++ b/cudax/test/containers/uninitialized_buffer.cu @@ -101,7 +101,7 @@ TEMPLATE_TEST_CASE( CUDAX_CHECK(buf.data() != old_ptr); CUDAX_CHECK(buf.data() == old_input_ptr); CUDAX_CHECK(buf.size() == 42); - CUDAX_CHECK(buf.resource() == other_resource); + CUDAX_CHECK(buf.get_resource() == other_resource); CUDAX_CHECK(input.data() == nullptr); CUDAX_CHECK(input.size() == 0); @@ -124,13 +124,13 @@ TEMPLATE_TEST_CASE( CUDAX_CHECK(buf.size() == 42); CUDAX_CHECK(buf.begin() == buf.data()); CUDAX_CHECK(buf.end() == buf.begin() + buf.size()); - CUDAX_CHECK(buf.resource() == resource); + CUDAX_CHECK(buf.get_resource() == resource); CUDAX_CHECK(cuda::std::as_const(buf).data() != nullptr); CUDAX_CHECK(cuda::std::as_const(buf).size() == 42); CUDAX_CHECK(cuda::std::as_const(buf).begin() == buf.data()); CUDAX_CHECK(cuda::std::as_const(buf).end() == buf.begin() + buf.size()); - CUDAX_CHECK(cuda::std::as_const(buf).resource() == resource); + CUDAX_CHECK(cuda::std::as_const(buf).get_resource() == resource); } SECTION("properties") From 8cdb435be3a8e67b2a3902bb2a2a805809ad81f8 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Tue, 17 Sep 2024 10:42:23 +0200 Subject: [PATCH 08/11] There is no `wait(stream)` method on stream_Ref --- .../experimental/__container/uninitialized_async_buffer.cuh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cudax/include/cuda/experimental/__container/uninitialized_async_buffer.cuh b/cudax/include/cuda/experimental/__container/uninitialized_async_buffer.cuh index 0048342c6e3..eea30c1b69e 100644 --- a/cudax/include/cuda/experimental/__container/uninitialized_async_buffer.cuh +++ b/cudax/include/cuda/experimental/__container/uninitialized_async_buffer.cuh @@ -224,7 +224,10 @@ public: //! @note Always synchronizes with the old stream constexpr void change_stream(::cuda::stream_ref __new_stream) { - __stream_.wait(__new_stream); + if (__new_stream != __stream_) + { + __stream_.wait(); + } __stream_ = __new_stream; } From 03bae7f48c76cf5d122451bd9627401559d163ca Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Tue, 17 Sep 2024 10:43:27 +0200 Subject: [PATCH 09/11] Fix invalid check --- cudax/test/containers/uninitialized_async_buffer.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cudax/test/containers/uninitialized_async_buffer.cu b/cudax/test/containers/uninitialized_async_buffer.cu index 0de390ae184..a6157a1b083 100644 --- a/cudax/test/containers/uninitialized_async_buffer.cu +++ b/cudax/test/containers/uninitialized_async_buffer.cu @@ -28,7 +28,7 @@ struct do_not_construct { do_not_construct() { - CUDAX_CHECK(false); + CHECK(false); } }; From 19e7392484b1e0ec5c5b2d24ea14da5cc34b19c2 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Tue, 17 Sep 2024 13:56:48 +0200 Subject: [PATCH 10/11] Add missing includes --- cudax/test/containers/uninitialized_async_buffer.cu | 1 + cudax/test/containers/uninitialized_buffer.cu | 1 + 2 files changed, 2 insertions(+) diff --git a/cudax/test/containers/uninitialized_async_buffer.cu b/cudax/test/containers/uninitialized_async_buffer.cu index a6157a1b083..b32d3228274 100644 --- a/cudax/test/containers/uninitialized_async_buffer.cu +++ b/cudax/test/containers/uninitialized_async_buffer.cu @@ -22,6 +22,7 @@ #include #include +#include "testing.cuh" #include struct do_not_construct diff --git a/cudax/test/containers/uninitialized_buffer.cu b/cudax/test/containers/uninitialized_buffer.cu index a206f978edd..c924750a8aa 100644 --- a/cudax/test/containers/uninitialized_buffer.cu +++ b/cudax/test/containers/uninitialized_buffer.cu @@ -23,6 +23,7 @@ #include #include "testing.cuh" +#include struct do_not_construct { From 6e6582d140ae026fa4bf0b143e0d22a53b3660fa Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Tue, 17 Sep 2024 14:29:17 +0200 Subject: [PATCH 11/11] Fix formatting --- .../experimental/__memory_resource/async_memory_resource.cuh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cudax/include/cuda/experimental/__memory_resource/async_memory_resource.cuh b/cudax/include/cuda/experimental/__memory_resource/async_memory_resource.cuh index 225be26e632..b9fd038dd98 100644 --- a/cudax/include/cuda/experimental/__memory_resource/async_memory_resource.cuh +++ b/cudax/include/cuda/experimental/__memory_resource/async_memory_resource.cuh @@ -45,7 +45,8 @@ # if _CCCL_STD_VER >= 2014 //! @file -//! The \c async_memory_pool class provides an asynchronous memory resource that allocates device memory in stream order. +//! The \c async_memory_pool class provides an asynchronous memory resource that allocates device memory in stream +//! order. namespace cuda::experimental::mr {