From 8c20e14110f1c4079a0922ca90c96597ae20350b Mon Sep 17 00:00:00 2001
From: Bradley Dice
Date: Wed, 24 Jul 2024 19:44:44 -0500
Subject: [PATCH 1/2] Remove prefetch factory. (#1625)

PR #1608 added a prefetch resource adaptor. However, per issue #1616, we want
to remove the adaptor factories like `make_prefetch_adaptor` in favor of
constructors with CTAD. I am removing the prefetch adaptor factory because it
has not yet been released, and thus can be deleted without deprecation.

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Rong Ou (https://github.com/rongou)
  - Mark Harris (https://github.com/harrism)

URL: https://github.com/rapidsai/rmm/pull/1625
---
 .../rmm/mr/device/prefetch_resource_adaptor.hpp | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/include/rmm/mr/device/prefetch_resource_adaptor.hpp b/include/rmm/mr/device/prefetch_resource_adaptor.hpp
index da0ca8c86..48a64a5b9 100644
--- a/include/rmm/mr/device/prefetch_resource_adaptor.hpp
+++ b/include/rmm/mr/device/prefetch_resource_adaptor.hpp
@@ -125,19 +125,5 @@ class prefetch_resource_adaptor final : public device_memory_resource {
   Upstream* upstream_;  // the upstream resource used for satisfying allocation requests
 };
 
-/**
- * @brief Convenience factory to return a `prefetch_resource_adaptor` around the
- * upstream resource `upstream`.
- *
- * @tparam Upstream Type of the upstream `device_memory_resource`.
- * @param upstream Pointer to the upstream resource
- * @return The new prefetch resource adaptor
- */
-template <typename Upstream>
-prefetch_resource_adaptor<Upstream> make_prefetch_adaptor(Upstream* upstream)
-{
-  return prefetch_resource_adaptor<Upstream>{upstream};
-}
-
 /** @} */  // end of group
 }  // namespace rmm::mr

From 67a78d6370ed34a0ffcb3aa9c77299e2a7ea1ee5 Mon Sep 17 00:00:00 2001
From: Rong Ou
Date: Wed, 24 Jul 2024 18:37:21 -0700
Subject: [PATCH 2/2] Add python wrapper for system memory resource (#1605)

Follow up on #1581 to add access to the system memory resource in python.

Fixes #1622

Authors:
  - Rong Ou (https://github.com/rongou)

Approvers:
  - Mark Harris (https://github.com/harrism)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/rmm/pull/1605
---
 ...r.hpp => sam_headroom_memory_resource.hpp} | 61 ++++++-------------
 python/rmm/rmm/_lib/memory_resource.pxd       |  6 ++
 python/rmm/rmm/_lib/memory_resource.pyx       | 47 ++++++++++++++
 python/rmm/rmm/mr.py                          |  4 ++
 python/rmm/rmm/tests/test_rmm.py              | 58 +++++++++++++++++-
 tests/mr/device/system_mr_tests.cu            | 39 ++++--------
 6 files changed, 144 insertions(+), 71 deletions(-)
 rename include/rmm/mr/device/{sam_headroom_resource_adaptor.hpp => sam_headroom_memory_resource.hpp} (62%)

diff --git a/include/rmm/mr/device/sam_headroom_resource_adaptor.hpp b/include/rmm/mr/device/sam_headroom_memory_resource.hpp
similarity index 62%
rename from include/rmm/mr/device/sam_headroom_resource_adaptor.hpp
rename to include/rmm/mr/device/sam_headroom_memory_resource.hpp
index fc913f290..cdf14181f 100644
--- a/include/rmm/mr/device/sam_headroom_resource_adaptor.hpp
+++ b/include/rmm/mr/device/sam_headroom_memory_resource.hpp
@@ -23,12 +23,12 @@ namespace rmm::mr {
 /**
- * @addtogroup device_resource_adaptors
+ * @addtogroup device_memory_resources
  * @{
  * @file
  */
 
 /**
- * @brief Resource that adapts system memory resource to allocate memory with a headroom.
+ * @brief Resource that uses system memory resource to allocate memory with a headroom.
  *
  * System allocated memory (SAM) can be migrated to the GPU, but is never migrated back to the host. If
  * GPU memory is over-subscribed, this can cause other CUDA calls to fail with out-of-memory errors.
  *
@@ -39,46 +39,22 @@ namespace rmm::mr {
  * Since doing this check on every allocation can be expensive, the caller may choose to use other
  * allocators (e.g. `binning_memory_resource`) for small allocations, and use this allocator for
  * large allocations only.
- *
- * @tparam Upstream Type of the upstream resource used for allocation/deallocation. Must be
- * `system_memory_resource`.
  */
-template <typename Upstream>
-class sam_headroom_resource_adaptor final : public device_memory_resource {
+class sam_headroom_memory_resource final : public device_memory_resource {
 public:
  /**
-  * @brief Construct a headroom adaptor using `upstream` to satisfy allocation requests.
+  * @brief Construct a headroom memory resource.
   *
-  * @param upstream The resource used for allocating/deallocating device memory. Must be
-  * `system_memory_resource`.
   * @param headroom Size of the reserved GPU memory as headroom
   */
- explicit sam_headroom_resource_adaptor(Upstream* upstream, std::size_t headroom)
-   : upstream_{upstream}, headroom_{headroom}
- {
-   static_assert(std::is_same_v<Upstream, system_memory_resource>,
-                 "Upstream must be rmm::mr::system_memory_resource");
- }
+ explicit sam_headroom_memory_resource(std::size_t headroom) : system_mr_{}, headroom_{headroom} {}
 
- sam_headroom_resource_adaptor()                                                = delete;
- ~sam_headroom_resource_adaptor() override                                      = default;
- sam_headroom_resource_adaptor(sam_headroom_resource_adaptor const&)            = delete;
- sam_headroom_resource_adaptor(sam_headroom_resource_adaptor&&)                 = delete;
- sam_headroom_resource_adaptor& operator=(sam_headroom_resource_adaptor const&) = delete;
- sam_headroom_resource_adaptor& operator=(sam_headroom_resource_adaptor&&)      = delete;
-
- /**
-  * @briefreturn{rmm::device_async_resource_ref to the upstream resource}
-  */
- [[nodiscard]] rmm::device_async_resource_ref get_upstream_resource() const noexcept
- {
-   return upstream_;
- }
-
- /**
-  * @briefreturn{Upstream* to the upstream memory resource}
-  */
- [[nodiscard]] Upstream* get_upstream() const noexcept { return upstream_; }
+ sam_headroom_memory_resource()                                               = delete;
+ ~sam_headroom_memory_resource() override                                     = default;
+ sam_headroom_memory_resource(sam_headroom_memory_resource const&)            = delete;
+ sam_headroom_memory_resource(sam_headroom_memory_resource&&)                 = delete;
+ sam_headroom_memory_resource& operator=(sam_headroom_memory_resource const&) = delete;
+ sam_headroom_memory_resource& operator=(sam_headroom_memory_resource&&)      = delete;
 
 private:
  /**
@@ -94,8 +70,7 @@ class sam_headroom_resource_adaptor final : public device_memory_resource {
   */
  void* do_allocate(std::size_t bytes, [[maybe_unused]] cuda_stream_view stream) override
  {
-   void* pointer =
-     get_upstream_resource().allocate_async(bytes, rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
+   void* pointer = system_mr_.allocate_async(bytes, rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
 
    auto const free        = rmm::available_device_memory().first;
    auto const allocatable = free > headroom_ ? free - headroom_ : 0UL;
@@ -131,7 +106,7 @@ class sam_headroom_resource_adaptor final : public device_memory_resource {
                     [[maybe_unused]] std::size_t bytes,
                     [[maybe_unused]] cuda_stream_view stream) override
  {
-   get_upstream_resource().deallocate_async(ptr, rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
+   system_mr_.deallocate_async(ptr, rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
  }
 
  /**
@@ -144,13 +119,15 @@ class sam_headroom_resource_adaptor final : public device_memory_resource {
  [[nodiscard]] bool do_is_equal(device_memory_resource const& other) const noexcept override
  {
    if (this == &other) { return true; }
-   auto cast = dynamic_cast<sam_headroom_resource_adaptor const*>(&other);
+   auto cast = dynamic_cast<sam_headroom_memory_resource const*>(&other);
    if (cast == nullptr) { return false; }
-   return get_upstream_resource() == cast->get_upstream_resource() && headroom_ == cast->headroom_;
+   return headroom_ == cast->headroom_;
  }
 
- Upstream* upstream_;    ///< The upstream resource used for satisfying allocation requests
- std::size_t headroom_;  ///< Size of GPU memory reserved as headroom
+ ///< The system memory resource used for satisfying allocation requests
+ system_memory_resource system_mr_;
+ ///< Size of GPU memory reserved as headroom
+ std::size_t headroom_;
 };
 /** @} */  // end of group
 }  // namespace rmm::mr
diff --git a/python/rmm/rmm/_lib/memory_resource.pxd b/python/rmm/rmm/_lib/memory_resource.pxd
index 8be2ba158..000a3fe1e 100644
--- a/python/rmm/rmm/_lib/memory_resource.pxd
+++ b/python/rmm/rmm/_lib/memory_resource.pxd
@@ -52,6 +52,12 @@ cdef class CudaMemoryResource(DeviceMemoryResource):
     pass
 
 cdef class ManagedMemoryResource(DeviceMemoryResource):
     pass
 
+cdef class SystemMemoryResource(DeviceMemoryResource):
+    pass
+
+cdef class SamHeadroomMemoryResource(DeviceMemoryResource):
+    pass
+
 cdef class CudaAsyncMemoryResource(DeviceMemoryResource):
     pass
diff --git a/python/rmm/rmm/_lib/memory_resource.pyx b/python/rmm/rmm/_lib/memory_resource.pyx
index bf927c336..5030c5d2d 100644
--- a/python/rmm/rmm/_lib/memory_resource.pyx
+++ b/python/rmm/rmm/_lib/memory_resource.pyx
@@ -94,6 +94,16 @@ cdef extern from "rmm/mr/device/managed_memory_resource.hpp" \
         namespace "rmm::mr" nogil:
     cdef cppclass managed_memory_resource(device_memory_resource):
         managed_memory_resource() except +
 
+cdef extern from "rmm/mr/device/system_memory_resource.hpp" \
+        namespace "rmm::mr" nogil:
+    cdef cppclass system_memory_resource(device_memory_resource):
+        system_memory_resource() except +
+
+cdef extern from "rmm/mr/device/sam_headroom_memory_resource.hpp" \
+        namespace "rmm::mr" nogil:
+    cdef cppclass sam_headroom_memory_resource(device_memory_resource):
+        sam_headroom_memory_resource(size_t headroom) except +
+
 cdef extern from "rmm/mr/device/cuda_async_memory_resource.hpp" \
         namespace "rmm::mr" nogil:
@@ -366,6 +376,43 @@ cdef class ManagedMemoryResource(DeviceMemoryResource):
         pass
 
 
+cdef class SystemMemoryResource(DeviceMemoryResource):
+    def __cinit__(self):
+        self.c_obj.reset(
+            new system_memory_resource()
+        )
+
+    def __init__(self):
+        """
+        Memory resource that uses ``malloc``/``free`` for
+        allocation/deallocation.
+        """
+        pass
+
+
+cdef class SamHeadroomMemoryResource(DeviceMemoryResource):
+    def __cinit__(
+        self,
+        size_t headroom
+    ):
+        self.c_obj.reset(new sam_headroom_memory_resource(headroom))
+
+    def __init__(
+        self,
+        size_t headroom
+    ):
+        """
+        Memory resource that uses ``malloc``/``free`` for
+        allocation/deallocation.
+
+        Parameters
+        ----------
+        headroom : size_t
+            Size of the reserved GPU memory as headroom
+        """
+        pass
+
+
 cdef class PoolMemoryResource(UpstreamResourceAdaptor):
 
     def __cinit__(
diff --git a/python/rmm/rmm/mr.py b/python/rmm/rmm/mr.py
index 4ca6805c8..6eb94da0f 100644
--- a/python/rmm/rmm/mr.py
+++ b/python/rmm/rmm/mr.py
@@ -24,7 +24,9 @@
     ManagedMemoryResource,
     PoolMemoryResource,
     PrefetchResourceAdaptor,
+    SamHeadroomMemoryResource,
     StatisticsResourceAdaptor,
+    SystemMemoryResource,
     TrackingResourceAdaptor,
     UpstreamResourceAdaptor,
     _flush_logs,
@@ -54,7 +56,9 @@
     "ManagedMemoryResource",
     "PoolMemoryResource",
     "PrefetchResourceAdaptor",
+    "SamHeadroomMemoryResource",
     "StatisticsResourceAdaptor",
+    "SystemMemoryResource",
     "TrackingResourceAdaptor",
     "FailureCallbackResourceAdaptor",
     "UpstreamResourceAdaptor",
diff --git a/python/rmm/rmm/tests/test_rmm.py b/python/rmm/rmm/tests/test_rmm.py
index ff8ed9aad..c4fd90c45 100644
--- a/python/rmm/rmm/tests/test_rmm.py
+++ b/python/rmm/rmm/tests/test_rmm.py
@@ -38,6 +38,11 @@
     _runtime_version >= 11020
 )
 
+_SYSTEM_MEMORY_SUPPORTED = rmm._cuda.gpu.getDeviceAttribute(
+    cudart.cudaDeviceAttr.cudaDevAttrPageableMemoryAccess,
+    rmm._cuda.gpu.getDevice(),
+)
+
 
 def array_tester(dtype, nelem, alloc):
     # data
@@ -91,6 +96,39 @@ def test_rmm_modes(dtype, nelem, alloc, managed, pool):
     array_tester(dtype, nelem, alloc)
 
 
+@pytest.mark.skipif(
+    not _SYSTEM_MEMORY_SUPPORTED,
+    reason="System memory not supported",
+)
+@pytest.mark.parametrize("dtype", _dtypes)
+@pytest.mark.parametrize("nelem", _nelems)
+@pytest.mark.parametrize("alloc", _allocs)
+@pytest.mark.parametrize(
+    "system, pool, headroom",
+    list(product([False, True], [False, True], [False, True])),
+)
+def test_rmm_modes_system_memory(dtype, nelem, alloc, system, pool, headroom):
+    assert rmm.is_initialized()
+    array_tester(dtype, nelem, alloc)
+
+    if system:
+        if headroom:
+            base_mr = rmm.mr.SamHeadroomMemoryResource(headroom=1 << 20)
+        else:
+            base_mr = rmm.mr.SystemMemoryResource()
+    else:
+        base_mr = rmm.mr.CudaMemoryResource()
+    if pool:
+        mr = rmm.mr.PoolMemoryResource(base_mr)
+    else:
+        mr = base_mr
+    rmm.mr.set_current_device_resource(mr)
+
+    assert rmm.is_initialized()
+
+    array_tester(dtype, nelem, alloc)
+
+
 @pytest.mark.parametrize("dtype", _dtypes)
 @pytest.mark.parametrize("nelem", _nelems)
 @pytest.mark.parametrize("alloc", _allocs)
@@ -410,7 +448,15 @@ def test_pool_memory_resource(dtype, nelem, alloc):
     [
         lambda: rmm.mr.CudaMemoryResource(),
         lambda: rmm.mr.ManagedMemoryResource(),
-    ],
+    ]
+    + (
+        [
+            lambda: rmm.mr.SystemMemoryResource(),
+            lambda: rmm.mr.SamHeadroomMemoryResource(headroom=1 << 20),
+        ]
+        if _SYSTEM_MEMORY_SUPPORTED
+        else []
+    ),
 )
 def test_fixed_size_memory_resource(dtype, nelem, alloc, upstream):
     mr = rmm.mr.FixedSizeMemoryResource(
@@ -432,7 +478,15 @@ def test_fixed_size_memory_resource(dtype, nelem, alloc, upstream):
         lambda: rmm.mr.PoolMemoryResource(
             rmm.mr.CudaMemoryResource(), 1 << 20
         ),
-    ],
+    ]
+    + (
+        [
+            lambda: rmm.mr.SystemMemoryResource(),
+            lambda: rmm.mr.SamHeadroomMemoryResource(headroom=1 << 20),
+        ]
+        if _SYSTEM_MEMORY_SUPPORTED
+        else []
+    ),
 )
 def test_binning_memory_resource(dtype, nelem, alloc, upstream_mr):
     upstream = upstream_mr()
diff --git a/tests/mr/device/system_mr_tests.cu b/tests/mr/device/system_mr_tests.cu
index 079afeb14..00084c4e2 100644
--- a/tests/mr/device/system_mr_tests.cu
+++ b/tests/mr/device/system_mr_tests.cu
@@ -18,7 +18,7 @@
 #include
 #include
 
-#include <rmm/mr/device/sam_headroom_resource_adaptor.hpp>
+#include <rmm/mr/device/sam_headroom_memory_resource.hpp>
 #include
 
 #include
@@ -54,9 +54,9 @@ void touch_on_gpu(void* ptr, std::size_t size)
 using system_mr = rmm::mr::system_memory_resource;
 static_assert(cuda::mr::resource_with<system_mr, cuda::mr::device_accessible>);
 static_assert(cuda::mr::async_resource_with<system_mr, cuda::mr::device_accessible>);
-using headroom_adaptor = rmm::mr::sam_headroom_resource_adaptor<system_mr>;
-static_assert(cuda::mr::resource_with<headroom_adaptor, cuda::mr::device_accessible>);
-static_assert(cuda::mr::async_resource_with<headroom_adaptor, cuda::mr::device_accessible>);
+using headroom_mr = rmm::mr::sam_headroom_memory_resource;
+static_assert(cuda::mr::resource_with<headroom_mr, cuda::mr::device_accessible>);
+static_assert(cuda::mr::async_resource_with<headroom_mr, cuda::mr::device_accessible>);
 
 class SystemMRTest : public ::testing::Test {
  protected:
@@ -79,19 +79,6 @@ TEST(SystemMRSimpleTest, ThrowIfNotSupported)
   }
 }
 
-TEST(SAMHeadroomAdaptorTest, ThrowIfNotSupported)
-{
-  auto construct_mr = []() {
-    system_mr mr;
-    headroom_adaptor adaptor{&mr, 0};
-  };
-  if (rmm::mr::detail::is_system_memory_supported(rmm::get_current_cuda_device())) {
-    EXPECT_NO_THROW(construct_mr());
-  } else {
-    EXPECT_THROW(construct_mr(), rmm::logic_error);
-  }
-}
-
 TEST_F(SystemMRTest, FirstTouchOnCPU)
 {
   auto const free = rmm::available_device_memory().first;
@@ -114,23 +101,21 @@ TEST_F(SystemMRTest, FirstTouchOnGPU)
   mr.deallocate(ptr, size_mb);
 }
 
-TEST_F(SystemMRTest, AdaptorReserveAllFreeMemory)
+TEST_F(SystemMRTest, HeadroomMRReserveAllFreeMemory)
 {
   auto const free = rmm::available_device_memory().first;
-  system_mr mr;
   // All the free GPU memory is set as headroom, so allocation is only on the CPU.
-  headroom_adaptor adaptor{&mr, free + size_gb};
-  void* ptr = adaptor.allocate(size_mb);
+  headroom_mr mr{free + size_gb};
+  void* ptr = mr.allocate(size_mb);
   touch_on_cpu(ptr, size_mb);
-  adaptor.deallocate(ptr, size_mb);
+  mr.deallocate(ptr, size_mb);
 }
 
-TEST_F(SystemMRTest, AdaptorDifferentParametersUnequal)
+TEST_F(SystemMRTest, HeadroomMRDifferentParametersUnequal)
 {
-  system_mr mr;
-  headroom_adaptor adaptor1{&mr, size_mb};
-  headroom_adaptor adaptor2{&mr, size_gb};
-  EXPECT_FALSE(adaptor1.is_equal(adaptor2));
+  headroom_mr mr1{size_mb};
+  headroom_mr mr2{size_gb};
+  EXPECT_FALSE(mr1.is_equal(mr2));
 }
 }  // namespace
 }  // namespace rmm::test
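
Usage note on PATCH 1/2: with `make_prefetch_adaptor` gone, callers construct `prefetch_resource_adaptor` directly and class template argument deduction (CTAD) supplies the `Upstream` type from the constructor argument. The following is a minimal sketch, not part of the patch; the `cuda_memory_resource` upstream and the 1024-byte allocation are arbitrary, illustrative choices.

#include <rmm/mr/device/cuda_memory_resource.hpp>
#include <rmm/mr/device/prefetch_resource_adaptor.hpp>

int main()
{
  rmm::mr::cuda_memory_resource upstream{};

  // Before this patch a caller could write:
  //   auto mr = rmm::mr::make_prefetch_adaptor(&upstream);
  // CTAD deduces Upstream from the pointer argument, so the constructor is enough:
  rmm::mr::prefetch_resource_adaptor mr{&upstream};

  void* ptr = mr.allocate(1024);
  mr.deallocate(ptr, 1024);
  return 0;
}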
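
Usage note on PATCH 2/2: on the C++ side the headroom resource is now a self-contained `sam_headroom_memory_resource` constructed from a headroom size, rather than an adaptor wrapping a separate `system_memory_resource`; the Python tests above show the equivalent `rmm.mr.SystemMemoryResource` and `rmm.mr.SamHeadroomMemoryResource` usage. A minimal sketch, assuming a machine where system allocated memory is supported; the 1 MiB headroom and 256-byte allocation are illustrative.

#include <rmm/mr/device/sam_headroom_memory_resource.hpp>

int main()
{
  // Reserve 1 MiB of GPU memory as headroom; allocations are served from system memory.
  rmm::mr::sam_headroom_memory_resource mr{1 << 20};

  void* ptr = mr.allocate(256);
  mr.deallocate(ptr, 256);
  return 0;
}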