From 777d0f05cefc615045ae50b18c7a95b3ca2edd0a Mon Sep 17 00:00:00 2001
From: Michael Schellenberger Costa <miscco@nvidia.com>
Date: Fri, 30 Sep 2022 10:44:56 +0200
Subject: [PATCH] [DO NOT MERGE] PoC for the new design of
 `cuda::mr::{async_}resource_ref`

This adds support of the current `host_memory_resource` and `device_memory_resource` for the new design of `cuda::mr::{async_}resource_ref`

It is not perfect yet and meant as a first step to gain experience with the design and start early experimentation
---
 CMakeLists.txt                                |  11 +-
 cmake/thirdparty/get_libcudacxx.cmake         |  33 ++
 include/rmm/cuda_stream_view.hpp              |   6 +
 .../rmm/mr/device/device_memory_resource.hpp  | 171 +++++++++-
 include/rmm/mr/host/host_memory_resource.hpp  |  16 +
 tests/CMakeLists.txt                          |   6 +
 tests/mr/device/mr_ref_test.hpp               | 295 ++++++++++++++++++
 tests/mr/device/mr_ref_tests.cpp              | 169 ++++++++++
 tests/mr/host/mr_ref_tests.cpp                | 262 ++++++++++++++++
 tests/mr/host/mr_tests.cpp                    |   4 +
 10 files changed, 966 insertions(+), 7 deletions(-)
 create mode 100644 cmake/thirdparty/get_libcudacxx.cmake
 create mode 100644 tests/mr/device/mr_ref_test.hpp
 create mode 100644 tests/mr/device/mr_ref_tests.cpp
 create mode 100644 tests/mr/host/mr_ref_tests.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 23cde1de6..7aeb14063 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -58,13 +58,16 @@ rapids_find_package(
 rapids_cpm_init()
 include(cmake/thirdparty/get_spdlog.cmake)
 include(cmake/thirdparty/get_thrust.cmake)
+include(cmake/thirdparty/get_libcudacxx.cmake)
 
 # library targets
 add_library(rmm INTERFACE)
 add_library(rmm::rmm ALIAS rmm)
 
-target_include_directories(rmm INTERFACE "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>"
-                                         "$<INSTALL_INTERFACE:include>")
+target_include_directories(
+  rmm
+  INTERFACE "$<BUILD_INTERFACE:${LIBCUDACXX_INCLUDE_DIR}>"
+            "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>" "$<INSTALL_INTERFACE:include>")
 
 if(CUDA_STATIC_RUNTIME)
   message(STATUS "RMM: Enabling static linking of cudart")
@@ -109,6 +112,10 @@ include(CPack)
 # install export targets
 install(TARGETS rmm EXPORT rmm-exports)
 install(DIRECTORY include/rmm/ DESTINATION include/rmm)
+install(
+  DIRECTORY ${RMM_GENERATED_INCLUDE_DIR}/include/libcxx
+            ${RMM_GENERATED_INCLUDE_DIR}/include/libcudacxx
+  DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/rmm)
 install(FILES ${RMM_BINARY_DIR}/include/rmm/version_config.hpp DESTINATION include/rmm)
 
 set(doc_string
diff --git a/cmake/thirdparty/get_libcudacxx.cmake b/cmake/thirdparty/get_libcudacxx.cmake
new file mode 100644
index 000000000..d5dd00ad0
--- /dev/null
+++ b/cmake/thirdparty/get_libcudacxx.cmake
@@ -0,0 +1,33 @@
+# =============================================================================
+# Copyright (c) 2020, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+# or implied. See the License for the specific language governing permissions and limitations under
+# the License.
+# =============================================================================
+
+# Use CPM to find or clone libcudacxx
+function(find_and_configure_libcudacxx VERSION)
+  rapids_cpm_find(
+    libcudacxx ${VERSION}
+    GIT_REPOSITORY https://github.com/miscco/libcudacxx.git
+    GIT_TAG memory_resource # ${VERSION}
+    GIT_SHALLOW TRUE DOWNLOAD_ONLY TRUE)
+
+  set(LIBCUDACXX_INCLUDE_DIR
+      "${libcudacxx_SOURCE_DIR}/include"
+      PARENT_SCOPE)
+  set(LIBCXX_INCLUDE_DIR
+      "${libcudacxx_SOURCE_DIR}/libcxx/include"
+      PARENT_SCOPE)
+endfunction()
+
+set(RMM_MIN_VERSION_libcudacxx 1.5.0)
+
+find_and_configure_libcudacxx(${RMM_MIN_VERSION_libcudacxx})
\ No newline at end of file
diff --git a/include/rmm/cuda_stream_view.hpp b/include/rmm/cuda_stream_view.hpp
index f913609f9..29cd9063f 100644
--- a/include/rmm/cuda_stream_view.hpp
+++ b/include/rmm/cuda_stream_view.hpp
@@ -23,6 +23,7 @@
 #include <atomic>
 #include <cstddef>
 #include <cstdint>
+#include <cuda/stream_ref>
 
 namespace rmm {
 
@@ -60,6 +61,11 @@ class cuda_stream_view {
    * @brief Implicit conversion to cudaStream_t.
    */
   constexpr operator cudaStream_t() const noexcept { return value(); }
+  
+  /**
+   * @brief Implicit conversion to stream_ref.
+   */
+  constexpr operator cuda::stream_ref() const noexcept { return value(); }
 
   /**
    * @brief Return true if the wrapped stream is the CUDA per-thread default stream.
diff --git a/include/rmm/mr/device/device_memory_resource.hpp b/include/rmm/mr/device/device_memory_resource.hpp
index 52aa8c79f..4ffb315f2 100644
--- a/include/rmm/mr/device/device_memory_resource.hpp
+++ b/include/rmm/mr/device/device_memory_resource.hpp
@@ -18,10 +18,16 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/detail/aligned.hpp>
 
+#include <cuda/memory_resource>
+
 #include <cstddef>
 #include <utility>
 
 namespace rmm::mr {
+/**
+ * @brief Tag type that enables the allocate_async interface
+ */
+struct supports_allocate_async {};
 
 /**
  * @brief Base class for all libcudf device memory allocation.
@@ -81,11 +87,11 @@ namespace rmm::mr {
  */
 class device_memory_resource {
  public:
-  device_memory_resource()                              = default;
-  virtual ~device_memory_resource()                     = default;
-  device_memory_resource(device_memory_resource const&) = default;
-  device_memory_resource& operator=(device_memory_resource const&) = default;
-  device_memory_resource(device_memory_resource&&) noexcept        = default;
+  device_memory_resource()                                             = default;
+  virtual ~device_memory_resource()                                    = default;
+  device_memory_resource(device_memory_resource const&)                = default;
+  device_memory_resource& operator=(device_memory_resource const&)     = default;
+  device_memory_resource(device_memory_resource&&) noexcept            = default;
   device_memory_resource& operator=(device_memory_resource&&) noexcept = default;
 
   /**
@@ -149,6 +155,158 @@ class device_memory_resource {
     return do_is_equal(other);
   }
 
+  /**
+   * @brief Allocates memory of size at least \p bytes.
+   *
+   * The returned pointer will have at minimum 256 byte alignment.
+   *
+   * If supported, this operation may optionally be executed on a stream.
+   * Otherwise, the stream is ignored and the null stream is used.
+   *
+   * @throws `rmm::bad_alloc` When the requested `bytes` cannot be allocated on
+   * the specified `stream`.
+   *
+   * @param bytes The size of the allocation
+   * @param alignment The expected alignment of the allocation
+   * @return void* Pointer to the newly allocated memory
+   */
+  void* allocate(std::size_t bytes, std::size_t alignment)
+  {
+    return do_allocate(rmm::detail::align_up(bytes, alignment), cuda_stream_view{});
+  }
+
+  /**
+   * @brief Deallocate memory pointed to by \p p.
+   *
+   * `p` must have been returned by a prior call to `allocate(bytes,stream)` on
+   * a `device_memory_resource` that compares equal to `*this`, and the storage
+   * it points to must not yet have been deallocated, otherwise behavior is
+   * undefined.
+   *
+   * If supported, this operation may optionally be executed on a stream.
+   * Otherwise, the stream is ignored and the null stream is used.
+   *
+   * @throws Nothing.
+   *
+   * @param p Pointer to be deallocated
+   * @param bytes The size in bytes of the allocation. This must be equal to the
+   * value of `bytes` that was passed to the `allocate` call that returned `p`.
+   * @param alignment The alignment that was passed to the `allocate` call that returned `p`
+   */
+  void deallocate(void* ptr, std::size_t bytes, std::size_t alignment)
+  {
+    do_deallocate(ptr, rmm::detail::align_up(bytes, alignment), cuda_stream_view{});
+  }
+
+  /**
+   * @brief Allocates memory of size at least \p bytes.
+   *
+   * The returned pointer will have at minimum 256 byte alignment.
+   *
+   * If supported, this operation may optionally be executed on a stream.
+   * Otherwise, the stream is ignored and the null stream is used.
+   *
+   * @throws `rmm::bad_alloc` When the requested `bytes` cannot be allocated on
+   * the specified `stream`.
+   *
+   * @param bytes The size of the allocation
+   * @param alignment The expected alignment of the allocation
+   * @param stream Stream on which to perform allocation
+   * @return void* Pointer to the newly allocated memory
+   */
+  template <class Res = device_memory_resource,
+            std::enable_if_t<cuda::mr::has_property<Res, supports_allocate_async>, int> = 0>
+  void* allocate_async(std::size_t bytes, std::size_t alignment, cuda_stream_view stream)
+  {
+    return do_allocate(rmm::detail::align_up(bytes, alignment), stream);
+  }
+
+  /**
+   * @brief Allocates memory of size at least \p bytes.
+   *
+   * The returned pointer will have at minimum 256 byte alignment.
+   *
+   * If supported, this operation may optionally be executed on a stream.
+   * Otherwise, the stream is ignored and the null stream is used.
+   *
+   * @throws `rmm::bad_alloc` When the requested `bytes` cannot be allocated on
+   * the specified `stream`.
+   *
+   * @param bytes The size of the allocation
+   * @param stream Stream on which to perform allocation
+   * @return void* Pointer to the newly allocated memory
+   */
+  template <class Res = device_memory_resource,
+            std::enable_if_t<cuda::mr::has_property<Res, supports_allocate_async>, int> = 0>
+  void* allocate_async(std::size_t bytes, cuda_stream_view stream)
+  {
+    return do_allocate(rmm::detail::align_up(bytes, allocation_size_alignment), stream);
+  }
+
+  /**
+   * @brief Deallocate memory pointed to by \p p.
+   *
+   * `p` must have been returned by a prior call to `allocate(bytes,stream)` on
+   * a `device_memory_resource` that compares equal to `*this`, and the storage
+   * it points to must not yet have been deallocated, otherwise behavior is
+   * undefined.
+   *
+   * If supported, this operation may optionally be executed on a stream.
+   * Otherwise, the stream is ignored and the null stream is used.
+   *
+   * @throws Nothing.
+   *
+   * @param p Pointer to be deallocated
+   * @param bytes The size in bytes of the allocation. This must be equal to the
+   * value of `bytes` that was passed to the `allocate` call that returned `p`.
+   * @param alignment The alignment that was passed to the `allocate` call that returned `p`
+   * @param stream Stream on which to perform allocation
+   */
+  template <class Res = device_memory_resource,
+            std::enable_if_t<cuda::mr::has_property<Res, supports_allocate_async>, int> = 0>
+  void deallocate_async(void* ptr,
+                        std::size_t bytes,
+                        std::size_t alignment,
+                        cuda_stream_view stream)
+  {
+    do_deallocate(ptr, rmm::detail::align_up(bytes, alignment), stream);
+  }
+
+  /**
+   * @brief Deallocate memory pointed to by \p p.
+   *
+   * `p` must have been returned by a prior call to `allocate(bytes,stream)` on
+   * a `device_memory_resource` that compares equal to `*this`, and the storage
+   * it points to must not yet have been deallocated, otherwise behavior is
+   * undefined.
+   *
+   * If supported, this operation may optionally be executed on a stream.
+   * Otherwise, the stream is ignored and the null stream is used.
+   *
+   * @throws Nothing.
+   *
+   * @param p Pointer to be deallocated
+   * @param bytes The size in bytes of the allocation. This must be equal to the
+   * value of `bytes` that was passed to the `allocate` call that returned `p`.
+   * @param stream Stream on which to perform allocation
+   */
+  template <class Res = device_memory_resource,
+            std::enable_if_t<cuda::mr::has_property<Res, supports_allocate_async>, int> = 0>
+  void deallocate_async(void* ptr, std::size_t bytes, cuda_stream_view stream)
+  {
+    do_deallocate(ptr, rmm::detail::align_up(bytes, allocation_size_alignment), stream);
+  }
+
+  [[nodiscard]] bool operator==(device_memory_resource const& other) const noexcept
+  {
+    return do_is_equal(other);
+  }
+
+  [[nodiscard]] bool operator!=(device_memory_resource const& other) const noexcept
+  {
+    return !do_is_equal(other);
+  }
+
   /**
    * @brief Query whether the resource supports use of non-null CUDA streams for
    * allocation/deallocation.
@@ -177,6 +335,8 @@ class device_memory_resource {
     return do_get_mem_info(stream);
   }
 
+  friend void get_property(device_memory_resource const&, cuda::mr::device_accessible) noexcept {}
+
  private:
   // All allocations are padded to a multiple of allocation_size_alignment bytes.
   static constexpr auto allocation_size_alignment = std::size_t{8};
@@ -238,4 +398,5 @@ class device_memory_resource {
   [[nodiscard]] virtual std::pair<std::size_t, std::size_t> do_get_mem_info(
     cuda_stream_view stream) const = 0;
 };
+static_assert(cuda::mr::resource_with<device_memory_resource, cuda::mr::device_accessible>, "");
 }  // namespace rmm::mr
diff --git a/include/rmm/mr/host/host_memory_resource.hpp b/include/rmm/mr/host/host_memory_resource.hpp
index 4edffc860..40da70ff8 100644
--- a/include/rmm/mr/host/host_memory_resource.hpp
+++ b/include/rmm/mr/host/host_memory_resource.hpp
@@ -15,6 +15,8 @@
  */
 #pragma once
 
+#include <cuda/memory_resource>
+
 #include <cstddef>
 #include <utility>
 
@@ -107,6 +109,18 @@ class host_memory_resource {
   {
     return do_is_equal(other);
   }
+  
+  [[nodiscard]] bool operator==(host_memory_resource const& other) const noexcept
+  {
+    return do_is_equal(other);
+  }
+
+  [[nodiscard]] bool operator!=(host_memory_resource const& other) const noexcept
+  {
+    return !do_is_equal(other);
+  }
+
+  friend void get_property(host_memory_resource const&, cuda::mr::host_accessible) noexcept {}
 
  private:
   /**
@@ -160,4 +174,6 @@ class host_memory_resource {
     return this == &other;
   }
 };
+static_assert(cuda::mr::resource_with<host_memory_resource, cuda::mr::host_accessible>, "");
+
 }  // namespace rmm::mr
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 8f68141fd..971327a4c 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -94,6 +94,9 @@ endfunction()
 # device mr tests
 ConfigureTest(DEVICE_MR_TEST mr/device/mr_tests.cpp mr/device/mr_multithreaded_tests.cpp)
 
+# device mr tests
+ConfigureTest(DEVICE_MR_REF_TEST mr/device/mr_ref_tests.cpp)
+
 # general adaptor tests
 ConfigureTest(ADAPTOR_TEST mr/device/adaptor_tests.cpp)
 
@@ -130,6 +133,9 @@ ConfigureTest(LIMITING_TEST mr/device/limiting_mr_tests.cpp)
 # host mr tests
 ConfigureTest(HOST_MR_TEST mr/host/mr_tests.cpp)
 
+# host mr_ref tests
+ConfigureTest(HOST_MR_REF_TEST mr/host/mr_ref_tests.cpp)
+
 # cuda stream tests
 ConfigureTest(CUDA_STREAM_TEST cuda_stream_tests.cpp cuda_stream_pool_tests.cpp)
 
diff --git a/tests/mr/device/mr_ref_test.hpp b/tests/mr/device/mr_ref_test.hpp
new file mode 100644
index 000000000..7071583b7
--- /dev/null
+++ b/tests/mr/device/mr_ref_test.hpp
@@ -0,0 +1,295 @@
+/*
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "../../byte_literals.hpp"
+
+#include <rmm/cuda_stream.hpp>
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/detail/aligned.hpp>
+#include <rmm/mr/device/arena_memory_resource.hpp>
+#include <rmm/mr/device/binning_memory_resource.hpp>
+#include <rmm/mr/device/cuda_async_memory_resource.hpp>
+#include <rmm/mr/device/cuda_memory_resource.hpp>
+#include <rmm/mr/device/device_memory_resource.hpp>
+#include <rmm/mr/device/fixed_size_memory_resource.hpp>
+#include <rmm/mr/device/managed_memory_resource.hpp>
+#include <rmm/mr/device/owning_wrapper.hpp>
+#include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/mr/device/pool_memory_resource.hpp>
+
+#include <gtest/gtest.h>
+
+#include <cuda_runtime_api.h>
+
+#include <cuda/memory_resource>
+
+#include <cstddef>
+#include <cstdint>
+#include <functional>
+#include <random>
+#include <utility>
+
+namespace rmm::test {
+
+/**
+ * @brief Returns if a pointer points to a device memory or managed memory
+ * allocation.
+ */
+inline bool is_device_memory(void* ptr)
+{
+  cudaPointerAttributes attributes{};
+  if (cudaSuccess != cudaPointerGetAttributes(&attributes, ptr)) { return false; }
+#if CUDART_VERSION < 10000  // memoryType is deprecated in CUDA 10
+  return attributes.memoryType == cudaMemoryTypeDevice;
+#else
+  return (attributes.type == cudaMemoryTypeDevice) or (attributes.type == cudaMemoryTypeManaged);
+#endif
+}
+
+enum size_in_bytes : size_t {};
+
+constexpr auto default_num_allocations{100};
+constexpr size_in_bytes default_max_size{5_MiB};
+
+struct allocation {
+  void* ptr{nullptr};
+  std::size_t size{0};
+  allocation(void* ptr, std::size_t size) : ptr{ptr}, size{size} {}
+  allocation() = default;
+};
+
+// Various test functions, shared between single-threaded and multithreaded tests.
+
+inline void test_get_current_device_resource()
+{
+  EXPECT_NE(nullptr, rmm::mr::get_current_device_resource());
+  void* ptr = rmm::mr::get_current_device_resource()->allocate(1_MiB);
+  EXPECT_NE(nullptr, ptr);
+  EXPECT_TRUE(rmm::detail::is_pointer_aligned(ptr));
+  EXPECT_TRUE(is_device_memory(ptr));
+  rmm::mr::get_current_device_resource()->deallocate(ptr, 1_MiB);
+}
+
+inline void test_allocate(rmm::mr::device_memory_resource* mr,
+                          std::size_t bytes,
+                          cuda_stream_view stream = {})
+{
+  void* ptr = mr->allocate(bytes);
+  if (not stream.is_default()) { stream.synchronize(); }
+  EXPECT_NE(nullptr, ptr);
+  EXPECT_TRUE(rmm::detail::is_pointer_aligned(ptr));
+  EXPECT_TRUE(is_device_memory(ptr));
+  mr->deallocate(ptr, bytes);
+  if (not stream.is_default()) { stream.synchronize(); }
+}
+
+// Simple reproducer for https://github.com/rapidsai/rmm/issues/861
+inline void concurrent_allocations_are_different(rmm::mr::device_memory_resource* mr,
+                                                 cuda_stream_view stream)
+{
+  const auto size{8_B};
+  void* ptr1 = mr->allocate(size, stream);
+  void* ptr2 = mr->allocate(size, stream);
+
+  EXPECT_NE(ptr1, ptr2);
+
+  mr->deallocate(ptr1, size, stream);
+  mr->deallocate(ptr2, size, stream);
+}
+
+inline void test_various_allocations(rmm::mr::device_memory_resource* mr, cuda_stream_view stream)
+{
+  // test allocating zero bytes on non-default stream
+  {
+    void* ptr = mr->allocate(0, stream);
+    stream.synchronize();
+    EXPECT_NO_THROW(mr->deallocate(ptr, 0, stream));
+    stream.synchronize();
+  }
+
+  test_allocate(mr, 4_B, stream);
+  test_allocate(mr, 1_KiB, stream);
+  test_allocate(mr, 1_MiB, stream);
+  test_allocate(mr, 1_GiB, stream);
+
+  // should fail to allocate too much
+  {
+    void* ptr{nullptr};
+    EXPECT_THROW(ptr = mr->allocate(1_PiB, stream), rmm::out_of_memory);
+    EXPECT_EQ(nullptr, ptr);
+
+    // test e.what();
+    try {
+      ptr = mr->allocate(1_PiB, stream);
+    } catch (rmm::out_of_memory const& e) {
+      EXPECT_NE(std::string{e.what()}.find("out_of_memory"), std::string::npos);
+    }
+  }
+}
+
+inline void test_random_allocations(rmm::mr::device_memory_resource* mr,
+                                    std::size_t num_allocations = default_num_allocations,
+                                    size_in_bytes max_size      = default_max_size,
+                                    cuda_stream_view stream     = {})
+{
+  std::vector<allocation> allocations(num_allocations);
+
+  std::default_random_engine generator;
+  std::uniform_int_distribution<std::size_t> distribution(1, max_size);
+
+  // num_allocations allocations from [0,max_size)
+  std::for_each(allocations.begin(),
+                allocations.end(),
+                [&generator, &distribution, stream, mr](allocation& alloc) {
+                  alloc.size = distribution(generator);
+                  EXPECT_NO_THROW(alloc.ptr = mr->allocate(alloc.size, stream));
+                  if (not stream.is_default()) { stream.synchronize(); }
+                  EXPECT_NE(nullptr, alloc.ptr);
+                  EXPECT_TRUE(rmm::detail::is_pointer_aligned(alloc.ptr));
+                });
+
+  std::for_each(allocations.begin(), allocations.end(), [stream, mr](allocation& alloc) {
+    EXPECT_NO_THROW(mr->deallocate(alloc.ptr, alloc.size, stream));
+    if (not stream.is_default()) { stream.synchronize(); }
+  });
+}
+
+inline void test_mixed_random_allocation_free(rmm::mr::device_memory_resource* mr,
+                                              size_in_bytes max_size  = default_max_size,
+                                              cuda_stream_view stream = {})
+{
+  std::default_random_engine generator;
+  constexpr std::size_t num_allocations{100};
+
+  std::uniform_int_distribution<std::size_t> size_distribution(1, max_size);
+
+  constexpr int allocation_probability{53};  // percent
+  constexpr int max_probability{99};
+  std::uniform_int_distribution<int> op_distribution(0, max_probability);
+  std::uniform_int_distribution<int> index_distribution(0, num_allocations - 1);
+
+  std::size_t active_allocations{0};
+  std::size_t allocation_count{0};
+
+  std::vector<allocation> allocations;
+
+  for (std::size_t i = 0; i < num_allocations * 2; ++i) {
+    bool do_alloc = true;
+    if (active_allocations > 0) {
+      int chance = op_distribution(generator);
+      do_alloc   = (chance < allocation_probability) && (allocation_count < num_allocations);
+    }
+
+    if (do_alloc) {
+      std::size_t size = size_distribution(generator);
+      active_allocations++;
+      allocation_count++;
+      EXPECT_NO_THROW(allocations.emplace_back(mr->allocate(size, stream), size));
+      auto new_allocation = allocations.back();
+      EXPECT_NE(nullptr, new_allocation.ptr);
+      EXPECT_TRUE(rmm::detail::is_pointer_aligned(new_allocation.ptr));
+    } else {
+      auto const index = static_cast<int>(index_distribution(generator) % active_allocations);
+      active_allocations--;
+      allocation to_free = allocations[index];
+      allocations.erase(std::next(allocations.begin(), index));
+      EXPECT_NO_THROW(mr->deallocate(to_free.ptr, to_free.size, stream));
+    }
+  }
+
+  EXPECT_EQ(active_allocations, 0);
+  EXPECT_EQ(allocations.size(), active_allocations);
+}
+
+using MRFactoryFunc = std::function<std::shared_ptr<rmm::mr::device_memory_resource>()>;
+
+/// Encapsulates a `device_memory_resource` factory function and associated name
+struct mr_factory {
+  mr_factory(std::string name, MRFactoryFunc factory)
+    : name{std::move(name)}, factory{std::move(factory)}
+  {
+  }
+
+  std::string name;       ///< Name to associate with tests that use this factory
+  MRFactoryFunc factory;  ///< Factory function that returns shared_ptr to `device_memory_resource`
+                          ///< instance to use in test
+};
+
+/// Test fixture class value-parameterized on different `mr_factory`s
+struct mr_test : public ::testing::TestWithParam<mr_factory> {
+  void SetUp() override
+  {
+    auto factory = GetParam().factory;
+    mr           = factory();
+    if (mr == nullptr) {
+      GTEST_SKIP() << "Skipping tests since the memory resource is not supported with this CUDA "
+                   << "driver/runtime version";
+    }
+    ref = cuda::mr::resource_ref<cuda::mr::device_accessible>{*mr};
+  }
+
+  std::shared_ptr<rmm::mr::device_memory_resource> mr;  ///< Pointer to resource to use in tests
+  cuda::mr::resource_ref<cuda::mr::device_accessible> ref{*mr};
+  rmm::cuda_stream stream{};
+};
+
+struct mr_allocation_test : public mr_test {
+};
+
+/// MR factory functions
+inline auto make_cuda() { return std::make_shared<rmm::mr::cuda_memory_resource>(); }
+
+inline auto make_cuda_async()
+{
+  if (rmm::detail::async_alloc::is_supported()) {
+    return std::make_shared<rmm::mr::cuda_async_memory_resource>();
+  }
+  return std::shared_ptr<rmm::mr::cuda_async_memory_resource>{nullptr};
+}
+
+inline auto make_managed() { return std::make_shared<rmm::mr::managed_memory_resource>(); }
+
+inline auto make_pool()
+{
+  return rmm::mr::make_owning_wrapper<rmm::mr::pool_memory_resource>(make_cuda());
+}
+
+inline auto make_arena()
+{
+  return rmm::mr::make_owning_wrapper<rmm::mr::arena_memory_resource>(make_cuda());
+}
+
+inline auto make_fixed_size()
+{
+  return rmm::mr::make_owning_wrapper<rmm::mr::fixed_size_memory_resource>(make_cuda());
+}
+
+inline auto make_binning()
+{
+  auto pool = make_pool();
+  // Add a binning_memory_resource with fixed-size bins of sizes 256, 512, 1024, 2048 and 4096KiB
+  // Larger allocations will use the pool resource
+  auto const bin_range_start{18};
+  auto const bin_range_end{22};
+
+  auto mr = rmm::mr::make_owning_wrapper<rmm::mr::binning_memory_resource>(
+    pool, bin_range_start, bin_range_end);
+  return mr;
+}
+
+}  // namespace rmm::test
diff --git a/tests/mr/device/mr_ref_tests.cpp b/tests/mr/device/mr_ref_tests.cpp
new file mode 100644
index 000000000..fa14eae28
--- /dev/null
+++ b/tests/mr/device/mr_ref_tests.cpp
@@ -0,0 +1,169 @@
+/*
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "mr_ref_test.hpp"
+
+#include <rmm/mr/device/per_device_resource.hpp>
+
+#include <cuda/memory_resource>
+
+#include <gtest/gtest.h>
+
+using resource_ref = cuda::mr::resource_ref<cuda::mr::device_accessible>;
+
+namespace rmm::test {
+namespace {
+
+INSTANTIATE_TEST_SUITE_P(ResourceTests,
+                         mr_test,
+                         ::testing::Values(mr_factory{"CUDA", &make_cuda},
+#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT
+                                           mr_factory{"CUDA_Async", &make_cuda_async},
+#endif
+                                           mr_factory{"Managed", &make_managed},
+                                           mr_factory{"Pool", &make_pool},
+                                           mr_factory{"Arena", &make_arena},
+                                           mr_factory{"Binning", &make_binning},
+                                           mr_factory{"Fixed_Size", &make_fixed_size}),
+                         [](auto const& info) { return info.param.name; });
+
+// Leave out fixed-size MR here because it can't handle the dynamic allocation sizes
+INSTANTIATE_TEST_SUITE_P(ResourceAllocationTests,
+                         mr_allocation_test,
+                         ::testing::Values(mr_factory{"CUDA", &make_cuda},
+#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT
+                                           mr_factory{"CUDA_Async", &make_cuda_async},
+#endif
+                                           mr_factory{"Managed", &make_managed},
+                                           mr_factory{"Pool", &make_pool},
+                                           mr_factory{"Arena", &make_arena},
+                                           mr_factory{"Binning", &make_binning}),
+                         [](auto const& info) { return info.param.name; });
+
+TEST(DefaultTest, CurrentDeviceResourceIsCUDA)
+{
+  EXPECT_NE(nullptr, rmm::mr::get_current_device_resource());
+  EXPECT_TRUE(rmm::mr::get_current_device_resource()->is_equal(rmm::mr::cuda_memory_resource{}));
+}
+
+TEST(DefaultTest, UseCurrentDeviceResource) { test_get_current_device_resource(); }
+
+TEST(DefaultTest, GetCurrentDeviceResource)
+{
+  resource_ref mr{*rmm::mr::get_current_device_resource()};
+  rmm::mr::cuda_memory_resource default_resource{};
+  resource_ref default_ref{default_resource}; 
+  EXPECT_TRUE(mr == default_ref);
+}
+
+TEST_P(mr_test, SetCurrentDeviceResource)
+{
+  rmm::mr::cuda_memory_resource default_resource{};
+  resource_ref default_ref{default_resource}; 
+
+  resource_ref old_ref{*rmm::mr::set_current_device_resource(this->mr.get())};
+
+  // old mr should equal a cuda mr
+  EXPECT_TRUE(old_ref == default_ref);
+
+  // current dev resource should equal this resource
+  EXPECT_TRUE(ref == this->ref);
+
+  test_get_current_device_resource();
+
+  // setting to `nullptr` should reset to initial cuda resource
+  rmm::mr::set_current_device_resource(nullptr);
+  resource_ref new_ref{*rmm::mr::get_current_device_resource()};
+  EXPECT_TRUE(new_ref == default_ref);
+}
+
+TEST_P(mr_test, SelfEquality) { EXPECT_TRUE(this->mr->is_equal(*this->mr)); }
+
+TEST_P(mr_test, SupportsStreams)
+{
+  if (this->mr->is_equal(rmm::mr::cuda_memory_resource{}) ||
+      this->mr->is_equal(rmm::mr::managed_memory_resource{})) {
+    EXPECT_FALSE(this->mr->supports_streams());
+  } else {
+    EXPECT_TRUE(this->mr->supports_streams());
+  }
+}
+
+TEST_P(mr_test, GetMemInfo)
+{
+  if (this->mr->supports_get_mem_info()) {
+    const auto allocation_size{16 * 256};
+    {
+      auto const [free, total] = this->mr->get_mem_info(rmm::cuda_stream_view{});
+      EXPECT_TRUE(free >= allocation_size);
+    }
+
+    void* ptr{nullptr};
+    ptr = this->mr->allocate(allocation_size);
+
+    {
+      auto const [free, total] = this->mr->get_mem_info(rmm::cuda_stream_view{});
+      EXPECT_TRUE(free >= allocation_size);
+    }
+
+    this->mr->deallocate(ptr, allocation_size);
+  } else {
+    auto const [free, total] = this->mr->get_mem_info(rmm::cuda_stream_view{});
+    EXPECT_EQ(free, 0);
+    EXPECT_EQ(total, 0);
+  }
+}
+
+// Simple reproducer for https://github.com/rapidsai/rmm/issues/861
+TEST_P(mr_test, AllocationsAreDifferentDefaultStream)
+{
+  concurrent_allocations_are_different(this->mr.get(), cuda_stream_view{});
+}
+
+TEST_P(mr_test, AllocationsAreDifferent)
+{
+  concurrent_allocations_are_different(this->mr.get(), this->stream);
+}
+
+TEST_P(mr_allocation_test, AllocateDefaultStream)
+{
+  test_various_allocations(this->mr.get(), cuda_stream_view{});
+}
+
+TEST_P(mr_allocation_test, AllocateOnStream)
+{
+  test_various_allocations(this->mr.get(), this->stream);
+}
+
+TEST_P(mr_allocation_test, RandomAllocations) { test_random_allocations(this->mr.get()); }
+
+TEST_P(mr_allocation_test, RandomAllocationsStream)
+{
+  test_random_allocations(this->mr.get(), default_num_allocations, default_max_size, this->stream);
+}
+
+TEST_P(mr_allocation_test, MixedRandomAllocationFree)
+{
+  test_mixed_random_allocation_free(this->mr.get(), default_max_size, cuda_stream_view{});
+}
+
+TEST_P(mr_allocation_test, MixedRandomAllocationFreeStream)
+{
+  test_mixed_random_allocation_free(this->mr.get(), default_max_size, this->stream);
+}
+
+}  // namespace
+}  // namespace rmm::test
diff --git a/tests/mr/host/mr_ref_tests.cpp b/tests/mr/host/mr_ref_tests.cpp
new file mode 100644
index 000000000..eb84fd697
--- /dev/null
+++ b/tests/mr/host/mr_ref_tests.cpp
@@ -0,0 +1,262 @@
+/*
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../../byte_literals.hpp"
+
+#include <rmm/detail/aligned.hpp>
+#include <rmm/mr/host/host_memory_resource.hpp>
+#include <rmm/mr/host/new_delete_resource.hpp>
+#include <rmm/mr/host/pinned_memory_resource.hpp>
+
+#include <cuda_runtime_api.h>
+
+#include <cuda/memory_resource>
+
+#include <gtest/gtest.h>
+
+#include <cstddef>
+#include <deque>
+#include <random>
+
+namespace rmm::test {
+namespace {
+inline bool is_aligned(void* ptr, std::size_t alignment = alignof(std::max_align_t))
+{
+  return rmm::detail::is_pointer_aligned(ptr, alignment);
+}
+
+// Returns true if a pointer points to a device memory or managed memory allocation.
+inline bool is_device_memory(void* ptr)
+{
+  cudaPointerAttributes attributes{};
+  if (cudaSuccess != cudaPointerGetAttributes(&attributes, ptr)) { return false; }
+#if CUDART_VERSION < 10000  // memoryType is deprecated in CUDA 10
+  return attributes.memoryType == cudaMemoryTypeDevice;
+#else
+  return (attributes.type == cudaMemoryTypeDevice) or (attributes.type == cudaMemoryTypeManaged);
+#endif
+}
+
+/**
+ * @brief Returns if a pointer `p` points to pinned host memory.
+ */
+inline bool is_pinned_memory(void* ptr)
+{
+  cudaPointerAttributes attributes{};
+  if (cudaSuccess != cudaPointerGetAttributes(&attributes, ptr)) { return false; }
+  return attributes.type == cudaMemoryTypeHost;
+}
+
+constexpr std::size_t size_word{4_B};
+constexpr std::size_t size_kb{1_KiB};
+constexpr std::size_t size_mb{1_MiB};
+constexpr std::size_t size_gb{1_GiB};
+constexpr std::size_t size_pb{1_PiB};
+
+struct allocation {
+  void* ptr{nullptr};
+  std::size_t size{0};
+  allocation(void* ptr, std::size_t size) : ptr{ptr}, size{size} {}
+  allocation() = default;
+};
+}  // namespace
+
+template <typename MemoryResourceType>
+struct MRTest : public ::testing::Test {
+  MemoryResourceType mr;
+  cuda::mr::resource_ref<cuda::mr::host_accessible> ref;
+
+  MRTest() : mr{}, ref{mr} {}
+};
+
+using resources = ::testing::Types<rmm::mr::new_delete_resource, rmm::mr::pinned_memory_resource>;
+static_assert(cuda::mr::resource_with<rmm::mr::new_delete_resource, cuda::mr::host_accessible>, "");
+static_assert(cuda::mr::resource_with<rmm::mr::pinned_memory_resource, cuda::mr::host_accessible>, "");
+
+TYPED_TEST_CASE(MRTest, resources);
+
+TYPED_TEST(MRTest, SelfEquality) { EXPECT_TRUE(this->ref == this->ref); }
+
+TYPED_TEST(MRTest, AllocateZeroBytes)
+{
+  void* ptr{nullptr};
+  EXPECT_NO_THROW(ptr = this->ref.allocate(0));
+  EXPECT_NO_THROW(this->ref.deallocate(ptr, 0));
+}
+
+TYPED_TEST(MRTest, AllocateWord)
+{
+  void* ptr{nullptr};
+  EXPECT_NO_THROW(ptr = this->ref.allocate(size_word));
+  EXPECT_NE(nullptr, ptr);
+  EXPECT_TRUE(is_aligned(ptr));
+  EXPECT_FALSE(is_device_memory(ptr));
+  EXPECT_NO_THROW(this->ref.deallocate(ptr, size_word));
+}
+
+TYPED_TEST(MRTest, AllocateKB)
+{
+  void* ptr{nullptr};
+  EXPECT_NO_THROW(ptr = this->ref.allocate(size_kb));
+  EXPECT_NE(nullptr, ptr);
+  EXPECT_TRUE(is_aligned(ptr));
+  EXPECT_FALSE(is_device_memory(ptr));
+  EXPECT_NO_THROW(this->ref.deallocate(ptr, size_kb));
+}
+
+TYPED_TEST(MRTest, AllocateMB)
+{
+  void* ptr{nullptr};
+  EXPECT_NO_THROW(ptr = this->ref.allocate(size_mb));
+  EXPECT_NE(nullptr, ptr);
+  EXPECT_TRUE(is_aligned(ptr));
+  EXPECT_FALSE(is_device_memory(ptr));
+  EXPECT_NO_THROW(this->ref.deallocate(ptr, size_mb));
+}
+
+TYPED_TEST(MRTest, AllocateGB)
+{
+  void* ptr{nullptr};
+  EXPECT_NO_THROW(ptr = this->ref.allocate(size_gb));
+  EXPECT_NE(nullptr, ptr);
+  EXPECT_TRUE(is_aligned(ptr));
+  EXPECT_FALSE(is_device_memory(ptr));
+  EXPECT_NO_THROW(this->ref.deallocate(ptr, size_gb));
+}
+
+TYPED_TEST(MRTest, AllocateTooMuch)
+{
+  void* ptr{nullptr};
+  EXPECT_THROW(ptr = this->ref.allocate(size_pb), std::bad_alloc);
+  EXPECT_EQ(nullptr, ptr);
+}
+
+TYPED_TEST(MRTest, RandomAllocations)
+{
+  constexpr std::size_t num_allocations{100};
+  std::vector<allocation> allocations(num_allocations);
+
+  constexpr std::size_t MAX_ALLOCATION_SIZE{5 * size_mb};
+
+  std::default_random_engine generator;
+  std::uniform_int_distribution<std::size_t> distribution(1, MAX_ALLOCATION_SIZE);
+
+  // 100 allocations from [0,5MB)
+  std::for_each(
+    allocations.begin(), allocations.end(), [&generator, &distribution, this](allocation& alloc) {
+      alloc.size = distribution(generator);
+      EXPECT_NO_THROW(alloc.ptr = this->ref.allocate(alloc.size));
+      EXPECT_NE(nullptr, alloc.ptr);
+      EXPECT_TRUE(is_aligned(alloc.ptr));
+    });
+
+  std::for_each(allocations.begin(), allocations.end(), [this](allocation& alloc) {
+    EXPECT_NO_THROW(this->ref.deallocate(alloc.ptr, alloc.size));
+  });
+}
+
+TYPED_TEST(MRTest, MixedRandomAllocationFree)
+{
+  std::default_random_engine generator;
+
+  constexpr std::size_t MAX_ALLOCATION_SIZE{10 * size_mb};
+  std::uniform_int_distribution<std::size_t> size_distribution(1, MAX_ALLOCATION_SIZE);
+
+  // How often a free will occur. For example, if `1`, then every allocation
+  // will immediately be free'd. Or, if 4, on average, a free will occur after
+  // every 4th allocation
+  constexpr std::size_t FREE_FREQUENCY{4};
+  std::uniform_int_distribution<int> free_distribution(1, FREE_FREQUENCY);
+
+  std::deque<allocation> allocations;
+
+  constexpr std::size_t num_allocations{100};
+  for (std::size_t i = 0; i < num_allocations; ++i) {
+    std::size_t allocation_size = size_distribution(generator);
+    EXPECT_NO_THROW(allocations.emplace_back(this->ref.allocate(allocation_size), allocation_size));
+    auto new_allocation = allocations.back();
+    EXPECT_NE(nullptr, new_allocation.ptr);
+    EXPECT_TRUE(is_aligned(new_allocation.ptr));
+
+    bool const free_front{free_distribution(generator) == free_distribution.max()};
+
+    if (free_front) {
+      auto front = allocations.front();
+      EXPECT_NO_THROW(this->ref.deallocate(front.ptr, front.size));
+      allocations.pop_front();
+    }
+  }
+  // free any remaining allocations
+  for (auto alloc : allocations) {
+    EXPECT_NO_THROW(this->ref.deallocate(alloc.ptr, alloc.size));
+    allocations.pop_front();
+  }
+}
+
+static constexpr std::size_t MinTestedAlignment{16};
+static constexpr std::size_t MaxTestedAlignment{4096};
+static constexpr std::size_t TestedAlignmentMultiplier{2};
+static constexpr std::size_t NUM_TRIALS{100};
+
+TYPED_TEST(MRTest, AlignmentTest)
+{
+  std::default_random_engine generator(0);
+  constexpr std::size_t MAX_ALLOCATION_SIZE{10 * size_mb};
+  std::uniform_int_distribution<std::size_t> size_distribution(1, MAX_ALLOCATION_SIZE);
+
+  for (std::size_t num_trials = 0; num_trials < NUM_TRIALS; ++num_trials) {
+    for (std::size_t alignment = MinTestedAlignment; alignment <= MaxTestedAlignment;
+         alignment *= TestedAlignmentMultiplier) {
+      auto allocation_size = size_distribution(generator);
+      void* ptr{nullptr};
+      EXPECT_NO_THROW(ptr = this->ref.allocate(allocation_size, alignment));
+      EXPECT_TRUE(is_aligned(ptr, alignment));
+      EXPECT_NO_THROW(this->ref.deallocate(ptr, allocation_size, alignment));
+    }
+  }
+}
+
+TYPED_TEST(MRTest, UnsupportedAlignmentTest)
+{
+  std::default_random_engine generator(0);
+  constexpr std::size_t MAX_ALLOCATION_SIZE{10 * size_mb};
+  std::uniform_int_distribution<std::size_t> size_distribution(1, MAX_ALLOCATION_SIZE);
+
+  for (std::size_t num_trials = 0; num_trials < NUM_TRIALS; ++num_trials) {
+    for (std::size_t alignment = MinTestedAlignment; alignment <= MaxTestedAlignment;
+         alignment *= TestedAlignmentMultiplier) {
+      auto allocation_size = size_distribution(generator);
+      void* ptr{nullptr};
+      // An unsupported alignment (like an odd number) should result in an
+      // alignment of `alignof(std::max_align_t)`
+      auto const bad_alignment = alignment + 1;
+      EXPECT_NO_THROW(ptr = this->ref.allocate(allocation_size, bad_alignment));
+      EXPECT_TRUE(is_aligned(ptr, alignof(std::max_align_t)));
+      EXPECT_NO_THROW(this->ref.deallocate(ptr, allocation_size, bad_alignment));
+    }
+  }
+}
+
+TEST(PinnedResource, isPinned)
+{
+  rmm::mr::pinned_memory_resource mr;
+  cuda::mr::resource_ref<cuda::mr::host_accessible> ref{mr};
+  void* ptr{nullptr};
+  EXPECT_NO_THROW(ptr = ref.allocate(100));
+  EXPECT_TRUE(is_pinned_memory(ptr));
+  EXPECT_NO_THROW(ref.deallocate(ptr, 100));
+}
+}  // namespace rmm::test
diff --git a/tests/mr/host/mr_tests.cpp b/tests/mr/host/mr_tests.cpp
index 1cd59f5a6..cc485f95e 100644
--- a/tests/mr/host/mr_tests.cpp
+++ b/tests/mr/host/mr_tests.cpp
@@ -23,6 +23,8 @@
 
 #include <cuda_runtime_api.h>
 
+#include <cuda/memory_resource>
+
 #include <gtest/gtest.h>
 
 #include <cstddef>
@@ -80,6 +82,8 @@ struct MRTest : public ::testing::Test {
 };
 
 using resources = ::testing::Types<rmm::mr::new_delete_resource, rmm::mr::pinned_memory_resource>;
+static_assert(cuda::mr::resource_with<rmm::mr::new_delete_resource, cuda::mr::host_accessible>, "");
+static_assert(cuda::mr::resource_with<rmm::mr::pinned_memory_resource, cuda::mr::host_accessible>, "");
 
 TYPED_TEST_CASE(MRTest, resources);