From 39edcd0e91f0d97a986803fb10815c089274f557 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Thu, 31 Aug 2023 05:36:59 +0000 Subject: [PATCH 01/11] Disable clang-tidy check for do while in macros --- .clang-tidy | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.clang-tidy b/.clang-tidy index 04689c330..9b3f844c9 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -64,4 +64,6 @@ CheckOptions: value: '1' - key: readability-magic-numbers.IgnorePowersOf2IntegerValues value: '1' + - key: cppcoreguidelines-avoid-do-while.IgnoreMacros + value: 'true' ... From 421007be66af3dd714d00d68a6c2eb0fa99ea43a Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Thu, 31 Aug 2023 06:33:01 +0000 Subject: [PATCH 02/11] Add new CUDA device utilities --- include/rmm/cuda_device.hpp | 49 +++++++++++++++++-- include/rmm/detail/dynamic_load_runtime.hpp | 7 ++- .../mr/device/cuda_async_memory_resource.hpp | 2 +- .../cuda_async_view_memory_resource.hpp | 2 +- include/rmm/mr/device/per_device_resource.hpp | 4 +- tests/mr/device/cuda_async_view_mr_tests.cpp | 4 +- 6 files changed, 53 insertions(+), 15 deletions(-) diff --git a/include/rmm/cuda_device.hpp b/include/rmm/cuda_device.hpp index ab225490e..e57d779d9 100644 --- a/include/rmm/cuda_device.hpp +++ b/include/rmm/cuda_device.hpp @@ -42,7 +42,6 @@ struct cuda_device_id { value_type id_; }; -namespace detail { /** * @brief Returns a `cuda_device_id` for the current device * @@ -50,11 +49,51 @@ namespace detail { * * @return `cuda_device_id` for the current device */ -inline cuda_device_id current_device() +inline cuda_device_id get_current_cuda_device() { - int dev_id{}; - RMM_CUDA_TRY(cudaGetDevice(&dev_id)); + cuda_device_id::value_type dev_id{}; + RMM_ASSERT_CUDA_SUCCESS(cudaGetDevice(&dev_id)); return cuda_device_id{dev_id}; } -} // namespace detail + +/** + * @brief Returns the number of CUDA devices in the system + * + * @return Number of CUDA devices in the system + */ +inline int get_num_cuda_devices() +{ + cuda_device_id::value_type num_dev{}; + RMM_ASSERT_CUDA_SUCCESS(cudaGetDeviceCount(&num_dev)); + return num_dev; +} + +/** + * @brief RAII class that sets the current CUDA device to the specified device on construction + * and restores the previous device on destruction. 
+ */ +struct cuda_set_device_raii { + /** + * @brief Construct a new cuda_set_device_raii object and sets the current CUDA device to `dev_id` + * + * @param dev_id The device to set as the current CUDA device + */ + explicit cuda_set_device_raii(cuda_device_id dev_id) : old_device_{get_current_cuda_device()} + { + RMM_ASSERT_CUDA_SUCCESS(cudaSetDevice(dev_id.value())); + } + /** + * @brief Reactivates the previous CUDA device + */ + ~cuda_set_device_raii() noexcept { RMM_ASSERT_CUDA_SUCCESS(cudaSetDevice(old_device_.value())); } + + cuda_set_device_raii(cuda_set_device_raii const&) = delete; + cuda_set_device_raii& operator=(cuda_set_device_raii const&) = delete; + cuda_set_device_raii(cuda_set_device_raii&&) = delete; + cuda_set_device_raii& operator=(cuda_set_device_raii&&) = delete; + + private: + cuda_device_id old_device_; +}; + } // namespace rmm diff --git a/include/rmm/detail/dynamic_load_runtime.hpp b/include/rmm/detail/dynamic_load_runtime.hpp index 28121e6a8..dc0a912e7 100644 --- a/include/rmm/detail/dynamic_load_runtime.hpp +++ b/include/rmm/detail/dynamic_load_runtime.hpp @@ -113,9 +113,8 @@ struct async_alloc { static auto driver_supports_pool{[] { int cuda_pool_supported{}; - auto result = cudaDeviceGetAttribute(&cuda_pool_supported, - cudaDevAttrMemoryPoolsSupported, - rmm::detail::current_device().value()); + auto result = cudaDeviceGetAttribute( + &cuda_pool_supported, cudaDevAttrMemoryPoolsSupported, rmm::get_current_cuda_device().value()); return result == cudaSuccess and cuda_pool_supported == 1; }()}; return runtime_supports_pool and driver_supports_pool; @@ -139,7 +138,7 @@ struct async_alloc { if (cudaMemHandleTypeNone != handle_type) { auto const result = cudaDeviceGetAttribute(&supported_handle_types_bitmask, cudaDevAttrMemoryPoolSupportedHandleTypes, - rmm::detail::current_device().value()); + rmm::get_current_cuda_device().value()); // Don't throw on cudaErrorInvalidValue auto const unsupported_runtime = (result == cudaErrorInvalidValue); diff --git a/include/rmm/mr/device/cuda_async_memory_resource.hpp b/include/rmm/mr/device/cuda_async_memory_resource.hpp index d41eae63e..329d8f29a 100644 --- a/include/rmm/mr/device/cuda_async_memory_resource.hpp +++ b/include/rmm/mr/device/cuda_async_memory_resource.hpp @@ -98,7 +98,7 @@ class cuda_async_memory_resource final : public device_memory_resource { RMM_EXPECTS(rmm::detail::async_alloc::is_export_handle_type_supported(pool_props.handleTypes), "Requested IPC memory handle type not supported"); pool_props.location.type = cudaMemLocationTypeDevice; - pool_props.location.id = rmm::detail::current_device().value(); + pool_props.location.id = rmm::get_current_cuda_device().value(); cudaMemPool_t cuda_pool_handle{}; RMM_CUDA_TRY(rmm::detail::async_alloc::cudaMemPoolCreate(&cuda_pool_handle, &pool_props)); pool_ = cuda_async_view_memory_resource{cuda_pool_handle}; diff --git a/include/rmm/mr/device/cuda_async_view_memory_resource.hpp b/include/rmm/mr/device/cuda_async_view_memory_resource.hpp index 191e4741d..fc40baac3 100644 --- a/include/rmm/mr/device/cuda_async_view_memory_resource.hpp +++ b/include/rmm/mr/device/cuda_async_view_memory_resource.hpp @@ -60,7 +60,7 @@ class cuda_async_view_memory_resource final : public device_memory_resource { }()} { // Check if cudaMallocAsync Memory pool supported - auto const device = rmm::detail::current_device(); + auto const device = rmm::get_current_cuda_device(); int cuda_pool_supported{}; auto result = cudaDeviceGetAttribute(&cuda_pool_supported, cudaDevAttrMemoryPoolsSupported, 
device.value()); diff --git a/include/rmm/mr/device/per_device_resource.hpp b/include/rmm/mr/device/per_device_resource.hpp index 4ddbd874a..f37f213a9 100644 --- a/include/rmm/mr/device/per_device_resource.hpp +++ b/include/rmm/mr/device/per_device_resource.hpp @@ -196,7 +196,7 @@ inline device_memory_resource* set_per_device_resource(cuda_device_id device_id, */ inline device_memory_resource* get_current_device_resource() { - return get_per_device_resource(rmm::detail::current_device()); + return get_per_device_resource(rmm::get_current_cuda_device()); } /** @@ -225,6 +225,6 @@ inline device_memory_resource* get_current_device_resource() */ inline device_memory_resource* set_current_device_resource(device_memory_resource* new_mr) { - return set_per_device_resource(rmm::detail::current_device(), new_mr); + return set_per_device_resource(rmm::get_current_cuda_device(), new_mr); } } // namespace rmm::mr diff --git a/tests/mr/device/cuda_async_view_mr_tests.cpp b/tests/mr/device/cuda_async_view_mr_tests.cpp index 86cb6f106..209429b4b 100644 --- a/tests/mr/device/cuda_async_view_mr_tests.cpp +++ b/tests/mr/device/cuda_async_view_mr_tests.cpp @@ -31,7 +31,7 @@ TEST(PoolTest, UsePool) { cudaMemPool_t memPool{}; RMM_CUDA_TRY(rmm::detail::async_alloc::cudaDeviceGetDefaultMemPool( - &memPool, rmm::detail::current_device().value())); + &memPool, rmm::get_current_cuda_device().value())); const auto pool_init_size{100}; cuda_async_view_mr mr{memPool}; @@ -44,7 +44,7 @@ TEST(PoolTest, NotTakingOwnershipOfPool) { cudaMemPoolProps poolProps = {}; poolProps.allocType = cudaMemAllocationTypePinned; - poolProps.location.id = rmm::detail::current_device().value(); + poolProps.location.id = rmm::get_current_cuda_device().value(); poolProps.location.type = cudaMemLocationTypeDevice; cudaMemPool_t memPool{}; From 429c71d5dfa475cee28b4d3ebcd726102ffbdbf5 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Thu, 31 Aug 2023 06:33:27 +0000 Subject: [PATCH 03/11] Add failing (in debug build) multidevice pool test --- tests/mr/device/pool_mr_tests.cpp | 39 +++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/tests/mr/device/pool_mr_tests.cpp b/tests/mr/device/pool_mr_tests.cpp index c5df1951c..a43088c4e 100644 --- a/tests/mr/device/pool_mr_tests.cpp +++ b/tests/mr/device/pool_mr_tests.cpp @@ -14,10 +14,12 @@ * limitations under the License. 
*/ +#include "rmm/cuda_device.hpp" #include #include #include #include +#include #include #include #include @@ -150,5 +152,42 @@ TEST(PoolTest, UpstreamDoesntSupportMemInfo) mr2.deallocate(ptr, 1024); } +TEST(PoolTest, MultidevicePool) +{ + using MemoryResource = rmm::mr::pool_memory_resource; + + // Get the number of cuda devices + int num_devices = rmm::get_num_cuda_devices(); + + // only run on multidevice systems + if (num_devices >= 2) { + rmm::mr::cuda_memory_resource general_mr; + + // initializing pool_memory_resource of multiple devices + int devices = 2; + size_t pool_size = 1024; + std::vector> mrs; + + for (int i = 0; i < devices; ++i) { + RMM_CUDA_TRY(cudaSetDevice(i)); + auto mr = std::make_shared(&general_mr, pool_size, pool_size); + rmm::mr::set_per_device_resource(rmm::cuda_device_id{i}, mr.get()); + mrs.emplace_back(mr); + } + + { + RMM_CUDA_TRY(cudaSetDevice(0)); + rmm::device_buffer buf_a(16, rmm::cuda_stream_per_thread, mrs[0].get()); + + { + RMM_CUDA_TRY(cudaSetDevice(1)); + rmm::device_buffer buf_b(16, rmm::cuda_stream_per_thread, mrs[1].get()); + } + + RMM_CUDA_TRY(cudaSetDevice(0)); + } + } +} + } // namespace } // namespace rmm::test From f01d797cd0aa2d0d1c6dd5662dbdc99da8b59e24 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Thu, 31 Aug 2023 07:12:39 +0000 Subject: [PATCH 04/11] Store per-device TLS event wrappers instead of a single event wrapper for each thread. --- .../detail/stream_ordered_memory_resource.hpp | 36 ++++++++++++------- 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/include/rmm/mr/device/detail/stream_ordered_memory_resource.hpp b/include/rmm/mr/device/detail/stream_ordered_memory_resource.hpp index 53575e5ce..5f044b537 100644 --- a/include/rmm/mr/device/detail/stream_ordered_memory_resource.hpp +++ b/include/rmm/mr/device/detail/stream_ordered_memory_resource.hpp @@ -15,15 +15,16 @@ */ #pragma once +#include #include #include #include #include -#include - #include +#include + #include #include #include @@ -288,17 +289,26 @@ class stream_ordered_memory_resource : public crtp, public device_ stream_event_pair get_event(cuda_stream_view stream) { if (stream.is_per_thread_default()) { - // Create a thread-local shared event wrapper. Shared pointers in the thread and in each MR - // instance ensures it is destroyed cleaned up only after all are finished with it. - thread_local auto event_tls = std::make_shared(); - default_stream_events.insert(event_tls); - return stream_event_pair{stream.value(), event_tls->event}; + // Create a thread-local shared event wrapper for each device. Shared pointers in the thread + // and in each MR instance ensure the wrappers are destroyed only after all are finished + // with them. + thread_local std::vector> events_tls( + rmm::get_num_cuda_devices()); + auto event = [&, device_id = this->device_id_]() { + if (events_tls[device_id.value()]) { return events_tls[device_id.value()]->event; } + + auto event = std::make_shared(); + events_tls[device_id.value()] = event; + this->default_stream_events.insert(event); + return event->event; + }(); + return stream_event_pair{stream.value(), event}; } // We use cudaStreamLegacy as the event map key for the default stream for consistency between // PTDS and non-PTDS mode. In PTDS mode, the cudaStreamLegacy map key will only exist if the // user explicitly passes it, so it is used as the default location for the free list - // at construction. For consistency, the same key is used for null stream free lists in non-PTDS - // mode. + // at construction. 
For consistency, the same key is used for null stream free lists in + // non-PTDS mode. // NOLINTNEXTLINE(cppcoreguidelines-pro-type-cstyle-cast) auto* const stream_to_store = stream.is_default() ? cudaStreamLegacy : stream.value(); @@ -496,11 +506,13 @@ class stream_ordered_memory_resource : public crtp, public device_ // bidirectional mapping between non-default streams and events std::unordered_map stream_events_; - // shared pointers to events keeps the events alive as long as either the thread that created them - // or the MR that is using them exists. + // shared pointers to events keeps the events alive as long as either the thread that created + // them or the MR that is using them exists. std::set> default_stream_events; std::mutex mtx_; // mutex for thread-safe access -}; // namespace detail + + rmm::cuda_device_id device_id_{rmm::get_current_cuda_device()}; +}; // namespace detail } // namespace rmm::mr::detail From 79a298d7b1542b25cd46a231d8b25ea43a8050c5 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Thu, 31 Aug 2023 07:38:22 +0000 Subject: [PATCH 05/11] style --- include/rmm/detail/dynamic_load_runtime.hpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/include/rmm/detail/dynamic_load_runtime.hpp b/include/rmm/detail/dynamic_load_runtime.hpp index dc0a912e7..b45dbae25 100644 --- a/include/rmm/detail/dynamic_load_runtime.hpp +++ b/include/rmm/detail/dynamic_load_runtime.hpp @@ -113,8 +113,9 @@ struct async_alloc { static auto driver_supports_pool{[] { int cuda_pool_supported{}; - auto result = cudaDeviceGetAttribute( - &cuda_pool_supported, cudaDevAttrMemoryPoolsSupported, rmm::get_current_cuda_device().value()); + auto result = cudaDeviceGetAttribute(&cuda_pool_supported, + cudaDevAttrMemoryPoolsSupported, + rmm::get_current_cuda_device().value()); return result == cudaSuccess and cuda_pool_supported == 1; }()}; return runtime_supports_pool and driver_supports_pool; From d27f57468e4e78791f2bfc575cacc48a17c7d421 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Tue, 5 Sep 2023 02:35:11 +0000 Subject: [PATCH 06/11] Add compile_commands.json to .gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 1ab57e4d4..ad6c8ebf7 100644 --- a/.gitignore +++ b/.gitignore @@ -14,6 +14,7 @@ DartConfiguration.tcl .DS_Store *.manifest *.spec +compile_commands.json ## Python build directories & artifacts dist/ From f299c032bb48cb0af7b6d3ee37f2420f0e33b181 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Tue, 5 Sep 2023 02:35:27 +0000 Subject: [PATCH 07/11] Header in angle brackets --- tests/mr/device/pool_mr_tests.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/mr/device/pool_mr_tests.cpp b/tests/mr/device/pool_mr_tests.cpp index a43088c4e..4a234d2f9 100644 --- a/tests/mr/device/pool_mr_tests.cpp +++ b/tests/mr/device/pool_mr_tests.cpp @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -#include "rmm/cuda_device.hpp" +#include #include #include #include @@ -102,7 +102,7 @@ TEST(PoolTest, ForceGrowth) EXPECT_NO_THROW(mr.allocate(1000)); EXPECT_THROW(mr.allocate(4000), rmm::out_of_memory); // too much EXPECT_NO_THROW(mr.allocate(500)); - EXPECT_NO_THROW(mr.allocate(2000)); // fits + EXPECT_NO_THROW(mr.allocate(2000)); // fits } } From 28c094d0804d0d2fb01552477d879d60d203442f Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Wed, 6 Sep 2023 00:21:07 +0000 Subject: [PATCH 08/11] style --- tests/mr/device/pool_mr_tests.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/mr/device/pool_mr_tests.cpp b/tests/mr/device/pool_mr_tests.cpp index 4a234d2f9..6bcd90527 100644 --- a/tests/mr/device/pool_mr_tests.cpp +++ b/tests/mr/device/pool_mr_tests.cpp @@ -102,7 +102,7 @@ TEST(PoolTest, ForceGrowth) EXPECT_NO_THROW(mr.allocate(1000)); EXPECT_THROW(mr.allocate(4000), rmm::out_of_memory); // too much EXPECT_NO_THROW(mr.allocate(500)); - EXPECT_NO_THROW(mr.allocate(2000)); // fits + EXPECT_NO_THROW(mr.allocate(2000)); // fits } } From bbe46bd445090e349c9f5040652e8e7b8868baff Mon Sep 17 00:00:00 2001 From: Mark Harris <783069+harrism@users.noreply.github.com> Date: Wed, 6 Sep 2023 23:44:51 +0000 Subject: [PATCH 09/11] Add doc and example about dtors under Multiple Devices --- README.md | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 8d65b7d33..0166ea09b 100644 --- a/README.md +++ b/README.md @@ -354,11 +354,32 @@ objects for each device and sets them as the per-device resource for that device ```c++ std::vector> per_device_pools; for(int i = 0; i < N; ++i) { - cudaSetDevice(i); // set device i before creating MR - // Use a vector of unique_ptr to maintain the lifetime of the MRs - per_device_pools.push_back(std::make_unique()); - // Set the per-device resource for device i - set_per_device_resource(cuda_device_id{i}, &per_device_pools.back()); + cudaSetDevice(i); // set device i before creating MR + // Use a vector of unique_ptr to maintain the lifetime of the MRs + per_device_pools.push_back(std::make_unique()); + // Set the per-device resource for device i + set_per_device_resource(cuda_device_id{i}, &per_device_pools.back()); +} +``` + +Note that the CUDA device that is current when creating a `device_memory_resource` must also be +current any time that `device_memory_resource` is used to deallocate memory, including in a +destructor. This affects RAII classes like `rmm::device_buffer` and `rmm::device_uvector`. Here's an +(incorrect) example that assumes the above example loop has been run to create a +`pool_memory_resource` for each device. A correct example adds a call to `cudaSetDevice(1)` on the +line of the error comment. + +```c++ +{ + RMM_CUDA_TRY(cudaSetDevice(0)); + rmm::device_buffer buf_a(16); + + { + RMM_CUDA_TRY(cudaSetDevice(1)); + rmm::device_buffer buf_b(16); + } + + // Error: when buf_a is destroyed, the current device must be 0, but it is 1 } ``` From 397336b64fd263574cb68d8b0e198310d307fbdf Mon Sep 17 00:00:00 2001 From: Mark Harris <783069+harrism@users.noreply.github.com> Date: Wed, 13 Sep 2023 09:39:21 +1000 Subject: [PATCH 10/11] Fix device ID typo. 
Co-authored-by: Bradley Dice --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 0166ea09b..2fa4761df 100644 --- a/README.md +++ b/README.md @@ -366,7 +366,7 @@ Note that the CUDA device that is current when creating a `device_memory_resourc current any time that `device_memory_resource` is used to deallocate memory, including in a destructor. This affects RAII classes like `rmm::device_buffer` and `rmm::device_uvector`. Here's an (incorrect) example that assumes the above example loop has been run to create a -`pool_memory_resource` for each device. A correct example adds a call to `cudaSetDevice(1)` on the +`pool_memory_resource` for each device. A correct example adds a call to `cudaSetDevice(0)` on the line of the error comment. ```c++ From 44c89cc8e0e099b4626f90498191bcd7325ea389 Mon Sep 17 00:00:00 2001 From: Mark Harris <783069+harrism@users.noreply.github.com> Date: Wed, 13 Sep 2023 00:25:03 +0000 Subject: [PATCH 11/11] Eliminate extraneous shared pointer copy and be smarter about cudaSetDevice in RAII class. --- include/rmm/cuda_device.hpp | 15 ++++++++++----- .../detail/stream_ordered_memory_resource.hpp | 5 ++--- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/include/rmm/cuda_device.hpp b/include/rmm/cuda_device.hpp index cbed4de1a..8d355ee23 100644 --- a/include/rmm/cuda_device.hpp +++ b/include/rmm/cuda_device.hpp @@ -51,7 +51,7 @@ struct cuda_device_id { */ inline cuda_device_id get_current_cuda_device() { - cuda_device_id::value_type dev_id{}; + cuda_device_id::value_type dev_id{-1}; RMM_ASSERT_CUDA_SUCCESS(cudaGetDevice(&dev_id)); return cuda_device_id{dev_id}; } @@ -63,7 +63,7 @@ inline cuda_device_id get_current_cuda_device() */ inline int get_num_cuda_devices() { - cuda_device_id::value_type num_dev{}; + cuda_device_id::value_type num_dev{-1}; RMM_ASSERT_CUDA_SUCCESS(cudaGetDeviceCount(&num_dev)); return num_dev; } @@ -78,14 +78,18 @@ struct cuda_set_device_raii { * * @param dev_id The device to set as the current CUDA device */ - explicit cuda_set_device_raii(cuda_device_id dev_id) : old_device_{get_current_cuda_device()} + explicit cuda_set_device_raii(cuda_device_id dev_id) + : old_device_{get_current_cuda_device()}, needs_reset_{old_device_.value() != dev_id.value()} { - RMM_ASSERT_CUDA_SUCCESS(cudaSetDevice(dev_id.value())); + if (needs_reset_) RMM_ASSERT_CUDA_SUCCESS(cudaSetDevice(dev_id.value())); } /** * @brief Reactivates the previous CUDA device */ - ~cuda_set_device_raii() noexcept { RMM_ASSERT_CUDA_SUCCESS(cudaSetDevice(old_device_.value())); } + ~cuda_set_device_raii() noexcept + { + if (needs_reset_) RMM_ASSERT_CUDA_SUCCESS(cudaSetDevice(old_device_.value())); + } cuda_set_device_raii(cuda_set_device_raii const&) = delete; cuda_set_device_raii& operator=(cuda_set_device_raii const&) = delete; @@ -94,6 +98,7 @@ struct cuda_set_device_raii { private: cuda_device_id old_device_; + bool needs_reset_; }; } // namespace rmm diff --git a/include/rmm/mr/device/detail/stream_ordered_memory_resource.hpp b/include/rmm/mr/device/detail/stream_ordered_memory_resource.hpp index 5f044b537..f071717c0 100644 --- a/include/rmm/mr/device/detail/stream_ordered_memory_resource.hpp +++ b/include/rmm/mr/device/detail/stream_ordered_memory_resource.hpp @@ -297,10 +297,9 @@ class stream_ordered_memory_resource : public crtp, public device_ auto event = [&, device_id = this->device_id_]() { if (events_tls[device_id.value()]) { return events_tls[device_id.value()]->event; } - auto event = std::make_shared(); - 
events_tls[device_id.value()] = event;
+        auto event = std::make_shared<event_wrapper>();
         this->default_stream_events.insert(event);
-        return event->event;
+        return (events_tls[device_id.value()] = std::move(event))->event;
       }();
       return stream_event_pair{stream.value(), event};
     }
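
For reference, a minimal sketch of how the `cuda_set_device_raii` helper and the per-device pool resources introduced in this patch series might be used together. Only `cuda_set_device_raii`, `cuda_device_id`, `get_num_cuda_devices`, `set_per_device_resource`, and `pool_memory_resource` come from the patches above; the headers, pool sizes, and `main` scaffolding are illustrative assumptions, not part of the patches.

```c++
// Illustrative sketch (assumed scaffolding): one pool per device, each created
// and used while its device is current via the new cuda_set_device_raii guard.
#include <rmm/cuda_device.hpp>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_buffer.hpp>
#include <rmm/mr/device/cuda_memory_resource.hpp>
#include <rmm/mr/device/per_device_resource.hpp>
#include <rmm/mr/device/pool_memory_resource.hpp>

#include <memory>
#include <vector>

int main()
{
  using pool_mr = rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource>;

  rmm::mr::cuda_memory_resource upstream;
  std::vector<std::shared_ptr<pool_mr>> pools;

  // Create a pool per device and register it as that device's default resource.
  for (int dev = 0; dev < rmm::get_num_cuda_devices(); ++dev) {
    rmm::cuda_set_device_raii set_dev{rmm::cuda_device_id{dev}};  // device `dev` is current here
    pools.push_back(std::make_shared<pool_mr>(&upstream, 1u << 20, 1u << 20));  // sizes are arbitrary
    rmm::mr::set_per_device_resource(rmm::cuda_device_id{dev}, pools.back().get());
  }

  {
    // The guard keeps device 0 current for the whole scope, so buf is both
    // allocated and deallocated (in its destructor) on device 0's pool.
    rmm::cuda_set_device_raii set_dev0{rmm::cuda_device_id{0}};
    rmm::device_buffer buf(16, rmm::cuda_stream_per_thread);
  }

  return 0;
}
```

Scoping the RAII guard around each `device_buffer` keeps the creating device current when the buffer's destructor runs, which is exactly the requirement documented in the README hunk added by patch 09.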