From 650e73d9dc793c53d5e54fa5b3cf4f8986d18ca4 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Thu, 4 Jul 2024 18:14:41 +0800
Subject: [PATCH 1/5] Avoid thrust vector initialization.

- Add wrapper for rmm device uvector.
- Split up the `Resize` method for HDV.

---
 include/xgboost/host_device_vector.h        |   4 +-
 src/common/cuda_context.cuh                 |   2 +-
 src/common/device_helpers.cuh               | 260 ++--------------
 src/common/device_vector.cu                 |  25 ++
 src/common/device_vector.cuh                | 313 ++++++++++++++++++++
 src/common/host_device_vector.cc            |   5 +
 src/common/host_device_vector.cu            | 113 +++++--
 src/common/quantile.cuh                     |   8 +-
 src/metric/auc.cu                           |   3 +-
 src/tree/updater_gpu_hist.cu                |   4 +-
 tests/cpp/common/test_device_vector.cu      |  20 ++
 tests/cpp/common/test_host_device_vector.cu |  30 ++
 tests/cpp/data/test_array_interface.h       |  13 +-
 13 files changed, 517 insertions(+), 283 deletions(-)
 create mode 100644 src/common/device_vector.cu
 create mode 100644 src/common/device_vector.cuh
 create mode 100644 tests/cpp/common/test_device_vector.cu

diff --git a/include/xgboost/host_device_vector.h b/include/xgboost/host_device_vector.h
index 9a53d38583ca..c4d12da53079 100644
--- a/include/xgboost/host_device_vector.h
+++ b/include/xgboost/host_device_vector.h
@@ -135,7 +135,9 @@ class HostDeviceVector {

   void SetDevice(DeviceOrd device) const;

-  void Resize(size_t new_size, T v = T());
+  void Resize(std::size_t new_size);
+  /** @brief Resize and initialize the data if the new size is larger than the old size. */
+  void Resize(size_t new_size, T v);

   using value_type = T;  // NOLINT

diff --git a/src/common/cuda_context.cuh b/src/common/cuda_context.cuh
index c8b2e07927c9..7e1db8e3bf2f 100644
--- a/src/common/cuda_context.cuh
+++ b/src/common/cuda_context.cuh
@@ -18,7 +18,7 @@ struct CUDAContext {
   /**
    * \brief Caching thrust policy.
   */
  auto CTP() const {
-#if THRUST_MAJOR_VERSION >= 2
+#if THRUST_MAJOR_VERSION >= 2 || defined(XGBOOST_USE_RMM)
    return thrust::cuda::par_nosync(caching_alloc_).on(dh::DefaultStream());
 #else
    return thrust::cuda::par(caching_alloc_).on(dh::DefaultStream());
 #endif
diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh
index 7cd00f6f6112..98a76d72a263 100644
--- a/src/common/device_helpers.cuh
+++ b/src/common/device_helpers.cuh
@@ -1,26 +1,21 @@
 /**
- * Copyright 2017-2023 XGBoost contributors
+ * Copyright 2017-2024, XGBoost contributors
  */
 #pragma once
-#include <thrust/binary_search.h>  // thrust::upper_bound
-#include
-#include
-#include
+#include <thrust/binary_search.h>   // thrust::upper_bound
+#include <thrust/device_ptr.h>      // for device_ptr
+#include <thrust/device_vector.h>   // for device_vector
 #include <thrust/execution_policy.h>  // thrust::seq
-#include <thrust/gather.h>  // gather
-#include
+#include <thrust/iterator/discard_iterator.h>  // for discard_iterator
 #include <thrust/iterator/transform_output_iterator.h>  // make_transform_output_iterator
-#include
-#include
 #include
 #include
-#include
 #include
 #include
 #include <cstddef>  // for size_t
 #include
-#include
+#include <cub/util_type.cuh>  // for UnitWord
 #include
 #include
 #include
@@ -28,22 +23,14 @@
 #include "../collective/communicator-inl.h"
 #include "common.h"
+#include "device_vector.cuh"
 #include "xgboost/host_device_vector.h"
 #include "xgboost/logging.h"
 #include "xgboost/span.h"

-#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
-#include "rmm/mr/device/per_device_resource.hpp"
-#include "rmm/mr/device/thrust_allocator_adaptor.hpp"
-#include "rmm/version_config.hpp"
-
-#if !defined(RMM_VERSION_MAJOR) || !defined(RMM_VERSION_MINOR)
-#error "Please use RMM version 0.18 or later"
-#elif RMM_VERSION_MAJOR == 0 && RMM_VERSION_MINOR < 18
-#error "Please use RMM version 0.18 or later"
-#endif  // !defined(RMM_VERSION_MAJOR) || !defined(RMM_VERSION_MINOR)
-
-#endif  // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
+#if defined(XGBOOST_USE_RMM)
+#include
+#endif  // defined(XGBOOST_USE_RMM)

 #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 || defined(__clang__)

@@ -285,91 +272,6 @@ void Iota(Container array, cudaStream_t stream) {
   LaunchN(array.size(), stream, [=] __device__(size_t i) { array[i] = i; });
 }

-namespace detail {
-/** \brief Keeps track of global device memory allocations.
Thread safe.*/ -class MemoryLogger { - // Information for a single device - struct DeviceStats { - size_t currently_allocated_bytes{ 0 }; - size_t peak_allocated_bytes{ 0 }; - size_t num_allocations{ 0 }; - size_t num_deallocations{ 0 }; - std::map device_allocations; - void RegisterAllocation(void *ptr, size_t n) { - device_allocations[ptr] = n; - currently_allocated_bytes += n; - peak_allocated_bytes = std::max(peak_allocated_bytes, currently_allocated_bytes); - num_allocations++; - CHECK_GT(num_allocations, num_deallocations); - } - void RegisterDeallocation(void *ptr, size_t n, int current_device) { - auto itr = device_allocations.find(ptr); - if (itr == device_allocations.end()) { - LOG(WARNING) << "Attempting to deallocate " << n << " bytes on device " << current_device - << " that was never allocated\n" - << dmlc::StackTrace(); - } else { - num_deallocations++; - CHECK_LE(num_deallocations, num_allocations); - currently_allocated_bytes -= itr->second; - device_allocations.erase(itr); - } - } - }; - DeviceStats stats_; - std::mutex mutex_; - -public: - void RegisterAllocation(void *ptr, size_t n) { - if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) { - return; - } - std::lock_guard guard(mutex_); - int current_device; - safe_cuda(cudaGetDevice(¤t_device)); - stats_.RegisterAllocation(ptr, n); - } - void RegisterDeallocation(void *ptr, size_t n) { - if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) { - return; - } - std::lock_guard guard(mutex_); - int current_device; - safe_cuda(cudaGetDevice(¤t_device)); - stats_.RegisterDeallocation(ptr, n, current_device); - } - size_t PeakMemory() const { - return stats_.peak_allocated_bytes; - } - size_t CurrentlyAllocatedBytes() const { - return stats_.currently_allocated_bytes; - } - void Clear() - { - stats_ = DeviceStats(); - } - - void Log() { - if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) { - return; - } - std::lock_guard guard(mutex_); - int current_device; - safe_cuda(cudaGetDevice(¤t_device)); - LOG(CONSOLE) << "======== Device " << current_device << " Memory Allocations: " - << " ========"; - LOG(CONSOLE) << "Peak memory usage: " - << stats_.peak_allocated_bytes / 1048576 << "MiB"; - LOG(CONSOLE) << "Number of allocations: " << stats_.num_allocations; - } -}; -} // namespace detail - -inline detail::MemoryLogger &GlobalMemoryLogger() { - static detail::MemoryLogger memory_logger; - return memory_logger; -} - // dh::DebugSyncDevice(__FILE__, __LINE__); inline void DebugSyncDevice(std::string file="", int32_t line = -1) { if (file != "" && line != -1) { @@ -380,134 +282,6 @@ inline void DebugSyncDevice(std::string file="", int32_t line = -1) { safe_cuda(cudaGetLastError()); } -namespace detail { - -#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 -template -using XGBBaseDeviceAllocator = rmm::mr::thrust_allocator; -#else // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 -template -using XGBBaseDeviceAllocator = thrust::device_malloc_allocator; -#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 - -inline void ThrowOOMError(std::string const& err, size_t bytes) { - auto device = CurrentDevice(); - auto rank = xgboost::collective::GetRank(); - std::stringstream ss; - ss << "Memory allocation error on worker " << rank << ": " << err << "\n" - << "- Free memory: " << AvailableMemory(device) << "\n" - << "- Requested memory: " << bytes << std::endl; - LOG(FATAL) << ss.str(); -} - -/** - * \brief Default memory allocator, uses cudaMalloc/Free and 
logs allocations if verbose. - */ -template -struct XGBDefaultDeviceAllocatorImpl : XGBBaseDeviceAllocator { - using SuperT = XGBBaseDeviceAllocator; - using pointer = thrust::device_ptr; // NOLINT - template - struct rebind // NOLINT - { - using other = XGBDefaultDeviceAllocatorImpl; // NOLINT - }; - pointer allocate(size_t n) { // NOLINT - pointer ptr; - try { - ptr = SuperT::allocate(n); - dh::safe_cuda(cudaGetLastError()); - } catch (const std::exception &e) { - ThrowOOMError(e.what(), n * sizeof(T)); - } - GlobalMemoryLogger().RegisterAllocation(ptr.get(), n * sizeof(T)); - return ptr; - } - void deallocate(pointer ptr, size_t n) { // NOLINT - GlobalMemoryLogger().RegisterDeallocation(ptr.get(), n * sizeof(T)); - SuperT::deallocate(ptr, n); - } -#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 - XGBDefaultDeviceAllocatorImpl() - : SuperT(rmm::cuda_stream_per_thread, rmm::mr::get_current_device_resource()) {} -#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 -}; - -/** - * \brief Caching memory allocator, uses cub::CachingDeviceAllocator as a back-end, unless - * RMM pool allocator is enabled. Does not initialise memory on construction. - */ -template -struct XGBCachingDeviceAllocatorImpl : XGBBaseDeviceAllocator { - using SuperT = XGBBaseDeviceAllocator; - using pointer = thrust::device_ptr; // NOLINT - template - struct rebind // NOLINT - { - using other = XGBCachingDeviceAllocatorImpl; // NOLINT - }; - cub::CachingDeviceAllocator& GetGlobalCachingAllocator() { - // Configure allocator with maximum cached bin size of ~1GB and no limit on - // maximum cached bytes - thread_local std::unique_ptr allocator{ - std::make_unique(2, 9, 29)}; - return *allocator; - } - pointer allocate(size_t n) { // NOLINT - pointer thrust_ptr; - if (use_cub_allocator_) { - T* raw_ptr{nullptr}; - auto errc = GetGlobalCachingAllocator().DeviceAllocate(reinterpret_cast(&raw_ptr), - n * sizeof(T)); - if (errc != cudaSuccess) { - ThrowOOMError("Caching allocator", n * sizeof(T)); - } - thrust_ptr = pointer(raw_ptr); - } else { - try { - thrust_ptr = SuperT::allocate(n); - dh::safe_cuda(cudaGetLastError()); - } catch (const std::exception &e) { - ThrowOOMError(e.what(), n * sizeof(T)); - } - } - GlobalMemoryLogger().RegisterAllocation(thrust_ptr.get(), n * sizeof(T)); - return thrust_ptr; - } - void deallocate(pointer ptr, size_t n) { // NOLINT - GlobalMemoryLogger().RegisterDeallocation(ptr.get(), n * sizeof(T)); - if (use_cub_allocator_) { - GetGlobalCachingAllocator().DeviceFree(ptr.get()); - } else { - SuperT::deallocate(ptr, n); - } - } -#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 - XGBCachingDeviceAllocatorImpl() - : SuperT(rmm::cuda_stream_per_thread, rmm::mr::get_current_device_resource()), - use_cub_allocator_(!xgboost::GlobalConfigThreadLocalStore::Get()->use_rmm) {} -#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 - XGBOOST_DEVICE void construct(T *) {} // NOLINT - private: - bool use_cub_allocator_{true}; -}; -} // namespace detail - -// Declare xgboost allocators -// Replacement of allocator with custom backend should occur here -template -using XGBDeviceAllocator = detail::XGBDefaultDeviceAllocatorImpl; -/*! Be careful that the initialization constructor is a no-op, which means calling - * `vec.resize(n)` won't initialize the memory region to 0. Instead use - * `vec.resize(n, 0)`*/ -template -using XGBCachingDeviceAllocator = detail::XGBCachingDeviceAllocatorImpl; -/** \brief Specialisation of thrust device vector using custom allocator. 
*/ -template -using device_vector = thrust::device_vector>; // NOLINT -template -using caching_device_vector = thrust::device_vector>; // NOLINT - // Faster to instantiate than caching_device_vector and invokes no synchronisation // Use this where vector functionality (e.g. resize) is not required template @@ -734,6 +508,11 @@ xgboost::common::Span ToSpan(thrust::device_vector& vec, return ToSpan(vec, offset, size); } +template +xgboost::common::Span ToSpan(DeviceUVector &vec) { + return {vec.data(), vec.size()}; +} + // thrust begin, similiar to std::begin template thrust::device_ptr tbegin(xgboost::HostDeviceVector& vector) { // NOLINT @@ -1117,6 +896,15 @@ class CUDAStream { void Sync() { this->View().Sync(); } }; +inline auto CachingThrustPolicy() { + XGBCachingDeviceAllocator alloc; +#if THRUST_MAJOR_VERSION >= 2 || defined(XGBOOST_USE_RMM) + return thrust::cuda::par_nosync(alloc).on(DefaultStream()); +#else + return thrust::cuda::par(alloc).on(DefaultStream()); +#endif // THRUST_MAJOR_VERSION >= 2 || defined(XGBOOST_USE_RMM) +} + // Force nvcc to load data as constant template class LDGIterator { diff --git a/src/common/device_vector.cu b/src/common/device_vector.cu new file mode 100644 index 000000000000..6c1f6e8a4376 --- /dev/null +++ b/src/common/device_vector.cu @@ -0,0 +1,25 @@ +/** + * Copyright 2017-2024, XGBoost contributors + */ +#include "../collective/communicator-inl.h" // for GetRank +#include "device_helpers.cuh" // for CurrentDevice +#include "device_vector.cuh" + +namespace dh { +namespace detail { +void ThrowOOMError(std::string const &err, size_t bytes) { + auto device = CurrentDevice(); + auto rank = xgboost::collective::GetRank(); + std::stringstream ss; + ss << "Memory allocation error on worker " << rank << ": " << err << "\n" + << "- Free memory: " << dh::AvailableMemory(device) << "\n" + << "- Requested memory: " << bytes << std::endl; + LOG(FATAL) << ss.str(); +} +} // namespace detail + +LoggingResource *GlobalLoggingResource() { + static auto mr{std::make_unique()}; + return mr.get(); +} +} // namespace dh diff --git a/src/common/device_vector.cuh b/src/common/device_vector.cuh new file mode 100644 index 000000000000..e2d1f16cbb9e --- /dev/null +++ b/src/common/device_vector.cuh @@ -0,0 +1,313 @@ +/** + * Copyright 2017-2024, XGBoost Contributors + */ +#pragma once +#include // for device_malloc_allocator +#include // for device_ptr +#include // for device_vector + +#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 +#include // for device_uvector +#include // for device_memory_resource +#include // for get_current_device_resource +#include // for thrust_allocator +#include // for RMM_VERSION_MAJOR + +#include "xgboost/global_config.h" // for GlobalConfigThreadLocalStore + +#if !defined(RMM_VERSION_MAJOR) || !defined(RMM_VERSION_MINOR) + +#error "Please use RMM version 0.18 or later" +#elif RMM_VERSION_MAJOR == 0 && RMM_VERSION_MINOR < 18 +#error "Please use RMM version 0.18 or later" +#endif // !defined(RMM_VERSION_MAJOR) || !defined(RMM_VERSION_MINOR) + +#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 + +#include // for size_t +#include // for CachingDeviceAllocator +#include // for CurrentDevice +#include // for map +#include // for unique_ptr + +#include "common.h" // for safe_cuda +#include "xgboost/logging.h" + +namespace dh { +namespace detail { +/** \brief Keeps track of global device memory allocations. 
Thread safe.*/ +class MemoryLogger { + // Information for a single device + struct DeviceStats { + std::size_t currently_allocated_bytes{0}; + size_t peak_allocated_bytes{0}; + size_t num_allocations{0}; + size_t num_deallocations{0}; + std::map device_allocations; + void RegisterAllocation(void *ptr, size_t n) { + device_allocations[ptr] = n; + currently_allocated_bytes += n; + peak_allocated_bytes = std::max(peak_allocated_bytes, currently_allocated_bytes); + num_allocations++; + CHECK_GT(num_allocations, num_deallocations); + } + void RegisterDeallocation(void *ptr, size_t n, int current_device) { + auto itr = device_allocations.find(ptr); + if (itr == device_allocations.end()) { + LOG(WARNING) << "Attempting to deallocate " << n << " bytes on device " << current_device + << " that was never allocated\n" + << dmlc::StackTrace(); + } else { + num_deallocations++; + CHECK_LE(num_deallocations, num_allocations); + currently_allocated_bytes -= itr->second; + device_allocations.erase(itr); + } + } + }; + DeviceStats stats_; + std::mutex mutex_; + + public: + void RegisterAllocation(void *ptr, size_t n) { + if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) { + return; + } + std::lock_guard guard(mutex_); + stats_.RegisterAllocation(ptr, n); + } + void RegisterDeallocation(void *ptr, size_t n) { + if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) { + return; + } + std::lock_guard guard(mutex_); + stats_.RegisterDeallocation(ptr, n, cub::CurrentDevice()); + } + size_t PeakMemory() const { return stats_.peak_allocated_bytes; } + size_t CurrentlyAllocatedBytes() const { return stats_.currently_allocated_bytes; } + void Clear() { stats_ = DeviceStats(); } + + void Log() { + if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) { + return; + } + std::lock_guard guard(mutex_); + int current_device; + dh::safe_cuda(cudaGetDevice(¤t_device)); + LOG(CONSOLE) << "======== Device " << current_device << " Memory Allocations: " + << " ========"; + LOG(CONSOLE) << "Peak memory usage: " << stats_.peak_allocated_bytes / 1048576 << "MiB"; + LOG(CONSOLE) << "Number of allocations: " << stats_.num_allocations; + } +}; + +void ThrowOOMError(std::string const &err, size_t bytes); +} // namespace detail + +inline detail::MemoryLogger &GlobalMemoryLogger() { + static detail::MemoryLogger memory_logger; + return memory_logger; +} + +namespace detail { +#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 +template +using XGBBaseDeviceAllocator = rmm::mr::thrust_allocator; +#else // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 +template +using XGBBaseDeviceAllocator = thrust::device_malloc_allocator; +#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 + +/** + * \brief Default memory allocator, uses cudaMalloc/Free and logs allocations if verbose. 
+ */ +template +struct XGBDefaultDeviceAllocatorImpl : XGBBaseDeviceAllocator { + using SuperT = XGBBaseDeviceAllocator; + using pointer = thrust::device_ptr; // NOLINT + template + struct rebind // NOLINT + { + using other = XGBDefaultDeviceAllocatorImpl; // NOLINT + }; + pointer allocate(size_t n) { // NOLINT + pointer ptr; + try { + ptr = SuperT::allocate(n); + dh::safe_cuda(cudaGetLastError()); + } catch (const std::exception &e) { + detail::ThrowOOMError(e.what(), n * sizeof(T)); + } + GlobalMemoryLogger().RegisterAllocation(ptr.get(), n * sizeof(T)); + return ptr; + } + void deallocate(pointer ptr, size_t n) { // NOLINT + GlobalMemoryLogger().RegisterDeallocation(ptr.get(), n * sizeof(T)); + SuperT::deallocate(ptr, n); + } +#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 + XGBDefaultDeviceAllocatorImpl() + : SuperT(rmm::cuda_stream_per_thread, rmm::mr::get_current_device_resource()) {} +#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 +}; + +/** + * \brief Caching memory allocator, uses cub::CachingDeviceAllocator as a back-end, unless + * RMM pool allocator is enabled. Does not initialise memory on construction. + */ +template +struct XGBCachingDeviceAllocatorImpl : XGBBaseDeviceAllocator { + using SuperT = XGBBaseDeviceAllocator; + using pointer = thrust::device_ptr; // NOLINT + template + struct rebind // NOLINT + { + using other = XGBCachingDeviceAllocatorImpl; // NOLINT + }; + cub::CachingDeviceAllocator &GetGlobalCachingAllocator() { + // Configure allocator with maximum cached bin size of ~1GB and no limit on + // maximum cached bytes + thread_local std::unique_ptr allocator{ + std::make_unique(2, 9, 29)}; + return *allocator; + } + pointer allocate(size_t n) { // NOLINT + pointer thrust_ptr; + if (use_cub_allocator_) { + T *raw_ptr{nullptr}; + auto errc = GetGlobalCachingAllocator().DeviceAllocate(reinterpret_cast(&raw_ptr), + n * sizeof(T)); + if (errc != cudaSuccess) { + detail::ThrowOOMError("Caching allocator", n * sizeof(T)); + } + thrust_ptr = pointer(raw_ptr); + } else { + try { + thrust_ptr = SuperT::allocate(n); + dh::safe_cuda(cudaGetLastError()); + } catch (const std::exception &e) { + detail::ThrowOOMError(e.what(), n * sizeof(T)); + } + } + GlobalMemoryLogger().RegisterAllocation(thrust_ptr.get(), n * sizeof(T)); + return thrust_ptr; + } + void deallocate(pointer ptr, size_t n) { // NOLINT + GlobalMemoryLogger().RegisterDeallocation(ptr.get(), n * sizeof(T)); + if (use_cub_allocator_) { + GetGlobalCachingAllocator().DeviceFree(ptr.get()); + } else { + SuperT::deallocate(ptr, n); + } + } +#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 + XGBCachingDeviceAllocatorImpl() + : SuperT(rmm::cuda_stream_per_thread, rmm::mr::get_current_device_resource()), + use_cub_allocator_(!xgboost::GlobalConfigThreadLocalStore::Get()->use_rmm) {} +#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 + XGBOOST_DEVICE void construct(T *) {} // NOLINT + private: + bool use_cub_allocator_{true}; +}; +} // namespace detail + +// Declare xgboost allocators +// Replacement of allocator with custom backend should occur here +template +using XGBDeviceAllocator = detail::XGBDefaultDeviceAllocatorImpl; + +/** Be careful that the initialization constructor is a no-op, which means calling + * `vec.resize(n)` won't initialize the memory region to 0. Instead use + * `vec.resize(n, 0)` + */ +template +using XGBCachingDeviceAllocator = detail::XGBCachingDeviceAllocatorImpl; + +/** @brief Specialisation of thrust device vector using custom allocator. 
*/ +template +using device_vector = thrust::device_vector>; // NOLINT +template +using caching_device_vector = thrust::device_vector>; // NOLINT + +#if defined(XGBOOST_USE_RMM) +/** + * @brief Similar to `rmm::logging_resource_adaptor`, but uses XGBoost memory logger instead. + */ +class LoggingResource : public rmm::mr::device_memory_resource { + rmm::mr::device_memory_resource *mr_{rmm::mr::get_current_device_resource()}; + + public: + LoggingResource() = default; + ~LoggingResource() override = default; + LoggingResource(LoggingResource const &) = delete; + LoggingResource &operator=(LoggingResource const &) = delete; + LoggingResource(LoggingResource &&) noexcept = default; + LoggingResource &operator=(LoggingResource &&) noexcept = default; + + [[nodiscard]] rmm::device_async_resource_ref get_upstream_resource() const noexcept { // NOLINT + return mr_; + } + [[nodiscard]] rmm::mr::device_memory_resource *get_upstream() const noexcept { // NOLINT + return mr_; + } + + void *do_allocate(std::size_t bytes, rmm::cuda_stream_view stream) override { // NOLINT + try { + auto const ptr = mr_->allocate(bytes, stream); + GlobalMemoryLogger().RegisterAllocation(ptr, bytes); + return ptr; + } catch (rmm::bad_alloc const &e) { + detail::ThrowOOMError(e.what(), bytes); + } + return nullptr; + } + + void do_deallocate(void *ptr, std::size_t bytes, // NOLINT + rmm::cuda_stream_view stream) override { + mr_->deallocate(ptr, bytes, stream); + GlobalMemoryLogger().RegisterDeallocation(ptr, bytes); + } + + [[nodiscard]] bool do_is_equal( // NOLINT + device_memory_resource const &other) const noexcept override { + if (this == &other) { + return true; + } + auto const *cast = dynamic_cast(&other); + if (cast == nullptr) { + return mr_->is_equal(other); + } + return get_upstream_resource() == cast->get_upstream_resource(); + } +}; + +LoggingResource *GlobalLoggingResource(); + +/** + * @brief Container class that doesn't initialize the data. + */ +template +class DeviceUVector : public rmm::device_uvector { + using Super = rmm::device_uvector; + + public: + static constexpr bool NeedInit() { return true; } + + public: + explicit DeviceUVector(std::size_t n) + : Super{n, rmm::cuda_stream_per_thread, GlobalLoggingResource()} {} +}; + +#else + +/** + * @brief Without RMM, the initialization will happen. 
+ */ +template +class DeviceUVector : thrust::device_vector> { + public: + static constexpr bool NeedInit() { return false; } +}; + +#endif // defined(XGBOOST_USE_RMM) +} // namespace dh diff --git a/src/common/host_device_vector.cc b/src/common/host_device_vector.cc index f4973c0428f0..de9e0614a38e 100644 --- a/src/common/host_device_vector.cc +++ b/src/common/host_device_vector.cc @@ -114,6 +114,11 @@ void HostDeviceVector::Resize(size_t new_size, T v) { impl_->Vec().resize(new_size, v); } +template +void HostDeviceVector::Resize(size_t new_size) { + impl_->Vec().resize(new_size, T{}); +} + template void HostDeviceVector::Fill(T v) { std::fill(HostVector().begin(), HostVector().end(), v); diff --git a/src/common/host_device_vector.cu b/src/common/host_device_vector.cu index 99448df21b7e..87a061ecc45a 100644 --- a/src/common/host_device_vector.cu +++ b/src/common/host_device_vector.cu @@ -1,22 +1,33 @@ /** * Copyright 2017-2023 by XGBoost contributors */ -#include #include +#include #include #include +#include +#include +#include "device_helpers.cuh" +#include "device_vector.cuh" #include "xgboost/data.h" #include "xgboost/host_device_vector.h" #include "xgboost/tree_model.h" -#include "device_helpers.cuh" namespace xgboost { // the handler to call instead of cudaSetDevice; only used for testing static void (*cudaSetDeviceHandler)(int) = nullptr; // NOLINT +template +void InitVectorIfNeeded(std::size_t offset, T const& v, dh::DeviceUVector* out) { + auto& data = *out; + if (std::remove_reference_t::NeedInit()) { + thrust::fill(dh::CachingThrustPolicy(), data.begin() + offset, data.end(), v); + } +} + void SetCudaSetDeviceHandler(void (*handler)(int)) { cudaSetDeviceHandler = handler; } @@ -28,7 +39,8 @@ class HostDeviceVectorImpl { if (device.IsCUDA()) { gpu_access_ = GPUAccess::kWrite; SetDevice(); - data_d_->resize(size, v); + data_d_->resize(size, rmm::cuda_stream_per_thread); + InitVectorIfNeeded(0, v, data_d_.get()); } else { data_h_.resize(size, v); } @@ -66,22 +78,22 @@ class HostDeviceVectorImpl { T* DevicePointer() { LazySyncDevice(GPUAccess::kWrite); - return data_d_->data().get(); + return thrust::raw_pointer_cast(data_d_->data()); } const T* ConstDevicePointer() { LazySyncDevice(GPUAccess::kRead); - return data_d_->data().get(); + return thrust::raw_pointer_cast(data_d_->data()); } common::Span DeviceSpan() { LazySyncDevice(GPUAccess::kWrite); - return {data_d_->data().get(), Size()}; + return {this->DevicePointer(), Size()}; } common::Span ConstDeviceSpan() { LazySyncDevice(GPUAccess::kRead); - return {data_d_->data().get(), Size()}; + return {this->ConstDevicePointer(), Size()}; } void Fill(T v) { // NOLINT @@ -91,7 +103,7 @@ class HostDeviceVectorImpl { gpu_access_ = GPUAccess::kWrite; SetDevice(); auto s_data = dh::ToSpan(*data_d_); - dh::LaunchN(data_d_->size(), + dh::LaunchN(data_d_->size(), dh::DefaultStream(), [=] XGBOOST_DEVICE(size_t i) { s_data[i] = v; }); } } @@ -138,10 +150,9 @@ class HostDeviceVectorImpl { auto ptr = other->ConstDevicePointer(); SetDevice(); CHECK_EQ(this->Device(), other->Device()); - dh::safe_cuda(cudaMemcpyAsync(this->DevicePointer() + ori_size, - ptr, - other->Size() * sizeof(T), - cudaMemcpyDeviceToDevice)); + dh::safe_cuda(cudaMemcpyAsync(this->DevicePointer() + ori_size, ptr, + other->Size() * sizeof(T), cudaMemcpyDeviceToDevice, + dh::DefaultStream())); } } @@ -171,20 +182,55 @@ class HostDeviceVectorImpl { } } - void Resize(size_t new_size, T v) { - if (new_size == Size()) { return; } + auto ResizeImpl(std::size_t new_size) { if ((Size() 
== 0 && device_.IsCUDA()) || (DeviceCanWrite() && device_.IsCUDA())) { // fast on-device resize gpu_access_ = GPUAccess::kWrite; SetDevice(); - data_d_->resize(new_size, v); + auto old_size = data_d_->size(); + data_d_->resize(new_size, rmm::cuda_stream_per_thread); + return std::make_pair(old_size, DeviceOrd::kCUDA); } else { // resize on host LazySyncHost(GPUAccess::kNone); - data_h_.resize(new_size, v); + auto old_size = data_h_.size(); + data_h_.resize(new_size); + return std::make_pair(old_size, DeviceOrd::kCPU); + } + } + + void Resize(std::size_t new_size, T v) { + if (new_size == Size()) { + return; + } + // Track the size of init. + auto [old_size, device] = this->ResizeImpl(new_size); + if (new_size <= old_size) { + return; + } + switch (device) { + case DeviceOrd::kCPU: { + std::fill(data_h_.begin() + old_size, data_h_.end(), v); + break; + } + case DeviceOrd::kCUDA: { + InitVectorIfNeeded(old_size, v, data_d_.get()); + break; + } + default: { + LOG(FATAL) << "Unexpected device type."; + break; + } } } + void Resize(std::size_t new_size) { + if (new_size == Size()) { + return; + } + this->ResizeImpl(new_size); + } + void LazySyncHost(GPUAccess access) { if (HostCanAccess(access)) { return; } if (HostCanRead()) { @@ -195,10 +241,8 @@ class HostDeviceVectorImpl { gpu_access_ = access; if (data_h_.size() != data_d_->size()) { data_h_.resize(data_d_->size()); } SetDevice(); - dh::safe_cuda(cudaMemcpy(data_h_.data(), - data_d_->data().get(), - data_d_->size() * sizeof(T), - cudaMemcpyDeviceToHost)); + dh::safe_cuda(cudaMemcpy(data_h_.data(), thrust::raw_pointer_cast(data_d_->data()), + data_d_->size() * sizeof(T), cudaMemcpyDeviceToHost)); } void LazySyncDevice(GPUAccess access) { @@ -211,10 +255,9 @@ class HostDeviceVectorImpl { // data is on the host LazyResizeDevice(data_h_.size()); SetDevice(); - dh::safe_cuda(cudaMemcpyAsync(data_d_->data().get(), - data_h_.data(), - data_d_->size() * sizeof(T), - cudaMemcpyHostToDevice)); + dh::safe_cuda(cudaMemcpyAsync(thrust::raw_pointer_cast(data_d_->data()), data_h_.data(), + data_d_->size() * sizeof(T), cudaMemcpyHostToDevice, + dh::DefaultStream())); gpu_access_ = access; } @@ -229,7 +272,7 @@ class HostDeviceVectorImpl { private: DeviceOrd device_{DeviceOrd::CPU()}; std::vector data_h_{}; - std::unique_ptr> data_d_{}; + std::unique_ptr> data_d_{}; GPUAccess gpu_access_{GPUAccess::kNone}; void CopyToDevice(HostDeviceVectorImpl* other) { @@ -239,8 +282,10 @@ class HostDeviceVectorImpl { LazyResizeDevice(Size()); gpu_access_ = GPUAccess::kWrite; SetDevice(); - dh::safe_cuda(cudaMemcpyAsync(data_d_->data().get(), other->data_d_->data().get(), - data_d_->size() * sizeof(T), cudaMemcpyDefault)); + dh::safe_cuda(cudaMemcpyAsync(thrust::raw_pointer_cast(data_d_->data()), + thrust::raw_pointer_cast(other->data_d_->data()), + data_d_->size() * sizeof(T), cudaMemcpyDefault, + dh::DefaultStream())); } } @@ -248,14 +293,15 @@ class HostDeviceVectorImpl { LazyResizeDevice(Size()); gpu_access_ = GPUAccess::kWrite; SetDevice(); - dh::safe_cuda(cudaMemcpyAsync(data_d_->data().get(), begin, - data_d_->size() * sizeof(T), cudaMemcpyDefault)); + dh::safe_cuda(cudaMemcpyAsync(thrust::raw_pointer_cast(data_d_->data()), begin, + data_d_->size() * sizeof(T), cudaMemcpyDefault, + dh::DefaultStream())); } void LazyResizeDevice(size_t new_size) { if (data_d_ && new_size == data_d_->size()) { return; } SetDevice(); - data_d_->resize(new_size); + data_d_->resize(new_size, rmm::cuda_stream_per_thread); } void SetDevice() { @@ -267,7 +313,7 @@ class 
HostDeviceVectorImpl { } if (!data_d_) { - data_d_.reset(new dh::device_vector); + data_d_.reset(new dh::DeviceUVector{0}); } } }; @@ -396,6 +442,11 @@ void HostDeviceVector::SetDevice(DeviceOrd device) const { impl_->SetDevice(device); } +template +void HostDeviceVector::Resize(std::size_t new_size) { + impl_->Resize(new_size); +} + template void HostDeviceVector::Resize(size_t new_size, T v) { impl_->Resize(new_size, v); diff --git a/src/common/quantile.cuh b/src/common/quantile.cuh index 898da03a0dce..bfcfe4cccba0 100644 --- a/src/common/quantile.cuh +++ b/src/common/quantile.cuh @@ -4,12 +4,14 @@ #ifndef XGBOOST_COMMON_QUANTILE_CUH_ #define XGBOOST_COMMON_QUANTILE_CUH_ -#include "xgboost/span.h" -#include "xgboost/data.h" +#include // for any_of + +#include "categorical.h" #include "device_helpers.cuh" #include "quantile.h" #include "timer.h" -#include "categorical.h" +#include "xgboost/data.h" +#include "xgboost/span.h" namespace xgboost { namespace common { diff --git a/src/metric/auc.cu b/src/metric/auc.cu index 59199b092839..4155a7084481 100644 --- a/src/metric/auc.cu +++ b/src/metric/auc.cu @@ -1,7 +1,8 @@ /** * Copyright 2021-2024, XGBoost Contributors */ -#include // for copy +#include // for copy +#include // for any_of #include #include diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index aa4f8fa27218..3d9c4e734fb4 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -840,9 +840,7 @@ class GPUHistMaker : public TreeUpdater { out["hist_train_param"] = ToJson(hist_maker_param_); } - ~GPUHistMaker() { // NOLINT - dh::GlobalMemoryLogger().Log(); - } + ~GPUHistMaker() override { dh::GlobalMemoryLogger().Log(); } void Update(TrainParam const* param, linalg::Matrix* gpair, DMatrix* dmat, common::Span> out_position, diff --git a/tests/cpp/common/test_device_vector.cu b/tests/cpp/common/test_device_vector.cu new file mode 100644 index 000000000000..d7d66d87d997 --- /dev/null +++ b/tests/cpp/common/test_device_vector.cu @@ -0,0 +1,20 @@ +/** + * Copyright 2024, XGBoost Contributors + */ +#include + +#include "../../../src/common/device_vector.cuh" +#include "xgboost/global_config.h" // for GlobalConfigThreadLocalStore + +namespace dh { +TEST(DeviceUVector, Basic) { + GlobalMemoryLogger().Clear(); + std::int32_t verbosity{3}; + std::swap(verbosity, xgboost::GlobalConfigThreadLocalStore::Get()->verbosity); + DeviceUVector uvec{12}; + auto peak = GlobalMemoryLogger().PeakMemory(); + auto n_bytes = sizeof(decltype(uvec)::value_type) * uvec.size(); + ASSERT_EQ(peak, n_bytes); + std::swap(verbosity, xgboost::GlobalConfigThreadLocalStore::Get()->verbosity); +} +} // namespace dh diff --git a/tests/cpp/common/test_host_device_vector.cu b/tests/cpp/common/test_host_device_vector.cu index 57e945cba9be..bd138f2fc2b7 100644 --- a/tests/cpp/common/test_host_device_vector.cu +++ b/tests/cpp/common/test_host_device_vector.cu @@ -181,4 +181,34 @@ TEST(HostDeviceVector, Empty) { ASSERT_FALSE(another.Empty()); ASSERT_TRUE(vec.Empty()); } + +TEST(HostDeviceVector, Resize) { + auto check = [&](HostDeviceVector const& vec) { + auto const& h_vec = vec.ConstHostSpan(); + for (std::size_t i = 0; i < 4; ++i) { + ASSERT_EQ(h_vec[i], i + 1); + } + for (std::size_t i = 4; i < vec.Size(); ++i) { + ASSERT_EQ(h_vec[i], 3.0); + } + }; + { + HostDeviceVector vec{1.0f, 2.0f, 3.0f, 4.0f}; + vec.SetDevice(DeviceOrd::CUDA(0)); + vec.ConstDeviceSpan(); + ASSERT_TRUE(vec.DeviceCanRead()); + ASSERT_FALSE(vec.DeviceCanWrite()); + vec.DeviceSpan(); + vec.Resize(7, 3.0f); + 
ASSERT_TRUE(vec.DeviceCanWrite()); + check(vec); + } + { + HostDeviceVector vec{1.0f, 2.0f, 3.0f, 4.0f}; + ASSERT_TRUE(vec.HostCanWrite()); + vec.Resize(7, 3.0f); + ASSERT_TRUE(vec.HostCanWrite()); + check(vec); + } +} } // namespace xgboost::common diff --git a/tests/cpp/data/test_array_interface.h b/tests/cpp/data/test_array_interface.h index 78bce76f53e7..dfe4f5a3ec5c 100644 --- a/tests/cpp/data/test_array_interface.h +++ b/tests/cpp/data/test_array_interface.h @@ -1,15 +1,14 @@ -// Copyright (c) 2019 by Contributors +/** + * Copyright 2019-2024, XGBoost Contributors + */ #include +#include +#include // for device +#include // for sequence #include #include -#include - -#include -#include "../../../src/common/bitfield.h" -#include "../../../src/common/device_helpers.cuh" namespace xgboost { - template Json GenerateDenseColumn(std::string const& typestr, size_t kRows, thrust::device_vector* out_d_data) { From 1853e32cd0e671b6bbe882ef0af6d1931d155d64 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Thu, 4 Jul 2024 22:18:20 +0800 Subject: [PATCH 2/5] resize. --- src/common/device_helpers.cuh | 2 +- src/common/device_vector.cu | 2 ++ src/common/device_vector.cuh | 28 +++++++++++++-- src/common/host_device_vector.cu | 47 +++++++------------------- tests/cpp/common/test_device_vector.cu | 3 +- 5 files changed, 42 insertions(+), 40 deletions(-) diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh index 98a76d72a263..1754c9507036 100644 --- a/src/common/device_helpers.cuh +++ b/src/common/device_helpers.cuh @@ -510,7 +510,7 @@ xgboost::common::Span ToSpan(thrust::device_vector& vec, template xgboost::common::Span ToSpan(DeviceUVector &vec) { - return {vec.data(), vec.size()}; + return {thrust::raw_pointer_cast(vec.data()), vec.size()}; } // thrust begin, similiar to std::begin diff --git a/src/common/device_vector.cu b/src/common/device_vector.cu index 6c1f6e8a4376..50922d8f978e 100644 --- a/src/common/device_vector.cu +++ b/src/common/device_vector.cu @@ -18,8 +18,10 @@ void ThrowOOMError(std::string const &err, size_t bytes) { } } // namespace detail +#if defined(XGBOOST_USE_RMM) LoggingResource *GlobalLoggingResource() { static auto mr{std::make_unique()}; return mr.get(); } +#endif // defined(XGBOOST_USE_RMM) } // namespace dh diff --git a/src/common/device_vector.cuh b/src/common/device_vector.cuh index e2d1f16cbb9e..5e4f327efa56 100644 --- a/src/common/device_vector.cuh +++ b/src/common/device_vector.cuh @@ -8,6 +8,7 @@ #if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 #include // for device_uvector +#include // for exec_policy_nosync #include // for device_memory_resource #include // for get_current_device_resource #include // for thrust_allocator @@ -294,8 +295,20 @@ class DeviceUVector : public rmm::device_uvector { static constexpr bool NeedInit() { return true; } public: - explicit DeviceUVector(std::size_t n) - : Super{n, rmm::cuda_stream_per_thread, GlobalLoggingResource()} {} + DeviceUVector() : Super{0, rmm::cuda_stream_per_thread, GlobalLoggingResource()} {} + + void Resize(std::size_t n) { Super::resize(n, rmm::cuda_stream_per_thread); } + void Resize(std::size_t n, T const &v) { + auto orig = this->size(); + Super::resize(n, rmm::cuda_stream_per_thread); + if (orig < n) { + thrust::fill(rmm::exec_policy_nosync{}, this->begin() + orig, this->end(), v); + } + } + + private: + // undefined private, cannot be accessed. 
+ void resize(std::size_t n, rmm::cuda_stream_view stream); // NOLINT }; #else @@ -304,9 +317,18 @@ class DeviceUVector : public rmm::device_uvector { * @brief Without RMM, the initialization will happen. */ template -class DeviceUVector : thrust::device_vector> { +class DeviceUVector : public thrust::device_vector> { + using Super = thrust::device_vector>; + public: static constexpr bool NeedInit() { return false; } + + void Resize(std::size_t n) { Super::resize(n); } + void Resize(std::size_t n, T const &v) { Super::resize(n, v); } + + private: + // undefined private, cannot be accessed. + void resize(std::size_t n, T const &v = T{}); // NOLINT }; #endif // defined(XGBOOST_USE_RMM) diff --git a/src/common/host_device_vector.cu b/src/common/host_device_vector.cu index 87a061ecc45a..a075ee84bb80 100644 --- a/src/common/host_device_vector.cu +++ b/src/common/host_device_vector.cu @@ -1,19 +1,17 @@ /** - * Copyright 2017-2023 by XGBoost contributors + * Copyright 2017-2024, XGBoost contributors */ -#include #include #include +#include // for size_t #include -#include -#include #include "device_helpers.cuh" -#include "device_vector.cuh" +#include "device_vector.cuh" // for DeviceUVector #include "xgboost/data.h" #include "xgboost/host_device_vector.h" -#include "xgboost/tree_model.h" +#include "xgboost/tree_model.h" // for RegTree namespace xgboost { @@ -39,8 +37,7 @@ class HostDeviceVectorImpl { if (device.IsCUDA()) { gpu_access_ = GPUAccess::kWrite; SetDevice(); - data_d_->resize(size, rmm::cuda_stream_per_thread); - InitVectorIfNeeded(0, v, data_d_.get()); + data_d_->Resize(size, v); } else { data_h_.resize(size, v); } @@ -182,20 +179,19 @@ class HostDeviceVectorImpl { } } - auto ResizeImpl(std::size_t new_size) { + template + auto ResizeImpl(std::size_t new_size, U&&... args) { if ((Size() == 0 && device_.IsCUDA()) || (DeviceCanWrite() && device_.IsCUDA())) { // fast on-device resize gpu_access_ = GPUAccess::kWrite; SetDevice(); auto old_size = data_d_->size(); - data_d_->resize(new_size, rmm::cuda_stream_per_thread); - return std::make_pair(old_size, DeviceOrd::kCUDA); + data_d_->Resize(new_size, std::forward(args)...); } else { // resize on host LazySyncHost(GPUAccess::kNone); auto old_size = data_h_.size(); - data_h_.resize(new_size); - return std::make_pair(old_size, DeviceOrd::kCPU); + data_h_.resize(new_size, std::forward(args)...); } } @@ -203,25 +199,7 @@ class HostDeviceVectorImpl { if (new_size == Size()) { return; } - // Track the size of init. 
- auto [old_size, device] = this->ResizeImpl(new_size); - if (new_size <= old_size) { - return; - } - switch (device) { - case DeviceOrd::kCPU: { - std::fill(data_h_.begin() + old_size, data_h_.end(), v); - break; - } - case DeviceOrd::kCUDA: { - InitVectorIfNeeded(old_size, v, data_d_.get()); - break; - } - default: { - LOG(FATAL) << "Unexpected device type."; - break; - } - } + this->ResizeImpl(new_size, v); } void Resize(std::size_t new_size) { @@ -301,7 +279,7 @@ class HostDeviceVectorImpl { void LazyResizeDevice(size_t new_size) { if (data_d_ && new_size == data_d_->size()) { return; } SetDevice(); - data_d_->resize(new_size, rmm::cuda_stream_per_thread); + data_d_->Resize(new_size); } void SetDevice() { @@ -313,7 +291,7 @@ class HostDeviceVectorImpl { } if (!data_d_) { - data_d_.reset(new dh::DeviceUVector{0}); + data_d_.reset(new dh::DeviceUVector{}); } } }; @@ -478,5 +456,4 @@ template class HostDeviceVector; */ template class HostDeviceVector; #endif // defined(__APPLE__) - } // namespace xgboost diff --git a/tests/cpp/common/test_device_vector.cu b/tests/cpp/common/test_device_vector.cu index d7d66d87d997..95da4ef3f167 100644 --- a/tests/cpp/common/test_device_vector.cu +++ b/tests/cpp/common/test_device_vector.cu @@ -11,7 +11,8 @@ TEST(DeviceUVector, Basic) { GlobalMemoryLogger().Clear(); std::int32_t verbosity{3}; std::swap(verbosity, xgboost::GlobalConfigThreadLocalStore::Get()->verbosity); - DeviceUVector uvec{12}; + DeviceUVector uvec; + uvec.Resize(12); auto peak = GlobalMemoryLogger().PeakMemory(); auto n_bytes = sizeof(decltype(uvec)::value_type) * uvec.size(); ASSERT_EQ(peak, n_bytes); From 3c9579c026383d42ab0d359e539b33bfa6389ceb Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Thu, 4 Jul 2024 23:33:27 +0800 Subject: [PATCH 3/5] Cleanup. --- src/common/device_vector.cuh | 5 ---- src/common/host_device_vector.cu | 26 ++++----------------- tests/cpp/common/test_host_device_vector.cu | 9 ++++++- 3 files changed, 13 insertions(+), 27 deletions(-) diff --git a/src/common/device_vector.cuh b/src/common/device_vector.cuh index 5e4f327efa56..35386856cc9c 100644 --- a/src/common/device_vector.cuh +++ b/src/common/device_vector.cuh @@ -291,9 +291,6 @@ template class DeviceUVector : public rmm::device_uvector { using Super = rmm::device_uvector; - public: - static constexpr bool NeedInit() { return true; } - public: DeviceUVector() : Super{0, rmm::cuda_stream_per_thread, GlobalLoggingResource()} {} @@ -321,8 +318,6 @@ class DeviceUVector : public thrust::device_vector> { using Super = thrust::device_vector>; public: - static constexpr bool NeedInit() { return false; } - void Resize(std::size_t n) { Super::resize(n); } void Resize(std::size_t n, T const &v) { Super::resize(n, v); } diff --git a/src/common/host_device_vector.cu b/src/common/host_device_vector.cu index a075ee84bb80..967af9a632ca 100644 --- a/src/common/host_device_vector.cu +++ b/src/common/host_device_vector.cu @@ -18,14 +18,6 @@ namespace xgboost { // the handler to call instead of cudaSetDevice; only used for testing static void (*cudaSetDeviceHandler)(int) = nullptr; // NOLINT -template -void InitVectorIfNeeded(std::size_t offset, T const& v, dh::DeviceUVector* out) { - auto& data = *out; - if (std::remove_reference_t::NeedInit()) { - thrust::fill(dh::CachingThrustPolicy(), data.begin() + offset, data.end(), v); - } -} - void SetCudaSetDeviceHandler(void (*handler)(int)) { cudaSetDeviceHandler = handler; } @@ -181,6 +173,9 @@ class HostDeviceVectorImpl { template auto ResizeImpl(std::size_t new_size, U&&... 
args) { + if (new_size == Size()) { + return; + } if ((Size() == 0 && device_.IsCUDA()) || (DeviceCanWrite() && device_.IsCUDA())) { // fast on-device resize gpu_access_ = GPUAccess::kWrite; @@ -195,19 +190,8 @@ class HostDeviceVectorImpl { } } - void Resize(std::size_t new_size, T v) { - if (new_size == Size()) { - return; - } - this->ResizeImpl(new_size, v); - } - - void Resize(std::size_t new_size) { - if (new_size == Size()) { - return; - } - this->ResizeImpl(new_size); - } + void Resize(std::size_t new_size, T v) { this->ResizeImpl(new_size, v); } + void Resize(std::size_t new_size) { this->ResizeImpl(new_size); } void LazySyncHost(GPUAccess access) { if (HostCanAccess(access)) { return; } diff --git a/tests/cpp/common/test_host_device_vector.cu b/tests/cpp/common/test_host_device_vector.cu index bd138f2fc2b7..a0aa5fa11fce 100644 --- a/tests/cpp/common/test_host_device_vector.cu +++ b/tests/cpp/common/test_host_device_vector.cu @@ -1,5 +1,5 @@ /** - * Copyright 2018-2023 XGBoost contributors + * Copyright 2018-2024, XGBoost contributors */ #include #include @@ -203,6 +203,13 @@ TEST(HostDeviceVector, Resize) { ASSERT_TRUE(vec.DeviceCanWrite()); check(vec); } + { + HostDeviceVector vec{{1.0f, 2.0f, 3.0f, 4.0f}, DeviceOrd::CUDA(0)}; + ASSERT_TRUE(vec.DeviceCanWrite()); + vec.Resize(7, 3.0f); + ASSERT_TRUE(vec.DeviceCanWrite()); + check(vec); + } { HostDeviceVector vec{1.0f, 2.0f, 3.0f, 4.0f}; ASSERT_TRUE(vec.HostCanWrite()); From b225a7c2208678139396433166d6715947235d37 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Fri, 5 Jul 2024 01:50:12 +0800 Subject: [PATCH 4/5] Fix. --- src/common/quantile.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/common/quantile.cuh b/src/common/quantile.cuh index bfcfe4cccba0..3dd393755852 100644 --- a/src/common/quantile.cuh +++ b/src/common/quantile.cuh @@ -102,9 +102,9 @@ class SketchContainer { CHECK(device.IsCUDA()); // Initialize Sketches for this dmatrix this->columns_ptr_.SetDevice(device_); - this->columns_ptr_.Resize(num_columns + 1); + this->columns_ptr_.Resize(num_columns + 1, 0); this->columns_ptr_b_.SetDevice(device_); - this->columns_ptr_b_.Resize(num_columns + 1); + this->columns_ptr_b_.Resize(num_columns + 1, 0); this->feature_types_.Resize(feature_types.Size()); this->feature_types_.Copy(feature_types); From a016a8ed91e8778d530cfb10412dfa6f1b041b71 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Thu, 11 Jul 2024 15:23:25 +0800 Subject: [PATCH 5/5] Small cleanup. --- include/xgboost/host_device_vector.h | 2 +- src/common/host_device_vector.cu | 9 +++------ 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/include/xgboost/host_device_vector.h b/include/xgboost/host_device_vector.h index c4d12da53079..36c7ed32b83d 100644 --- a/include/xgboost/host_device_vector.h +++ b/include/xgboost/host_device_vector.h @@ -137,7 +137,7 @@ class HostDeviceVector { void Resize(std::size_t new_size); /** @brief Resize and initialize the data if the new size is larger than the old size. 
 */
-  void Resize(size_t new_size, T v);
+  void Resize(std::size_t new_size, T v);

   using value_type = T;  // NOLINT

diff --git a/src/common/host_device_vector.cu b/src/common/host_device_vector.cu
index 967af9a632ca..16a1aa027f09 100644
--- a/src/common/host_device_vector.cu
+++ b/src/common/host_device_vector.cu
@@ -129,7 +129,7 @@ class HostDeviceVectorImpl {

   void Extend(HostDeviceVectorImpl* other) {
     auto ori_size = this->Size();
-    this->Resize(ori_size + other->Size(), T());
+    this->Resize(ori_size + other->Size(), T{});
     if (HostCanWrite() && other->HostCanRead()) {
       auto& h_vec = this->HostVector();
       auto& other_vec = other->HostVector();
@@ -172,7 +172,7 @@ class HostDeviceVectorImpl {
   }

   template <typename... U>
-  auto ResizeImpl(std::size_t new_size, U&&... args) {
+  auto Resize(std::size_t new_size, U&&... args) {
     if (new_size == Size()) {
       return;
     }
@@ -190,9 +190,6 @@ class HostDeviceVectorImpl {
     }
   }

-  void Resize(std::size_t new_size, T v) { this->ResizeImpl(new_size, v); }
-  void Resize(std::size_t new_size) { this->ResizeImpl(new_size); }
-
   void LazySyncHost(GPUAccess access) {
     if (HostCanAccess(access)) { return; }
@@ -410,7 +407,7 @@ void HostDeviceVector<T>::Resize(std::size_t new_size) {
 }

 template <typename T>
-void HostDeviceVector<T>::Resize(size_t new_size, T v) {
+void HostDeviceVector<T>::Resize(std::size_t new_size, T v) {
   impl_->Resize(new_size, v);
 }
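
---

A minimal usage sketch of the split `Resize` API introduced by this series
(illustrative, not part of the diffs).  The vector construction, `SetDevice`,
`DeviceSpan`, and both `Resize` overloads come from the patched
`include/xgboost/host_device_vector.h` and the tests above; the header path
for `DeviceOrd` is an assumption, as is the exact behaviour on non-RMM builds.

#include "xgboost/context.h"             // for DeviceOrd (assumed location)
#include "xgboost/host_device_vector.h"  // for HostDeviceVector

void ResizeSketch() {
  xgboost::HostDeviceVector<float> vec{1.0f, 2.0f, 3.0f, 4.0f};
  vec.SetDevice(xgboost::DeviceOrd::CUDA(0));
  vec.DeviceSpan();  // acquire device write access so the resize stays on-device

  // No-initializer overload: on an RMM-backed build, dh::DeviceUVector leaves
  // the new tail [4, 32) uninitialized instead of launching a fill kernel.
  vec.Resize(32);

  // Initializing overload: fills only the newly added tail [32, 64) with
  // 3.0f, the behaviour checked in tests/cpp/common/test_host_device_vector.cu.
  vec.Resize(64, 3.0f);
}

The point of the split is that `thrust::device_vector::resize` generally
value-initializes new elements with a kernel launch; callers that overwrite
the buffer immediately can now call `Resize(n)` and skip that work on the
RMM path.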