From 650e73d9dc793c53d5e54fa5b3cf4f8986d18ca4 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Thu, 4 Jul 2024 18:14:41 +0800
Subject: [PATCH 1/5] Avoid thrust vector initialization.

- Add wrapper for rmm device uvector.
- Split up the `Resize` method for HDV.

---
 include/xgboost/host_device_vector.h        |   4 +-
 src/common/cuda_context.cuh                 |   2 +-
 src/common/device_helpers.cuh               | 260 ++--------------
 src/common/device_vector.cu                 |  25 ++
 src/common/device_vector.cuh                | 313 ++++++++++++++++++++
 src/common/host_device_vector.cc            |   5 +
 src/common/host_device_vector.cu            | 113 +++++--
 src/common/quantile.cuh                     |   8 +-
 src/metric/auc.cu                           |   3 +-
 src/tree/updater_gpu_hist.cu                |   4 +-
 tests/cpp/common/test_device_vector.cu      |  20 ++
 tests/cpp/common/test_host_device_vector.cu |  30 ++
 tests/cpp/data/test_array_interface.h       |  13 +-
 13 files changed, 517 insertions(+), 283 deletions(-)
 create mode 100644 src/common/device_vector.cu
 create mode 100644 src/common/device_vector.cuh
 create mode 100644 tests/cpp/common/test_device_vector.cu

diff --git a/include/xgboost/host_device_vector.h b/include/xgboost/host_device_vector.h
index 9a53d38583ca..c4d12da53079 100644
--- a/include/xgboost/host_device_vector.h
+++ b/include/xgboost/host_device_vector.h
@@ -135,7 +135,9 @@ class HostDeviceVector {

   void SetDevice(DeviceOrd device) const;

-  void Resize(size_t new_size, T v = T());
+  void Resize(std::size_t new_size);
+  /** @brief Resize and initialize the data if the new size is larger than the old size. */
+  void Resize(size_t new_size, T v);

   using value_type = T;  // NOLINT

diff --git a/src/common/cuda_context.cuh b/src/common/cuda_context.cuh
index c8b2e07927c9..7e1db8e3bf2f 100644
--- a/src/common/cuda_context.cuh
+++ b/src/common/cuda_context.cuh
@@ -18,7 +18,7 @@ struct CUDAContext {
   /**
    * \brief Caching thrust policy.
   */
  auto CTP() const {
-#if THRUST_MAJOR_VERSION >= 2
+#if THRUST_MAJOR_VERSION >= 2 || defined(XGBOOST_USE_RMM)
    return thrust::cuda::par_nosync(caching_alloc_).on(dh::DefaultStream());
 #else
    return thrust::cuda::par(caching_alloc_).on(dh::DefaultStream());
 #endif
diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh
index 7cd00f6f6112..98a76d72a263 100644
--- a/src/common/device_helpers.cuh
+++ b/src/common/device_helpers.cuh
@@ -1,26 +1,21 @@
 /**
- * Copyright 2017-2023 XGBoost contributors
+ * Copyright 2017-2024, XGBoost contributors
  */
 #pragma once
-#include <thrust/binary_search.h>  // thrust::upper_bound
-#include
-#include
-#include
+#include <thrust/binary_search.h>   // thrust::upper_bound
+#include <thrust/device_ptr.h>      // for device_ptr
+#include <thrust/device_vector.h>   // for device_vector
 #include <thrust/execution_policy.h>  // thrust::seq
-#include <thrust/gather.h>  // gather
-#include
+#include <thrust/iterator/discard_iterator.h>  // for discard_iterator
 #include <thrust/iterator/transform_output_iterator.h>  // make_transform_output_iterator
-#include
-#include
 #include
 #include
-#include
 #include
 #include
 #include <cstddef>  // for size_t
 #include
-#include
+#include <cub/util_type.cuh>  // for UnitWord
 #include
 #include
 #include
@@ -28,22 +23,14 @@
 #include "../collective/communicator-inl.h"
 #include "common.h"
+#include "device_vector.cuh"
 #include "xgboost/host_device_vector.h"
 #include "xgboost/logging.h"
 #include "xgboost/span.h"

-#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
-#include "rmm/mr/device/per_device_resource.hpp"
-#include "rmm/mr/device/thrust_allocator_adaptor.hpp"
-#include "rmm/version_config.hpp"
-
-#if !defined(RMM_VERSION_MAJOR) || !defined(RMM_VERSION_MINOR)
-#error "Please use RMM version 0.18 or later"
-#elif RMM_VERSION_MAJOR == 0 && RMM_VERSION_MINOR < 18
-#error "Please use RMM version 0.18 or later"
-#endif  // !defined(RMM_VERSION_MAJOR) || !defined(RMM_VERSION_MINOR)
-
-#endif  // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
+#if defined(XGBOOST_USE_RMM)
+#include
+#endif  // defined(XGBOOST_USE_RMM)

 #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 || defined(__clang__)

@@ -285,91 +272,6 @@ void Iota(Container array, cudaStream_t stream) {
   LaunchN(array.size(), stream, [=] __device__(size_t i) { array[i] = i; });
 }

-namespace detail {
-/** \brief Keeps track of global device memory allocations.
Thread safe.*/ -class MemoryLogger { - // Information for a single device - struct DeviceStats { - size_t currently_allocated_bytes{ 0 }; - size_t peak_allocated_bytes{ 0 }; - size_t num_allocations{ 0 }; - size_t num_deallocations{ 0 }; - std::map device_allocations; - void RegisterAllocation(void *ptr, size_t n) { - device_allocations[ptr] = n; - currently_allocated_bytes += n; - peak_allocated_bytes = std::max(peak_allocated_bytes, currently_allocated_bytes); - num_allocations++; - CHECK_GT(num_allocations, num_deallocations); - } - void RegisterDeallocation(void *ptr, size_t n, int current_device) { - auto itr = device_allocations.find(ptr); - if (itr == device_allocations.end()) { - LOG(WARNING) << "Attempting to deallocate " << n << " bytes on device " << current_device - << " that was never allocated\n" - << dmlc::StackTrace(); - } else { - num_deallocations++; - CHECK_LE(num_deallocations, num_allocations); - currently_allocated_bytes -= itr->second; - device_allocations.erase(itr); - } - } - }; - DeviceStats stats_; - std::mutex mutex_; - -public: - void RegisterAllocation(void *ptr, size_t n) { - if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) { - return; - } - std::lock_guard guard(mutex_); - int current_device; - safe_cuda(cudaGetDevice(¤t_device)); - stats_.RegisterAllocation(ptr, n); - } - void RegisterDeallocation(void *ptr, size_t n) { - if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) { - return; - } - std::lock_guard guard(mutex_); - int current_device; - safe_cuda(cudaGetDevice(¤t_device)); - stats_.RegisterDeallocation(ptr, n, current_device); - } - size_t PeakMemory() const { - return stats_.peak_allocated_bytes; - } - size_t CurrentlyAllocatedBytes() const { - return stats_.currently_allocated_bytes; - } - void Clear() - { - stats_ = DeviceStats(); - } - - void Log() { - if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) { - return; - } - std::lock_guard guard(mutex_); - int current_device; - safe_cuda(cudaGetDevice(¤t_device)); - LOG(CONSOLE) << "======== Device " << current_device << " Memory Allocations: " - << " ========"; - LOG(CONSOLE) << "Peak memory usage: " - << stats_.peak_allocated_bytes / 1048576 << "MiB"; - LOG(CONSOLE) << "Number of allocations: " << stats_.num_allocations; - } -}; -} // namespace detail - -inline detail::MemoryLogger &GlobalMemoryLogger() { - static detail::MemoryLogger memory_logger; - return memory_logger; -} - // dh::DebugSyncDevice(__FILE__, __LINE__); inline void DebugSyncDevice(std::string file="", int32_t line = -1) { if (file != "" && line != -1) { @@ -380,134 +282,6 @@ inline void DebugSyncDevice(std::string file="", int32_t line = -1) { safe_cuda(cudaGetLastError()); } -namespace detail { - -#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 -template -using XGBBaseDeviceAllocator = rmm::mr::thrust_allocator; -#else // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 -template -using XGBBaseDeviceAllocator = thrust::device_malloc_allocator; -#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 - -inline void ThrowOOMError(std::string const& err, size_t bytes) { - auto device = CurrentDevice(); - auto rank = xgboost::collective::GetRank(); - std::stringstream ss; - ss << "Memory allocation error on worker " << rank << ": " << err << "\n" - << "- Free memory: " << AvailableMemory(device) << "\n" - << "- Requested memory: " << bytes << std::endl; - LOG(FATAL) << ss.str(); -} - -/** - * \brief Default memory allocator, uses cudaMalloc/Free and 
logs allocations if verbose. - */ -template -struct XGBDefaultDeviceAllocatorImpl : XGBBaseDeviceAllocator { - using SuperT = XGBBaseDeviceAllocator; - using pointer = thrust::device_ptr; // NOLINT - template - struct rebind // NOLINT - { - using other = XGBDefaultDeviceAllocatorImpl; // NOLINT - }; - pointer allocate(size_t n) { // NOLINT - pointer ptr; - try { - ptr = SuperT::allocate(n); - dh::safe_cuda(cudaGetLastError()); - } catch (const std::exception &e) { - ThrowOOMError(e.what(), n * sizeof(T)); - } - GlobalMemoryLogger().RegisterAllocation(ptr.get(), n * sizeof(T)); - return ptr; - } - void deallocate(pointer ptr, size_t n) { // NOLINT - GlobalMemoryLogger().RegisterDeallocation(ptr.get(), n * sizeof(T)); - SuperT::deallocate(ptr, n); - } -#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 - XGBDefaultDeviceAllocatorImpl() - : SuperT(rmm::cuda_stream_per_thread, rmm::mr::get_current_device_resource()) {} -#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 -}; - -/** - * \brief Caching memory allocator, uses cub::CachingDeviceAllocator as a back-end, unless - * RMM pool allocator is enabled. Does not initialise memory on construction. - */ -template -struct XGBCachingDeviceAllocatorImpl : XGBBaseDeviceAllocator { - using SuperT = XGBBaseDeviceAllocator; - using pointer = thrust::device_ptr; // NOLINT - template - struct rebind // NOLINT - { - using other = XGBCachingDeviceAllocatorImpl; // NOLINT - }; - cub::CachingDeviceAllocator& GetGlobalCachingAllocator() { - // Configure allocator with maximum cached bin size of ~1GB and no limit on - // maximum cached bytes - thread_local std::unique_ptr allocator{ - std::make_unique(2, 9, 29)}; - return *allocator; - } - pointer allocate(size_t n) { // NOLINT - pointer thrust_ptr; - if (use_cub_allocator_) { - T* raw_ptr{nullptr}; - auto errc = GetGlobalCachingAllocator().DeviceAllocate(reinterpret_cast(&raw_ptr), - n * sizeof(T)); - if (errc != cudaSuccess) { - ThrowOOMError("Caching allocator", n * sizeof(T)); - } - thrust_ptr = pointer(raw_ptr); - } else { - try { - thrust_ptr = SuperT::allocate(n); - dh::safe_cuda(cudaGetLastError()); - } catch (const std::exception &e) { - ThrowOOMError(e.what(), n * sizeof(T)); - } - } - GlobalMemoryLogger().RegisterAllocation(thrust_ptr.get(), n * sizeof(T)); - return thrust_ptr; - } - void deallocate(pointer ptr, size_t n) { // NOLINT - GlobalMemoryLogger().RegisterDeallocation(ptr.get(), n * sizeof(T)); - if (use_cub_allocator_) { - GetGlobalCachingAllocator().DeviceFree(ptr.get()); - } else { - SuperT::deallocate(ptr, n); - } - } -#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 - XGBCachingDeviceAllocatorImpl() - : SuperT(rmm::cuda_stream_per_thread, rmm::mr::get_current_device_resource()), - use_cub_allocator_(!xgboost::GlobalConfigThreadLocalStore::Get()->use_rmm) {} -#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 - XGBOOST_DEVICE void construct(T *) {} // NOLINT - private: - bool use_cub_allocator_{true}; -}; -} // namespace detail - -// Declare xgboost allocators -// Replacement of allocator with custom backend should occur here -template -using XGBDeviceAllocator = detail::XGBDefaultDeviceAllocatorImpl; -/*! Be careful that the initialization constructor is a no-op, which means calling - * `vec.resize(n)` won't initialize the memory region to 0. Instead use - * `vec.resize(n, 0)`*/ -template -using XGBCachingDeviceAllocator = detail::XGBCachingDeviceAllocatorImpl; -/** \brief Specialisation of thrust device vector using custom allocator. 
*/ -template -using device_vector = thrust::device_vector>; // NOLINT -template -using caching_device_vector = thrust::device_vector>; // NOLINT - // Faster to instantiate than caching_device_vector and invokes no synchronisation // Use this where vector functionality (e.g. resize) is not required template @@ -734,6 +508,11 @@ xgboost::common::Span ToSpan(thrust::device_vector& vec, return ToSpan(vec, offset, size); } +template +xgboost::common::Span ToSpan(DeviceUVector &vec) { + return {vec.data(), vec.size()}; +} + // thrust begin, similiar to std::begin template thrust::device_ptr tbegin(xgboost::HostDeviceVector& vector) { // NOLINT @@ -1117,6 +896,15 @@ class CUDAStream { void Sync() { this->View().Sync(); } }; +inline auto CachingThrustPolicy() { + XGBCachingDeviceAllocator alloc; +#if THRUST_MAJOR_VERSION >= 2 || defined(XGBOOST_USE_RMM) + return thrust::cuda::par_nosync(alloc).on(DefaultStream()); +#else + return thrust::cuda::par(alloc).on(DefaultStream()); +#endif // THRUST_MAJOR_VERSION >= 2 || defined(XGBOOST_USE_RMM) +} + // Force nvcc to load data as constant template class LDGIterator { diff --git a/src/common/device_vector.cu b/src/common/device_vector.cu new file mode 100644 index 000000000000..6c1f6e8a4376 --- /dev/null +++ b/src/common/device_vector.cu @@ -0,0 +1,25 @@ +/** + * Copyright 2017-2024, XGBoost contributors + */ +#include "../collective/communicator-inl.h" // for GetRank +#include "device_helpers.cuh" // for CurrentDevice +#include "device_vector.cuh" + +namespace dh { +namespace detail { +void ThrowOOMError(std::string const &err, size_t bytes) { + auto device = CurrentDevice(); + auto rank = xgboost::collective::GetRank(); + std::stringstream ss; + ss << "Memory allocation error on worker " << rank << ": " << err << "\n" + << "- Free memory: " << dh::AvailableMemory(device) << "\n" + << "- Requested memory: " << bytes << std::endl; + LOG(FATAL) << ss.str(); +} +} // namespace detail + +LoggingResource *GlobalLoggingResource() { + static auto mr{std::make_unique()}; + return mr.get(); +} +} // namespace dh diff --git a/src/common/device_vector.cuh b/src/common/device_vector.cuh new file mode 100644 index 000000000000..e2d1f16cbb9e --- /dev/null +++ b/src/common/device_vector.cuh @@ -0,0 +1,313 @@ +/** + * Copyright 2017-2024, XGBoost Contributors + */ +#pragma once +#include // for device_malloc_allocator +#include // for device_ptr +#include // for device_vector + +#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 +#include // for device_uvector +#include // for device_memory_resource +#include // for get_current_device_resource +#include // for thrust_allocator +#include // for RMM_VERSION_MAJOR + +#include "xgboost/global_config.h" // for GlobalConfigThreadLocalStore + +#if !defined(RMM_VERSION_MAJOR) || !defined(RMM_VERSION_MINOR) + +#error "Please use RMM version 0.18 or later" +#elif RMM_VERSION_MAJOR == 0 && RMM_VERSION_MINOR < 18 +#error "Please use RMM version 0.18 or later" +#endif // !defined(RMM_VERSION_MAJOR) || !defined(RMM_VERSION_MINOR) + +#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 + +#include // for size_t +#include // for CachingDeviceAllocator +#include // for CurrentDevice +#include // for map +#include // for unique_ptr + +#include "common.h" // for safe_cuda +#include "xgboost/logging.h" + +namespace dh { +namespace detail { +/** \brief Keeps track of global device memory allocations. 
Thread safe.*/ +class MemoryLogger { + // Information for a single device + struct DeviceStats { + std::size_t currently_allocated_bytes{0}; + size_t peak_allocated_bytes{0}; + size_t num_allocations{0}; + size_t num_deallocations{0}; + std::map device_allocations; + void RegisterAllocation(void *ptr, size_t n) { + device_allocations[ptr] = n; + currently_allocated_bytes += n; + peak_allocated_bytes = std::max(peak_allocated_bytes, currently_allocated_bytes); + num_allocations++; + CHECK_GT(num_allocations, num_deallocations); + } + void RegisterDeallocation(void *ptr, size_t n, int current_device) { + auto itr = device_allocations.find(ptr); + if (itr == device_allocations.end()) { + LOG(WARNING) << "Attempting to deallocate " << n << " bytes on device " << current_device + << " that was never allocated\n" + << dmlc::StackTrace(); + } else { + num_deallocations++; + CHECK_LE(num_deallocations, num_allocations); + currently_allocated_bytes -= itr->second; + device_allocations.erase(itr); + } + } + }; + DeviceStats stats_; + std::mutex mutex_; + + public: + void RegisterAllocation(void *ptr, size_t n) { + if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) { + return; + } + std::lock_guard guard(mutex_); + stats_.RegisterAllocation(ptr, n); + } + void RegisterDeallocation(void *ptr, size_t n) { + if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) { + return; + } + std::lock_guard guard(mutex_); + stats_.RegisterDeallocation(ptr, n, cub::CurrentDevice()); + } + size_t PeakMemory() const { return stats_.peak_allocated_bytes; } + size_t CurrentlyAllocatedBytes() const { return stats_.currently_allocated_bytes; } + void Clear() { stats_ = DeviceStats(); } + + void Log() { + if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) { + return; + } + std::lock_guard guard(mutex_); + int current_device; + dh::safe_cuda(cudaGetDevice(¤t_device)); + LOG(CONSOLE) << "======== Device " << current_device << " Memory Allocations: " + << " ========"; + LOG(CONSOLE) << "Peak memory usage: " << stats_.peak_allocated_bytes / 1048576 << "MiB"; + LOG(CONSOLE) << "Number of allocations: " << stats_.num_allocations; + } +}; + +void ThrowOOMError(std::string const &err, size_t bytes); +} // namespace detail + +inline detail::MemoryLogger &GlobalMemoryLogger() { + static detail::MemoryLogger memory_logger; + return memory_logger; +} + +namespace detail { +#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 +template +using XGBBaseDeviceAllocator = rmm::mr::thrust_allocator; +#else // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 +template +using XGBBaseDeviceAllocator = thrust::device_malloc_allocator; +#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 + +/** + * \brief Default memory allocator, uses cudaMalloc/Free and logs allocations if verbose. 
+ */ +template +struct XGBDefaultDeviceAllocatorImpl : XGBBaseDeviceAllocator { + using SuperT = XGBBaseDeviceAllocator; + using pointer = thrust::device_ptr; // NOLINT + template + struct rebind // NOLINT + { + using other = XGBDefaultDeviceAllocatorImpl; // NOLINT + }; + pointer allocate(size_t n) { // NOLINT + pointer ptr; + try { + ptr = SuperT::allocate(n); + dh::safe_cuda(cudaGetLastError()); + } catch (const std::exception &e) { + detail::ThrowOOMError(e.what(), n * sizeof(T)); + } + GlobalMemoryLogger().RegisterAllocation(ptr.get(), n * sizeof(T)); + return ptr; + } + void deallocate(pointer ptr, size_t n) { // NOLINT + GlobalMemoryLogger().RegisterDeallocation(ptr.get(), n * sizeof(T)); + SuperT::deallocate(ptr, n); + } +#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 + XGBDefaultDeviceAllocatorImpl() + : SuperT(rmm::cuda_stream_per_thread, rmm::mr::get_current_device_resource()) {} +#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 +}; + +/** + * \brief Caching memory allocator, uses cub::CachingDeviceAllocator as a back-end, unless + * RMM pool allocator is enabled. Does not initialise memory on construction. + */ +template +struct XGBCachingDeviceAllocatorImpl : XGBBaseDeviceAllocator { + using SuperT = XGBBaseDeviceAllocator; + using pointer = thrust::device_ptr; // NOLINT + template + struct rebind // NOLINT + { + using other = XGBCachingDeviceAllocatorImpl; // NOLINT + }; + cub::CachingDeviceAllocator &GetGlobalCachingAllocator() { + // Configure allocator with maximum cached bin size of ~1GB and no limit on + // maximum cached bytes + thread_local std::unique_ptr allocator{ + std::make_unique(2, 9, 29)}; + return *allocator; + } + pointer allocate(size_t n) { // NOLINT + pointer thrust_ptr; + if (use_cub_allocator_) { + T *raw_ptr{nullptr}; + auto errc = GetGlobalCachingAllocator().DeviceAllocate(reinterpret_cast(&raw_ptr), + n * sizeof(T)); + if (errc != cudaSuccess) { + detail::ThrowOOMError("Caching allocator", n * sizeof(T)); + } + thrust_ptr = pointer(raw_ptr); + } else { + try { + thrust_ptr = SuperT::allocate(n); + dh::safe_cuda(cudaGetLastError()); + } catch (const std::exception &e) { + detail::ThrowOOMError(e.what(), n * sizeof(T)); + } + } + GlobalMemoryLogger().RegisterAllocation(thrust_ptr.get(), n * sizeof(T)); + return thrust_ptr; + } + void deallocate(pointer ptr, size_t n) { // NOLINT + GlobalMemoryLogger().RegisterDeallocation(ptr.get(), n * sizeof(T)); + if (use_cub_allocator_) { + GetGlobalCachingAllocator().DeviceFree(ptr.get()); + } else { + SuperT::deallocate(ptr, n); + } + } +#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 + XGBCachingDeviceAllocatorImpl() + : SuperT(rmm::cuda_stream_per_thread, rmm::mr::get_current_device_resource()), + use_cub_allocator_(!xgboost::GlobalConfigThreadLocalStore::Get()->use_rmm) {} +#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 + XGBOOST_DEVICE void construct(T *) {} // NOLINT + private: + bool use_cub_allocator_{true}; +}; +} // namespace detail + +// Declare xgboost allocators +// Replacement of allocator with custom backend should occur here +template +using XGBDeviceAllocator = detail::XGBDefaultDeviceAllocatorImpl; + +/** Be careful that the initialization constructor is a no-op, which means calling + * `vec.resize(n)` won't initialize the memory region to 0. Instead use + * `vec.resize(n, 0)` + */ +template +using XGBCachingDeviceAllocator = detail::XGBCachingDeviceAllocatorImpl; + +/** @brief Specialisation of thrust device vector using custom allocator. 
*/ +template +using device_vector = thrust::device_vector>; // NOLINT +template +using caching_device_vector = thrust::device_vector>; // NOLINT + +#if defined(XGBOOST_USE_RMM) +/** + * @brief Similar to `rmm::logging_resource_adaptor`, but uses XGBoost memory logger instead. + */ +class LoggingResource : public rmm::mr::device_memory_resource { + rmm::mr::device_memory_resource *mr_{rmm::mr::get_current_device_resource()}; + + public: + LoggingResource() = default; + ~LoggingResource() override = default; + LoggingResource(LoggingResource const &) = delete; + LoggingResource &operator=(LoggingResource const &) = delete; + LoggingResource(LoggingResource &&) noexcept = default; + LoggingResource &operator=(LoggingResource &&) noexcept = default; + + [[nodiscard]] rmm::device_async_resource_ref get_upstream_resource() const noexcept { // NOLINT + return mr_; + } + [[nodiscard]] rmm::mr::device_memory_resource *get_upstream() const noexcept { // NOLINT + return mr_; + } + + void *do_allocate(std::size_t bytes, rmm::cuda_stream_view stream) override { // NOLINT + try { + auto const ptr = mr_->allocate(bytes, stream); + GlobalMemoryLogger().RegisterAllocation(ptr, bytes); + return ptr; + } catch (rmm::bad_alloc const &e) { + detail::ThrowOOMError(e.what(), bytes); + } + return nullptr; + } + + void do_deallocate(void *ptr, std::size_t bytes, // NOLINT + rmm::cuda_stream_view stream) override { + mr_->deallocate(ptr, bytes, stream); + GlobalMemoryLogger().RegisterDeallocation(ptr, bytes); + } + + [[nodiscard]] bool do_is_equal( // NOLINT + device_memory_resource const &other) const noexcept override { + if (this == &other) { + return true; + } + auto const *cast = dynamic_cast(&other); + if (cast == nullptr) { + return mr_->is_equal(other); + } + return get_upstream_resource() == cast->get_upstream_resource(); + } +}; + +LoggingResource *GlobalLoggingResource(); + +/** + * @brief Container class that doesn't initialize the data. + */ +template +class DeviceUVector : public rmm::device_uvector { + using Super = rmm::device_uvector; + + public: + static constexpr bool NeedInit() { return true; } + + public: + explicit DeviceUVector(std::size_t n) + : Super{n, rmm::cuda_stream_per_thread, GlobalLoggingResource()} {} +}; + +#else + +/** + * @brief Without RMM, the initialization will happen. 
+ */ +template +class DeviceUVector : thrust::device_vector> { + public: + static constexpr bool NeedInit() { return false; } +}; + +#endif // defined(XGBOOST_USE_RMM) +} // namespace dh diff --git a/src/common/host_device_vector.cc b/src/common/host_device_vector.cc index f4973c0428f0..de9e0614a38e 100644 --- a/src/common/host_device_vector.cc +++ b/src/common/host_device_vector.cc @@ -114,6 +114,11 @@ void HostDeviceVector::Resize(size_t new_size, T v) { impl_->Vec().resize(new_size, v); } +template +void HostDeviceVector::Resize(size_t new_size) { + impl_->Vec().resize(new_size, T{}); +} + template void HostDeviceVector::Fill(T v) { std::fill(HostVector().begin(), HostVector().end(), v); diff --git a/src/common/host_device_vector.cu b/src/common/host_device_vector.cu index 99448df21b7e..87a061ecc45a 100644 --- a/src/common/host_device_vector.cu +++ b/src/common/host_device_vector.cu @@ -1,22 +1,33 @@ /** * Copyright 2017-2023 by XGBoost contributors */ -#include #include +#include #include #include +#include +#include +#include "device_helpers.cuh" +#include "device_vector.cuh" #include "xgboost/data.h" #include "xgboost/host_device_vector.h" #include "xgboost/tree_model.h" -#include "device_helpers.cuh" namespace xgboost { // the handler to call instead of cudaSetDevice; only used for testing static void (*cudaSetDeviceHandler)(int) = nullptr; // NOLINT +template +void InitVectorIfNeeded(std::size_t offset, T const& v, dh::DeviceUVector* out) { + auto& data = *out; + if (std::remove_reference_t::NeedInit()) { + thrust::fill(dh::CachingThrustPolicy(), data.begin() + offset, data.end(), v); + } +} + void SetCudaSetDeviceHandler(void (*handler)(int)) { cudaSetDeviceHandler = handler; } @@ -28,7 +39,8 @@ class HostDeviceVectorImpl { if (device.IsCUDA()) { gpu_access_ = GPUAccess::kWrite; SetDevice(); - data_d_->resize(size, v); + data_d_->resize(size, rmm::cuda_stream_per_thread); + InitVectorIfNeeded(0, v, data_d_.get()); } else { data_h_.resize(size, v); } @@ -66,22 +78,22 @@ class HostDeviceVectorImpl { T* DevicePointer() { LazySyncDevice(GPUAccess::kWrite); - return data_d_->data().get(); + return thrust::raw_pointer_cast(data_d_->data()); } const T* ConstDevicePointer() { LazySyncDevice(GPUAccess::kRead); - return data_d_->data().get(); + return thrust::raw_pointer_cast(data_d_->data()); } common::Span DeviceSpan() { LazySyncDevice(GPUAccess::kWrite); - return {data_d_->data().get(), Size()}; + return {this->DevicePointer(), Size()}; } common::Span ConstDeviceSpan() { LazySyncDevice(GPUAccess::kRead); - return {data_d_->data().get(), Size()}; + return {this->ConstDevicePointer(), Size()}; } void Fill(T v) { // NOLINT @@ -91,7 +103,7 @@ class HostDeviceVectorImpl { gpu_access_ = GPUAccess::kWrite; SetDevice(); auto s_data = dh::ToSpan(*data_d_); - dh::LaunchN(data_d_->size(), + dh::LaunchN(data_d_->size(), dh::DefaultStream(), [=] XGBOOST_DEVICE(size_t i) { s_data[i] = v; }); } } @@ -138,10 +150,9 @@ class HostDeviceVectorImpl { auto ptr = other->ConstDevicePointer(); SetDevice(); CHECK_EQ(this->Device(), other->Device()); - dh::safe_cuda(cudaMemcpyAsync(this->DevicePointer() + ori_size, - ptr, - other->Size() * sizeof(T), - cudaMemcpyDeviceToDevice)); + dh::safe_cuda(cudaMemcpyAsync(this->DevicePointer() + ori_size, ptr, + other->Size() * sizeof(T), cudaMemcpyDeviceToDevice, + dh::DefaultStream())); } } @@ -171,20 +182,55 @@ class HostDeviceVectorImpl { } } - void Resize(size_t new_size, T v) { - if (new_size == Size()) { return; } + auto ResizeImpl(std::size_t new_size) { if ((Size() 
== 0 && device_.IsCUDA()) || (DeviceCanWrite() && device_.IsCUDA())) { // fast on-device resize gpu_access_ = GPUAccess::kWrite; SetDevice(); - data_d_->resize(new_size, v); + auto old_size = data_d_->size(); + data_d_->resize(new_size, rmm::cuda_stream_per_thread); + return std::make_pair(old_size, DeviceOrd::kCUDA); } else { // resize on host LazySyncHost(GPUAccess::kNone); - data_h_.resize(new_size, v); + auto old_size = data_h_.size(); + data_h_.resize(new_size); + return std::make_pair(old_size, DeviceOrd::kCPU); + } + } + + void Resize(std::size_t new_size, T v) { + if (new_size == Size()) { + return; + } + // Track the size of init. + auto [old_size, device] = this->ResizeImpl(new_size); + if (new_size <= old_size) { + return; + } + switch (device) { + case DeviceOrd::kCPU: { + std::fill(data_h_.begin() + old_size, data_h_.end(), v); + break; + } + case DeviceOrd::kCUDA: { + InitVectorIfNeeded(old_size, v, data_d_.get()); + break; + } + default: { + LOG(FATAL) << "Unexpected device type."; + break; + } } } + void Resize(std::size_t new_size) { + if (new_size == Size()) { + return; + } + this->ResizeImpl(new_size); + } + void LazySyncHost(GPUAccess access) { if (HostCanAccess(access)) { return; } if (HostCanRead()) { @@ -195,10 +241,8 @@ class HostDeviceVectorImpl { gpu_access_ = access; if (data_h_.size() != data_d_->size()) { data_h_.resize(data_d_->size()); } SetDevice(); - dh::safe_cuda(cudaMemcpy(data_h_.data(), - data_d_->data().get(), - data_d_->size() * sizeof(T), - cudaMemcpyDeviceToHost)); + dh::safe_cuda(cudaMemcpy(data_h_.data(), thrust::raw_pointer_cast(data_d_->data()), + data_d_->size() * sizeof(T), cudaMemcpyDeviceToHost)); } void LazySyncDevice(GPUAccess access) { @@ -211,10 +255,9 @@ class HostDeviceVectorImpl { // data is on the host LazyResizeDevice(data_h_.size()); SetDevice(); - dh::safe_cuda(cudaMemcpyAsync(data_d_->data().get(), - data_h_.data(), - data_d_->size() * sizeof(T), - cudaMemcpyHostToDevice)); + dh::safe_cuda(cudaMemcpyAsync(thrust::raw_pointer_cast(data_d_->data()), data_h_.data(), + data_d_->size() * sizeof(T), cudaMemcpyHostToDevice, + dh::DefaultStream())); gpu_access_ = access; } @@ -229,7 +272,7 @@ class HostDeviceVectorImpl { private: DeviceOrd device_{DeviceOrd::CPU()}; std::vector data_h_{}; - std::unique_ptr> data_d_{}; + std::unique_ptr> data_d_{}; GPUAccess gpu_access_{GPUAccess::kNone}; void CopyToDevice(HostDeviceVectorImpl* other) { @@ -239,8 +282,10 @@ class HostDeviceVectorImpl { LazyResizeDevice(Size()); gpu_access_ = GPUAccess::kWrite; SetDevice(); - dh::safe_cuda(cudaMemcpyAsync(data_d_->data().get(), other->data_d_->data().get(), - data_d_->size() * sizeof(T), cudaMemcpyDefault)); + dh::safe_cuda(cudaMemcpyAsync(thrust::raw_pointer_cast(data_d_->data()), + thrust::raw_pointer_cast(other->data_d_->data()), + data_d_->size() * sizeof(T), cudaMemcpyDefault, + dh::DefaultStream())); } } @@ -248,14 +293,15 @@ class HostDeviceVectorImpl { LazyResizeDevice(Size()); gpu_access_ = GPUAccess::kWrite; SetDevice(); - dh::safe_cuda(cudaMemcpyAsync(data_d_->data().get(), begin, - data_d_->size() * sizeof(T), cudaMemcpyDefault)); + dh::safe_cuda(cudaMemcpyAsync(thrust::raw_pointer_cast(data_d_->data()), begin, + data_d_->size() * sizeof(T), cudaMemcpyDefault, + dh::DefaultStream())); } void LazyResizeDevice(size_t new_size) { if (data_d_ && new_size == data_d_->size()) { return; } SetDevice(); - data_d_->resize(new_size); + data_d_->resize(new_size, rmm::cuda_stream_per_thread); } void SetDevice() { @@ -267,7 +313,7 @@ class 
HostDeviceVectorImpl { } if (!data_d_) { - data_d_.reset(new dh::device_vector); + data_d_.reset(new dh::DeviceUVector{0}); } } }; @@ -396,6 +442,11 @@ void HostDeviceVector::SetDevice(DeviceOrd device) const { impl_->SetDevice(device); } +template +void HostDeviceVector::Resize(std::size_t new_size) { + impl_->Resize(new_size); +} + template void HostDeviceVector::Resize(size_t new_size, T v) { impl_->Resize(new_size, v); diff --git a/src/common/quantile.cuh b/src/common/quantile.cuh index 898da03a0dce..bfcfe4cccba0 100644 --- a/src/common/quantile.cuh +++ b/src/common/quantile.cuh @@ -4,12 +4,14 @@ #ifndef XGBOOST_COMMON_QUANTILE_CUH_ #define XGBOOST_COMMON_QUANTILE_CUH_ -#include "xgboost/span.h" -#include "xgboost/data.h" +#include // for any_of + +#include "categorical.h" #include "device_helpers.cuh" #include "quantile.h" #include "timer.h" -#include "categorical.h" +#include "xgboost/data.h" +#include "xgboost/span.h" namespace xgboost { namespace common { diff --git a/src/metric/auc.cu b/src/metric/auc.cu index 59199b092839..4155a7084481 100644 --- a/src/metric/auc.cu +++ b/src/metric/auc.cu @@ -1,7 +1,8 @@ /** * Copyright 2021-2024, XGBoost Contributors */ -#include // for copy +#include // for copy +#include // for any_of #include #include diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index aa4f8fa27218..3d9c4e734fb4 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -840,9 +840,7 @@ class GPUHistMaker : public TreeUpdater { out["hist_train_param"] = ToJson(hist_maker_param_); } - ~GPUHistMaker() { // NOLINT - dh::GlobalMemoryLogger().Log(); - } + ~GPUHistMaker() override { dh::GlobalMemoryLogger().Log(); } void Update(TrainParam const* param, linalg::Matrix* gpair, DMatrix* dmat, common::Span> out_position, diff --git a/tests/cpp/common/test_device_vector.cu b/tests/cpp/common/test_device_vector.cu new file mode 100644 index 000000000000..d7d66d87d997 --- /dev/null +++ b/tests/cpp/common/test_device_vector.cu @@ -0,0 +1,20 @@ +/** + * Copyright 2024, XGBoost Contributors + */ +#include + +#include "../../../src/common/device_vector.cuh" +#include "xgboost/global_config.h" // for GlobalConfigThreadLocalStore + +namespace dh { +TEST(DeviceUVector, Basic) { + GlobalMemoryLogger().Clear(); + std::int32_t verbosity{3}; + std::swap(verbosity, xgboost::GlobalConfigThreadLocalStore::Get()->verbosity); + DeviceUVector uvec{12}; + auto peak = GlobalMemoryLogger().PeakMemory(); + auto n_bytes = sizeof(decltype(uvec)::value_type) * uvec.size(); + ASSERT_EQ(peak, n_bytes); + std::swap(verbosity, xgboost::GlobalConfigThreadLocalStore::Get()->verbosity); +} +} // namespace dh diff --git a/tests/cpp/common/test_host_device_vector.cu b/tests/cpp/common/test_host_device_vector.cu index 57e945cba9be..bd138f2fc2b7 100644 --- a/tests/cpp/common/test_host_device_vector.cu +++ b/tests/cpp/common/test_host_device_vector.cu @@ -181,4 +181,34 @@ TEST(HostDeviceVector, Empty) { ASSERT_FALSE(another.Empty()); ASSERT_TRUE(vec.Empty()); } + +TEST(HostDeviceVector, Resize) { + auto check = [&](HostDeviceVector const& vec) { + auto const& h_vec = vec.ConstHostSpan(); + for (std::size_t i = 0; i < 4; ++i) { + ASSERT_EQ(h_vec[i], i + 1); + } + for (std::size_t i = 4; i < vec.Size(); ++i) { + ASSERT_EQ(h_vec[i], 3.0); + } + }; + { + HostDeviceVector vec{1.0f, 2.0f, 3.0f, 4.0f}; + vec.SetDevice(DeviceOrd::CUDA(0)); + vec.ConstDeviceSpan(); + ASSERT_TRUE(vec.DeviceCanRead()); + ASSERT_FALSE(vec.DeviceCanWrite()); + vec.DeviceSpan(); + vec.Resize(7, 3.0f); + 
ASSERT_TRUE(vec.DeviceCanWrite()); + check(vec); + } + { + HostDeviceVector vec{1.0f, 2.0f, 3.0f, 4.0f}; + ASSERT_TRUE(vec.HostCanWrite()); + vec.Resize(7, 3.0f); + ASSERT_TRUE(vec.HostCanWrite()); + check(vec); + } +} } // namespace xgboost::common diff --git a/tests/cpp/data/test_array_interface.h b/tests/cpp/data/test_array_interface.h index 78bce76f53e7..dfe4f5a3ec5c 100644 --- a/tests/cpp/data/test_array_interface.h +++ b/tests/cpp/data/test_array_interface.h @@ -1,15 +1,14 @@ -// Copyright (c) 2019 by Contributors +/** + * Copyright 2019-2024, XGBoost Contributors + */ #include +#include +#include // for device +#include // for sequence #include #include -#include - -#include -#include "../../../src/common/bitfield.h" -#include "../../../src/common/device_helpers.cuh" namespace xgboost { - template Json GenerateDenseColumn(std::string const& typestr, size_t kRows, thrust::device_vector* out_d_data) { From 1853e32cd0e671b6bbe882ef0af6d1931d155d64 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Thu, 4 Jul 2024 22:18:20 +0800 Subject: [PATCH 2/5] resize. --- src/common/device_helpers.cuh | 2 +- src/common/device_vector.cu | 2 ++ src/common/device_vector.cuh | 28 +++++++++++++-- src/common/host_device_vector.cu | 47 +++++++------------------- tests/cpp/common/test_device_vector.cu | 3 +- 5 files changed, 42 insertions(+), 40 deletions(-) diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh index 98a76d72a263..1754c9507036 100644 --- a/src/common/device_helpers.cuh +++ b/src/common/device_helpers.cuh @@ -510,7 +510,7 @@ xgboost::common::Span ToSpan(thrust::device_vector& vec, template xgboost::common::Span ToSpan(DeviceUVector &vec) { - return {vec.data(), vec.size()}; + return {thrust::raw_pointer_cast(vec.data()), vec.size()}; } // thrust begin, similiar to std::begin diff --git a/src/common/device_vector.cu b/src/common/device_vector.cu index 6c1f6e8a4376..50922d8f978e 100644 --- a/src/common/device_vector.cu +++ b/src/common/device_vector.cu @@ -18,8 +18,10 @@ void ThrowOOMError(std::string const &err, size_t bytes) { } } // namespace detail +#if defined(XGBOOST_USE_RMM) LoggingResource *GlobalLoggingResource() { static auto mr{std::make_unique()}; return mr.get(); } +#endif // defined(XGBOOST_USE_RMM) } // namespace dh diff --git a/src/common/device_vector.cuh b/src/common/device_vector.cuh index e2d1f16cbb9e..5e4f327efa56 100644 --- a/src/common/device_vector.cuh +++ b/src/common/device_vector.cuh @@ -8,6 +8,7 @@ #if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 #include // for device_uvector +#include // for exec_policy_nosync #include // for device_memory_resource #include // for get_current_device_resource #include // for thrust_allocator @@ -294,8 +295,20 @@ class DeviceUVector : public rmm::device_uvector { static constexpr bool NeedInit() { return true; } public: - explicit DeviceUVector(std::size_t n) - : Super{n, rmm::cuda_stream_per_thread, GlobalLoggingResource()} {} + DeviceUVector() : Super{0, rmm::cuda_stream_per_thread, GlobalLoggingResource()} {} + + void Resize(std::size_t n) { Super::resize(n, rmm::cuda_stream_per_thread); } + void Resize(std::size_t n, T const &v) { + auto orig = this->size(); + Super::resize(n, rmm::cuda_stream_per_thread); + if (orig < n) { + thrust::fill(rmm::exec_policy_nosync{}, this->begin() + orig, this->end(), v); + } + } + + private: + // undefined private, cannot be accessed. 
+ void resize(std::size_t n, rmm::cuda_stream_view stream); // NOLINT }; #else @@ -304,9 +317,18 @@ class DeviceUVector : public rmm::device_uvector { * @brief Without RMM, the initialization will happen. */ template -class DeviceUVector : thrust::device_vector> { +class DeviceUVector : public thrust::device_vector> { + using Super = thrust::device_vector>; + public: static constexpr bool NeedInit() { return false; } + + void Resize(std::size_t n) { Super::resize(n); } + void Resize(std::size_t n, T const &v) { Super::resize(n, v); } + + private: + // undefined private, cannot be accessed. + void resize(std::size_t n, T const &v = T{}); // NOLINT }; #endif // defined(XGBOOST_USE_RMM) diff --git a/src/common/host_device_vector.cu b/src/common/host_device_vector.cu index 87a061ecc45a..a075ee84bb80 100644 --- a/src/common/host_device_vector.cu +++ b/src/common/host_device_vector.cu @@ -1,19 +1,17 @@ /** - * Copyright 2017-2023 by XGBoost contributors + * Copyright 2017-2024, XGBoost contributors */ -#include #include #include +#include // for size_t #include -#include -#include #include "device_helpers.cuh" -#include "device_vector.cuh" +#include "device_vector.cuh" // for DeviceUVector #include "xgboost/data.h" #include "xgboost/host_device_vector.h" -#include "xgboost/tree_model.h" +#include "xgboost/tree_model.h" // for RegTree namespace xgboost { @@ -39,8 +37,7 @@ class HostDeviceVectorImpl { if (device.IsCUDA()) { gpu_access_ = GPUAccess::kWrite; SetDevice(); - data_d_->resize(size, rmm::cuda_stream_per_thread); - InitVectorIfNeeded(0, v, data_d_.get()); + data_d_->Resize(size, v); } else { data_h_.resize(size, v); } @@ -182,20 +179,19 @@ class HostDeviceVectorImpl { } } - auto ResizeImpl(std::size_t new_size) { + template + auto ResizeImpl(std::size_t new_size, U&&... args) { if ((Size() == 0 && device_.IsCUDA()) || (DeviceCanWrite() && device_.IsCUDA())) { // fast on-device resize gpu_access_ = GPUAccess::kWrite; SetDevice(); auto old_size = data_d_->size(); - data_d_->resize(new_size, rmm::cuda_stream_per_thread); - return std::make_pair(old_size, DeviceOrd::kCUDA); + data_d_->Resize(new_size, std::forward(args)...); } else { // resize on host LazySyncHost(GPUAccess::kNone); auto old_size = data_h_.size(); - data_h_.resize(new_size); - return std::make_pair(old_size, DeviceOrd::kCPU); + data_h_.resize(new_size, std::forward(args)...); } } @@ -203,25 +199,7 @@ class HostDeviceVectorImpl { if (new_size == Size()) { return; } - // Track the size of init. 
- auto [old_size, device] = this->ResizeImpl(new_size); - if (new_size <= old_size) { - return; - } - switch (device) { - case DeviceOrd::kCPU: { - std::fill(data_h_.begin() + old_size, data_h_.end(), v); - break; - } - case DeviceOrd::kCUDA: { - InitVectorIfNeeded(old_size, v, data_d_.get()); - break; - } - default: { - LOG(FATAL) << "Unexpected device type."; - break; - } - } + this->ResizeImpl(new_size, v); } void Resize(std::size_t new_size) { @@ -301,7 +279,7 @@ class HostDeviceVectorImpl { void LazyResizeDevice(size_t new_size) { if (data_d_ && new_size == data_d_->size()) { return; } SetDevice(); - data_d_->resize(new_size, rmm::cuda_stream_per_thread); + data_d_->Resize(new_size); } void SetDevice() { @@ -313,7 +291,7 @@ class HostDeviceVectorImpl { } if (!data_d_) { - data_d_.reset(new dh::DeviceUVector{0}); + data_d_.reset(new dh::DeviceUVector{}); } } }; @@ -478,5 +456,4 @@ template class HostDeviceVector; */ template class HostDeviceVector; #endif // defined(__APPLE__) - } // namespace xgboost diff --git a/tests/cpp/common/test_device_vector.cu b/tests/cpp/common/test_device_vector.cu index d7d66d87d997..95da4ef3f167 100644 --- a/tests/cpp/common/test_device_vector.cu +++ b/tests/cpp/common/test_device_vector.cu @@ -11,7 +11,8 @@ TEST(DeviceUVector, Basic) { GlobalMemoryLogger().Clear(); std::int32_t verbosity{3}; std::swap(verbosity, xgboost::GlobalConfigThreadLocalStore::Get()->verbosity); - DeviceUVector uvec{12}; + DeviceUVector uvec; + uvec.Resize(12); auto peak = GlobalMemoryLogger().PeakMemory(); auto n_bytes = sizeof(decltype(uvec)::value_type) * uvec.size(); ASSERT_EQ(peak, n_bytes); From 3c9579c026383d42ab0d359e539b33bfa6389ceb Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Thu, 4 Jul 2024 23:33:27 +0800 Subject: [PATCH 3/5] Cleanup. --- src/common/device_vector.cuh | 5 ---- src/common/host_device_vector.cu | 26 ++++----------------- tests/cpp/common/test_host_device_vector.cu | 9 ++++++- 3 files changed, 13 insertions(+), 27 deletions(-) diff --git a/src/common/device_vector.cuh b/src/common/device_vector.cuh index 5e4f327efa56..35386856cc9c 100644 --- a/src/common/device_vector.cuh +++ b/src/common/device_vector.cuh @@ -291,9 +291,6 @@ template class DeviceUVector : public rmm::device_uvector { using Super = rmm::device_uvector; - public: - static constexpr bool NeedInit() { return true; } - public: DeviceUVector() : Super{0, rmm::cuda_stream_per_thread, GlobalLoggingResource()} {} @@ -321,8 +318,6 @@ class DeviceUVector : public thrust::device_vector> { using Super = thrust::device_vector>; public: - static constexpr bool NeedInit() { return false; } - void Resize(std::size_t n) { Super::resize(n); } void Resize(std::size_t n, T const &v) { Super::resize(n, v); } diff --git a/src/common/host_device_vector.cu b/src/common/host_device_vector.cu index a075ee84bb80..967af9a632ca 100644 --- a/src/common/host_device_vector.cu +++ b/src/common/host_device_vector.cu @@ -18,14 +18,6 @@ namespace xgboost { // the handler to call instead of cudaSetDevice; only used for testing static void (*cudaSetDeviceHandler)(int) = nullptr; // NOLINT -template -void InitVectorIfNeeded(std::size_t offset, T const& v, dh::DeviceUVector* out) { - auto& data = *out; - if (std::remove_reference_t::NeedInit()) { - thrust::fill(dh::CachingThrustPolicy(), data.begin() + offset, data.end(), v); - } -} - void SetCudaSetDeviceHandler(void (*handler)(int)) { cudaSetDeviceHandler = handler; } @@ -181,6 +173,9 @@ class HostDeviceVectorImpl { template auto ResizeImpl(std::size_t new_size, U&&... 
args) { + if (new_size == Size()) { + return; + } if ((Size() == 0 && device_.IsCUDA()) || (DeviceCanWrite() && device_.IsCUDA())) { // fast on-device resize gpu_access_ = GPUAccess::kWrite; @@ -195,19 +190,8 @@ class HostDeviceVectorImpl { } } - void Resize(std::size_t new_size, T v) { - if (new_size == Size()) { - return; - } - this->ResizeImpl(new_size, v); - } - - void Resize(std::size_t new_size) { - if (new_size == Size()) { - return; - } - this->ResizeImpl(new_size); - } + void Resize(std::size_t new_size, T v) { this->ResizeImpl(new_size, v); } + void Resize(std::size_t new_size) { this->ResizeImpl(new_size); } void LazySyncHost(GPUAccess access) { if (HostCanAccess(access)) { return; } diff --git a/tests/cpp/common/test_host_device_vector.cu b/tests/cpp/common/test_host_device_vector.cu index bd138f2fc2b7..a0aa5fa11fce 100644 --- a/tests/cpp/common/test_host_device_vector.cu +++ b/tests/cpp/common/test_host_device_vector.cu @@ -1,5 +1,5 @@ /** - * Copyright 2018-2023 XGBoost contributors + * Copyright 2018-2024, XGBoost contributors */ #include #include @@ -203,6 +203,13 @@ TEST(HostDeviceVector, Resize) { ASSERT_TRUE(vec.DeviceCanWrite()); check(vec); } + { + HostDeviceVector vec{{1.0f, 2.0f, 3.0f, 4.0f}, DeviceOrd::CUDA(0)}; + ASSERT_TRUE(vec.DeviceCanWrite()); + vec.Resize(7, 3.0f); + ASSERT_TRUE(vec.DeviceCanWrite()); + check(vec); + } { HostDeviceVector vec{1.0f, 2.0f, 3.0f, 4.0f}; ASSERT_TRUE(vec.HostCanWrite()); From b225a7c2208678139396433166d6715947235d37 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Fri, 5 Jul 2024 01:50:12 +0800 Subject: [PATCH 4/5] Fix. --- src/common/quantile.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/common/quantile.cuh b/src/common/quantile.cuh index bfcfe4cccba0..3dd393755852 100644 --- a/src/common/quantile.cuh +++ b/src/common/quantile.cuh @@ -102,9 +102,9 @@ class SketchContainer { CHECK(device.IsCUDA()); // Initialize Sketches for this dmatrix this->columns_ptr_.SetDevice(device_); - this->columns_ptr_.Resize(num_columns + 1); + this->columns_ptr_.Resize(num_columns + 1, 0); this->columns_ptr_b_.SetDevice(device_); - this->columns_ptr_b_.Resize(num_columns + 1); + this->columns_ptr_b_.Resize(num_columns + 1, 0); this->feature_types_.Resize(feature_types.Size()); this->feature_types_.Copy(feature_types); From a016a8ed91e8778d530cfb10412dfa6f1b041b71 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Thu, 11 Jul 2024 15:23:25 +0800 Subject: [PATCH 5/5] Small cleanup. --- include/xgboost/host_device_vector.h | 2 +- src/common/host_device_vector.cu | 9 +++------ 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/include/xgboost/host_device_vector.h b/include/xgboost/host_device_vector.h index c4d12da53079..36c7ed32b83d 100644 --- a/include/xgboost/host_device_vector.h +++ b/include/xgboost/host_device_vector.h @@ -137,7 +137,7 @@ class HostDeviceVector { void Resize(std::size_t new_size); /** @brief Resize and initialize the data if the new size is larger than the old size. 
 */
-  void Resize(size_t new_size, T v);
+  void Resize(std::size_t new_size, T v);

   using value_type = T;  // NOLINT

diff --git a/src/common/host_device_vector.cu b/src/common/host_device_vector.cu
index 967af9a632ca..16a1aa027f09 100644
--- a/src/common/host_device_vector.cu
+++ b/src/common/host_device_vector.cu
@@ -129,7 +129,7 @@ class HostDeviceVectorImpl {

   void Extend(HostDeviceVectorImpl* other) {
     auto ori_size = this->Size();
-    this->Resize(ori_size + other->Size(), T());
+    this->Resize(ori_size + other->Size(), T{});
     if (HostCanWrite() && other->HostCanRead()) {
       auto& h_vec = this->HostVector();
       auto& other_vec = other->HostVector();
@@ -172,7 +172,7 @@ class HostDeviceVectorImpl {
   }

   template <typename... U>
-  auto ResizeImpl(std::size_t new_size, U&&... args) {
+  auto Resize(std::size_t new_size, U&&... args) {
     if (new_size == Size()) {
       return;
     }
@@ -190,9 +190,6 @@ class HostDeviceVectorImpl {
     }
   }

-  void Resize(std::size_t new_size, T v) { this->ResizeImpl(new_size, v); }
-  void Resize(std::size_t new_size) { this->ResizeImpl(new_size); }
-
   void LazySyncHost(GPUAccess access) {
     if (HostCanAccess(access)) { return; }
@@ -410,7 +407,7 @@ void HostDeviceVector<T>::Resize(std::size_t new_size) {
 }

 template <typename T>
-void HostDeviceVector<T>::Resize(size_t new_size, T v) {
+void HostDeviceVector<T>::Resize(std::size_t new_size, T v) {
   impl_->Resize(new_size, v);
 }
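
---

A minimal usage sketch of the split `Resize` API introduced by this series
(illustrative, not part of the diffs).  The vector construction, `SetDevice`,
`DeviceSpan`, and both `Resize` overloads come from the patched
`include/xgboost/host_device_vector.h` and the tests above; the header path
for `DeviceOrd` is an assumption, as is the exact behaviour on non-RMM builds.

#include "xgboost/context.h"             // for DeviceOrd (assumed location)
#include "xgboost/host_device_vector.h"  // for HostDeviceVector

void ResizeSketch() {
  xgboost::HostDeviceVector<float> vec{1.0f, 2.0f, 3.0f, 4.0f};
  vec.SetDevice(xgboost::DeviceOrd::CUDA(0));
  vec.DeviceSpan();  // acquire device write access so the resize stays on-device

  // No-initializer overload: on an RMM-backed build, dh::DeviceUVector leaves
  // the new tail [4, 32) uninitialized instead of launching a fill kernel.
  vec.Resize(32);

  // Initializing overload: fills only the newly added tail [32, 64) with
  // 3.0f, the behaviour checked in tests/cpp/common/test_host_device_vector.cu.
  vec.Resize(64, 3.0f);
}

The point of the split is that `thrust::device_vector::resize` generally
value-initializes new elements with a kernel launch; callers that overwrite
the buffer immediately can now call `Resize(n)` and skip that work on the
RMM path.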