From 7e770a865303433c225707fbf9d6c27ffdf96388 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Thu, 10 Aug 2023 02:24:40 +0800 Subject: [PATCH] Use `realloc` for histogram cache and expose the cache limit. --- doc/parameter.rst | 9 +++++ python-package/xgboost/testing/params.py | 2 +- src/common/ref_resource_view.h | 37 ++++++++++++++++++++- src/tree/hist/hist_cache.h | 22 +++++++----- src/tree/hist/histogram.h | 2 +- src/tree/hist/param.h | 4 +-- src/tree/updater_gpu_hist.cu | 3 ++ tests/cpp/tree/hist/test_evaluate_splits.cc | 8 ++--- tests/cpp/tree/hist/test_histogram.cc | 2 +- tests/cpp/tree/test_evaluate_splits.h | 2 +- 10 files changed, 71 insertions(+), 20 deletions(-) diff --git a/doc/parameter.rst b/doc/parameter.rst index 1b1bb80a41cd..fdb4b8357867 100644 --- a/doc/parameter.rst +++ b/doc/parameter.rst @@ -226,6 +226,15 @@ Parameters for Tree Booster - ``one_output_per_tree``: One model for each target. - ``multi_output_tree``: Use multi-target trees. +* ``max_cached_hist_node``, [default = 65536] + + Maximum number of cached nodes for CPU histogram. + + .. versionadded:: 2.0.0 + + - In most cases this parameter should not be set, except when growing deep trees + on CPU. + .. 
_cat-param: Parameters for Categorical Feature diff --git a/python-package/xgboost/testing/params.py b/python-package/xgboost/testing/params.py index 4ed8f4c4e62d..6b47f4a01a0c 100644 --- a/python-package/xgboost/testing/params.py +++ b/python-package/xgboost/testing/params.py @@ -42,7 +42,7 @@ ) hist_cache_strategy = strategies.fixed_dictionaries( - {"internal_max_cached_hist_node": strategies.sampled_from([1, 4, 1024, 2**31])} + {"max_cached_hist_node": strategies.sampled_from([1, 4, 1024, 2**31])} ) hist_multi_parameter_strategy = strategies.fixed_dictionaries( diff --git a/src/common/ref_resource_view.h b/src/common/ref_resource_view.h index 2804d79eb7ab..0fadf846dd5e 100644 --- a/src/common/ref_resource_view.h +++ b/src/common/ref_resource_view.h @@ -35,6 +35,13 @@ class RefResourceView { size_type size_{0}; std::shared_ptr mem_{nullptr}; + protected: + void Init(value_type* ptr, size_type size, std::shared_ptr mem) { + ptr_ = ptr; + size_ = size; + mem_ = std::move(mem); + } + public: RefResourceView(value_type* ptr, size_type n, std::shared_ptr mem) : ptr_{ptr}, size_{n}, mem_{std::move(mem)} { @@ -60,11 +67,11 @@ class RefResourceView { RefResourceView() = default; RefResourceView(RefResourceView const& that) = delete; - RefResourceView(RefResourceView&& that) = delete; RefResourceView& operator=(RefResourceView const& that) = delete; /** * @brief We allow move assignment for lazy initialization. 
*/ + RefResourceView(RefResourceView&& that) = default; RefResourceView& operator=(RefResourceView&& that) = default; [[nodiscard]] size_type size() const { return size_; } // NOLINT @@ -154,5 +161,33 @@ template auto resource = std::make_shared(n_elements * sizeof(T)); return RefResourceView{resource->DataAs(), n_elements, resource, init}; } + +template +class ReallocVector : public RefResourceView { + static_assert(!std::is_reference_v); + static_assert(!std::is_const_v); + static_assert(std::is_trivially_copyable_v); + + using Upper = RefResourceView; + using size_type = typename Upper::size_type; // NOLINT + using value_type = typename Upper::value_type; // NOLINT + + public: + ReallocVector() : RefResourceView{MakeFixedVecWithMalloc(0, T{})} {} + + ReallocVector(size_type n, value_type const& init) + : RefResourceView{MakeFixedVecWithMalloc(n, init)} {} + ReallocVector(ReallocVector const& that) = delete; + ReallocVector(ReallocVector&& that) = delete; + ReallocVector& operator=(ReallocVector const& that) = delete; + ReallocVector& operator=(ReallocVector&& that) = delete; + + void Resize(typename Upper::size_type new_size) { + auto resource = std::dynamic_pointer_cast(this->Resource()); + CHECK(resource); + resource->Resize(new_size * sizeof(T)); + this->Init(resource->template DataAs(), new_size, resource); + } +}; } // namespace xgboost::common #endif // XGBOOST_COMMON_REF_RESOURCE_VIEW_H_ diff --git a/src/tree/hist/hist_cache.h b/src/tree/hist/hist_cache.h index 79e5d9bad673..8a2ba193af0c 100644 --- a/src/tree/hist/hist_cache.h +++ b/src/tree/hist/hist_cache.h @@ -5,12 +5,14 @@ #define XGBOOST_TREE_HIST_HIST_CACHE_H_ #include // for size_t #include // for map +#include // for unique_ptr #include // for vector -#include "../../common/hist_util.h" // for GHistRow, ConstGHistRow -#include "xgboost/base.h" // for bst_node_t, bst_bin_t -#include "xgboost/logging.h" // for CHECK_GT -#include "xgboost/span.h" // for Span +#include "../../common/hist_util.h" // 
for GHistRow, ConstGHistRow +#include "../../common/ref_resource_view.h" // for ReallocVector +#include "xgboost/base.h" // for bst_node_t, bst_bin_t +#include "xgboost/logging.h" // for CHECK_GT +#include "xgboost/span.h" // for Span namespace xgboost::tree { /** @@ -32,7 +34,8 @@ class BoundedHistCollection { std::size_t current_size_{0}; // stores the histograms in a contiguous buffer - std::vector data_; + using Vec = common::ReallocVector; + std::unique_ptr data_{new Vec{}}; // nvcc 12.1 trips over std::make_unique // number of histogram bins across all features bst_bin_t n_total_bins_{0}; @@ -42,13 +45,14 @@ class BoundedHistCollection { bool has_exceeded_{false}; public: + BoundedHistCollection() = default; common::GHistRow operator[](std::size_t idx) { auto offset = node_map_.at(idx); - return common::Span{data_.data(), data_.size()}.subspan(offset, n_total_bins_); + return common::Span{data_->data(), data_->size()}.subspan(offset, n_total_bins_); } common::ConstGHistRow operator[](std::size_t idx) const { auto offset = node_map_.at(idx); - return common::Span{data_.data(), data_.size()}.subspan(offset, n_total_bins_); + return common::Span{data_->data(), data_->size()}.subspan(offset, n_total_bins_); } void Reset(bst_bin_t n_total_bins, std::size_t n_cached_nodes) { n_total_bins_ = n_total_bins; @@ -81,8 +85,8 @@ class BoundedHistCollection { auto n_new_nodes = nodes_to_build.size() + nodes_to_sub.size(); auto alloc_size = n_new_nodes * n_total_bins_; auto new_size = alloc_size + current_size_; - if (new_size > data_.size()) { - data_.resize(new_size); + if (new_size > data_->size()) { + data_->Resize(new_size); } for (auto nidx : nodes_to_build) { node_map_[nidx] = current_size_; diff --git a/src/tree/hist/histogram.h b/src/tree/hist/histogram.h index 54c71688766c..f378c78089db 100644 --- a/src/tree/hist/histogram.h +++ b/src/tree/hist/histogram.h @@ -63,7 +63,7 @@ class HistogramBuilder { bool is_col_split, HistMakerTrainParam const *param) { n_threads_ 
= ctx->Threads(); param_ = p; - hist_.Reset(total_bins, param->internal_max_cached_hist_node); + hist_.Reset(total_bins, param->max_cached_hist_node); buffer_.Init(total_bins); is_distributed_ = is_distributed; is_col_split_ = is_col_split; diff --git a/src/tree/hist/param.h b/src/tree/hist/param.h index 0f2f4ac00ada..8757b65e6610 100644 --- a/src/tree/hist/param.h +++ b/src/tree/hist/param.h @@ -13,7 +13,7 @@ struct HistMakerTrainParam : public XGBoostParameter { constexpr static std::size_t DefaultNodes() { return static_cast(1) << 16; } bool debug_synchronize{false}; - std::size_t internal_max_cached_hist_node{DefaultNodes()}; + std::size_t max_cached_hist_node{DefaultNodes()}; void CheckTreesSynchronized(RegTree const* local_tree) const; @@ -22,7 +22,7 @@ struct HistMakerTrainParam : public XGBoostParameter { DMLC_DECLARE_FIELD(debug_synchronize) .set_default(false) .describe("Check if all distributed tree are identical after tree construction."); - DMLC_DECLARE_FIELD(internal_max_cached_hist_node) + DMLC_DECLARE_FIELD(max_cached_hist_node) .set_default(DefaultNodes()) .set_lower_bound(1) .describe("Maximum number of nodes in CPU histogram cache. 
Only for internal usage."); diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 0403c7881df7..5cce89e2cd57 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -866,6 +866,9 @@ class GPUGlobalApproxMaker : public TreeUpdater { // Used in test to count how many configurations are performed LOG(DEBUG) << "[GPU Approx]: Configure"; hist_maker_param_.UpdateAllowUnknown(args); + if (hist_maker_param_.max_cached_hist_node != HistMakerTrainParam::DefaultNodes()) { + LOG(WARNING) << "The `max_cached_hist_node` is ignored in GPU."; + } dh::CheckComputeCapability(); initialised_ = false; diff --git a/tests/cpp/tree/hist/test_evaluate_splits.cc b/tests/cpp/tree/hist/test_evaluate_splits.cc index 1685a3c801dc..095284a38850 100644 --- a/tests/cpp/tree/hist/test_evaluate_splits.cc +++ b/tests/cpp/tree/hist/test_evaluate_splits.cc @@ -51,7 +51,7 @@ void TestEvaluateSplits(bool force_read_by_column) { row_set_collection.Init(); HistMakerTrainParam hist_param; - hist.Reset(gmat.cut.Ptrs().back(), hist_param.internal_max_cached_hist_node); + hist.Reset(gmat.cut.Ptrs().back(), hist_param.max_cached_hist_node); hist.AllocateHistograms({0}); common::BuildHist(row_gpairs, row_set_collection[0], gmat, hist[0], force_read_by_column); @@ -118,7 +118,7 @@ TEST(HistMultiEvaluator, Evaluate) { linalg::Vector root_sum({2}, Context::kCpuId); for (bst_target_t t{0}; t < n_targets; ++t) { auto &hist = histogram[t]; - hist.Reset(n_bins * n_features, hist_param.internal_max_cached_hist_node); + hist.Reset(n_bins * n_features, hist_param.max_cached_hist_node); hist.AllocateHistograms({0}); auto node_hist = hist[0]; node_hist[0] = {-0.5, 0.5}; @@ -235,7 +235,7 @@ auto CompareOneHotAndPartition(bool onehot) { entries.front().nid = 0; entries.front().depth = 0; - hist.Reset(gmat.cut.TotalBins(), hist_param.internal_max_cached_hist_node); + hist.Reset(gmat.cut.TotalBins(), hist_param.max_cached_hist_node); hist.AllocateHistograms({0}); auto node_hist 
= hist[0]; @@ -265,7 +265,7 @@ TEST(HistEvaluator, Categorical) { TEST_F(TestCategoricalSplitWithMissing, HistEvaluator) { BoundedHistCollection hist; HistMakerTrainParam hist_param; - hist.Reset(cuts_.TotalBins(), hist_param.internal_max_cached_hist_node); + hist.Reset(cuts_.TotalBins(), hist_param.max_cached_hist_node); hist.AllocateHistograms({0}); auto node_hist = hist[0]; ASSERT_EQ(node_hist.size(), feature_histogram_.size()); diff --git a/tests/cpp/tree/hist/test_histogram.cc b/tests/cpp/tree/hist/test_histogram.cc index b90b43101ebe..8949b5f4be2a 100644 --- a/tests/cpp/tree/hist/test_histogram.cc +++ b/tests/cpp/tree/hist/test_histogram.cc @@ -516,7 +516,7 @@ class OverflowTest : public ::testing::TestWithParam> { Context ctx; HistMakerTrainParam hist_param; if (limit) { - hist_param.Init(Args{{"internal_max_cached_hist_node", "1"}}); + hist_param.Init(Args{{"max_cached_hist_node", "1"}}); } std::shared_ptr Xy = diff --git a/tests/cpp/tree/test_evaluate_splits.h b/tests/cpp/tree/test_evaluate_splits.h index 04da4777dc3d..6cb75e23b0dd 100644 --- a/tests/cpp/tree/test_evaluate_splits.h +++ b/tests/cpp/tree/test_evaluate_splits.h @@ -59,7 +59,7 @@ class TestPartitionBasedSplit : public ::testing::Test { cuts_.min_vals_.Resize(1); HistMakerTrainParam hist_param; - hist_.Reset(cuts_.TotalBins(), hist_param.internal_max_cached_hist_node); + hist_.Reset(cuts_.TotalBins(), hist_param.max_cached_hist_node); hist_.AllocateHistograms({0}); auto node_hist = hist_[0];