Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use realloc for histogram cache and expose the cache limit. #9455

Merged
merged 1 commit into from
Aug 10, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions doc/parameter.rst
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,15 @@ Parameters for Tree Booster
- ``one_output_per_tree``: One model for each target.
- ``multi_output_tree``: Use multi-target trees.

* ``max_cached_hist_node``, [default = 65536]

Maximum number of cached nodes for CPU histogram.

.. versionadded:: 2.0.0

- In most cases this parameter should not be set; it is only relevant when growing
deep trees on CPU.

.. _cat-param:

Parameters for Categorical Feature
Expand Down
2 changes: 1 addition & 1 deletion python-package/xgboost/testing/params.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@
)

hist_cache_strategy = strategies.fixed_dictionaries(
{"internal_max_cached_hist_node": strategies.sampled_from([1, 4, 1024, 2**31])}
{"max_cached_hist_node": strategies.sampled_from([1, 4, 1024, 2**31])}
)

hist_multi_parameter_strategy = strategies.fixed_dictionaries(
Expand Down
37 changes: 36 additions & 1 deletion src/common/ref_resource_view.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,13 @@ class RefResourceView {
size_type size_{0};
std::shared_ptr<common::ResourceHandler> mem_{nullptr};

protected:
/**
 * @brief Re-point this view at a (possibly different) buffer.
 *
 * @param ptr  Pointer to the first element the view should expose.
 * @param size Number of elements reachable through @p ptr.
 * @param mem  Handle of the resource that owns the storage; the view keeps it so the
 *             storage stays alive for as long as the view refers to it.
 */
void Init(value_type* ptr, size_type size, std::shared_ptr<common::ResourceHandler> mem) {
  // Take over the owning handle, then record the raw view into it.
  this->mem_ = std::move(mem);
  this->ptr_ = ptr;
  this->size_ = size;
}

public:
RefResourceView(value_type* ptr, size_type n, std::shared_ptr<common::ResourceHandler> mem)
: ptr_{ptr}, size_{n}, mem_{std::move(mem)} {
Expand All @@ -60,11 +67,11 @@ class RefResourceView {

RefResourceView() = default;
RefResourceView(RefResourceView const& that) = delete;
RefResourceView(RefResourceView&& that) = delete;
RefResourceView& operator=(RefResourceView const& that) = delete;
/**
* @brief We allow move construction and move assignment for lazy initialization.
*/
RefResourceView(RefResourceView&& that) = default;
RefResourceView& operator=(RefResourceView&& that) = default;

[[nodiscard]] size_type size() const { return size_; } // NOLINT
Expand Down Expand Up @@ -154,5 +161,33 @@ template <typename T>
auto resource = std::make_shared<common::MallocResource>(n_elements * sizeof(T));
return RefResourceView{resource->DataAs<T>(), n_elements, resource, init};
}

/**
 * @brief A resizable array built on top of RefResourceView.
 *
 * The storage is created through MakeFixedVecWithMalloc, and Resize() requires the
 * underlying resource to be a common::MallocResource. The element type must be a
 * non-const, non-reference, trivially copyable type (enforced by the static
 * assertions below).
 *
 * The object itself can be neither copied nor moved; hold it behind a pointer if it
 * needs to be re-seated.
 */
template <typename T>
class ReallocVector : public RefResourceView<T> {
// Compile-time constraints on the element type.
static_assert(!std::is_reference_v<T>);
static_assert(!std::is_const_v<T>);
static_assert(std::is_trivially_copyable_v<T>);

// Shorthand for the base class and its member types. These aliases are declared in
// the private section of this class.
using Upper = RefResourceView<T>;
using size_type = typename Upper::size_type; // NOLINT
using value_type = typename Upper::value_type; // NOLINT

public:
/**
 * @brief Construct an empty vector (zero elements) backed by a malloc resource.
 */
ReallocVector() : RefResourceView<T>{MakeFixedVecWithMalloc(0, T{})} {}

/**
 * @brief Construct a vector with a given number of elements.
 *
 * @param n    Number of elements.
 * @param init Initial value forwarded to MakeFixedVecWithMalloc.
 */
ReallocVector(size_type n, value_type const& init)
: RefResourceView<T>{MakeFixedVecWithMalloc(n, init)} {}
// Copy and move are disabled for this type, even though the base class permits moves.
ReallocVector(ReallocVector const& that) = delete;
ReallocVector(ReallocVector&& that) = delete;
ReallocVector& operator=(ReallocVector const& that) = delete;
ReallocVector& operator=(ReallocVector&& that) = delete;

/**
 * @brief Change the number of elements to @p new_size.
 *
 * The underlying resource is asked to resize itself to `new_size * sizeof(T)` bytes,
 * then the view is re-initialised from the resource's current data pointer, so the
 * value returned by data() may differ after this call.
 *
 * NOTE(review): whether existing elements are preserved and how newly added elements
 * are initialised is decided by MallocResource::Resize, which is not visible here --
 * presumably realloc-like semantics; confirm against its implementation.
 *
 * @param new_size New number of elements.
 */
void Resize(typename Upper::size_type new_size) {
// The stored handle is type-erased; resizing is only supported for MallocResource.
auto resource = std::dynamic_pointer_cast<common::MallocResource>(this->Resource());
CHECK(resource);
resource->Resize(new_size * sizeof(T));
// Refresh the pointer and size held by the base view; the resource handle is unchanged.
this->Init(resource->template DataAs<T>(), new_size, resource);
}
};
} // namespace xgboost::common
#endif // XGBOOST_COMMON_REF_RESOURCE_VIEW_H_
22 changes: 13 additions & 9 deletions src/tree/hist/hist_cache.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,14 @@
#define XGBOOST_TREE_HIST_HIST_CACHE_H_
#include <cstddef> // for size_t
#include <map> // for map
#include <memory> // for unique_ptr
#include <vector> // for vector

#include "../../common/hist_util.h" // for GHistRow, ConstGHistRow
#include "xgboost/base.h" // for bst_node_t, bst_bin_t
#include "xgboost/logging.h" // for CHECK_GT
#include "xgboost/span.h" // for Span
#include "../../common/hist_util.h" // for GHistRow, ConstGHistRow
#include "../../common/ref_resource_view.h" // for ReallocVector
#include "xgboost/base.h" // for bst_node_t, bst_bin_t
#include "xgboost/logging.h" // for CHECK_GT
#include "xgboost/span.h" // for Span

namespace xgboost::tree {
/**
Expand All @@ -32,7 +34,8 @@ class BoundedHistCollection {
std::size_t current_size_{0};

// stores the histograms in a contiguous buffer
std::vector<GradientPairPrecise> data_;
using Vec = common::ReallocVector<GradientPairPrecise>;
std::unique_ptr<Vec> data_{new Vec{}}; // nvcc 12.1 trips over std::make_unique

// number of histogram bins across all features
bst_bin_t n_total_bins_{0};
Expand All @@ -42,13 +45,14 @@ class BoundedHistCollection {
bool has_exceeded_{false};

public:
BoundedHistCollection() = default;
common::GHistRow operator[](std::size_t idx) {
auto offset = node_map_.at(idx);
return common::Span{data_.data(), data_.size()}.subspan(offset, n_total_bins_);
return common::Span{data_->data(), data_->size()}.subspan(offset, n_total_bins_);
}
common::ConstGHistRow operator[](std::size_t idx) const {
auto offset = node_map_.at(idx);
return common::Span{data_.data(), data_.size()}.subspan(offset, n_total_bins_);
return common::Span{data_->data(), data_->size()}.subspan(offset, n_total_bins_);
}
void Reset(bst_bin_t n_total_bins, std::size_t n_cached_nodes) {
n_total_bins_ = n_total_bins;
Expand Down Expand Up @@ -81,8 +85,8 @@ class BoundedHistCollection {
auto n_new_nodes = nodes_to_build.size() + nodes_to_sub.size();
auto alloc_size = n_new_nodes * n_total_bins_;
auto new_size = alloc_size + current_size_;
if (new_size > data_.size()) {
data_.resize(new_size);
if (new_size > data_->size()) {
data_->Resize(new_size);
}
for (auto nidx : nodes_to_build) {
node_map_[nidx] = current_size_;
Expand Down
2 changes: 1 addition & 1 deletion src/tree/hist/histogram.h
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ class HistogramBuilder {
bool is_col_split, HistMakerTrainParam const *param) {
n_threads_ = ctx->Threads();
param_ = p;
hist_.Reset(total_bins, param->internal_max_cached_hist_node);
hist_.Reset(total_bins, param->max_cached_hist_node);
buffer_.Init(total_bins);
is_distributed_ = is_distributed;
is_col_split_ = is_col_split;
Expand Down
4 changes: 2 additions & 2 deletions src/tree/hist/param.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ struct HistMakerTrainParam : public XGBoostParameter<HistMakerTrainParam> {
constexpr static std::size_t DefaultNodes() { return static_cast<std::size_t>(1) << 16; }

bool debug_synchronize{false};
std::size_t internal_max_cached_hist_node{DefaultNodes()};
std::size_t max_cached_hist_node{DefaultNodes()};

void CheckTreesSynchronized(RegTree const* local_tree) const;

Expand All @@ -22,7 +22,7 @@ struct HistMakerTrainParam : public XGBoostParameter<HistMakerTrainParam> {
DMLC_DECLARE_FIELD(debug_synchronize)
.set_default(false)
.describe("Check if all distributed tree are identical after tree construction.");
DMLC_DECLARE_FIELD(internal_max_cached_hist_node)
DMLC_DECLARE_FIELD(max_cached_hist_node)
.set_default(DefaultNodes())
.set_lower_bound(1)
.describe("Maximum number of nodes in CPU histogram cache. Only for internal usage.");
Expand Down
3 changes: 3 additions & 0 deletions src/tree/updater_gpu_hist.cu
Original file line number Diff line number Diff line change
Expand Up @@ -866,6 +866,9 @@ class GPUGlobalApproxMaker : public TreeUpdater {
// Used in test to count how many configurations are performed
LOG(DEBUG) << "[GPU Approx]: Configure";
hist_maker_param_.UpdateAllowUnknown(args);
if (hist_maker_param_.max_cached_hist_node != HistMakerTrainParam::DefaultNodes()) {
LOG(WARNING) << "The `max_cached_hist_node` is ignored in GPU.";
}
dh::CheckComputeCapability();
initialised_ = false;

Expand Down
8 changes: 4 additions & 4 deletions tests/cpp/tree/hist/test_evaluate_splits.cc
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ void TestEvaluateSplits(bool force_read_by_column) {
row_set_collection.Init();

HistMakerTrainParam hist_param;
hist.Reset(gmat.cut.Ptrs().back(), hist_param.internal_max_cached_hist_node);
hist.Reset(gmat.cut.Ptrs().back(), hist_param.max_cached_hist_node);
hist.AllocateHistograms({0});
common::BuildHist<false>(row_gpairs, row_set_collection[0], gmat, hist[0], force_read_by_column);

Expand Down Expand Up @@ -118,7 +118,7 @@ TEST(HistMultiEvaluator, Evaluate) {
linalg::Vector<GradientPairPrecise> root_sum({2}, Context::kCpuId);
for (bst_target_t t{0}; t < n_targets; ++t) {
auto &hist = histogram[t];
hist.Reset(n_bins * n_features, hist_param.internal_max_cached_hist_node);
hist.Reset(n_bins * n_features, hist_param.max_cached_hist_node);
hist.AllocateHistograms({0});
auto node_hist = hist[0];
node_hist[0] = {-0.5, 0.5};
Expand Down Expand Up @@ -235,7 +235,7 @@ auto CompareOneHotAndPartition(bool onehot) {
entries.front().nid = 0;
entries.front().depth = 0;

hist.Reset(gmat.cut.TotalBins(), hist_param.internal_max_cached_hist_node);
hist.Reset(gmat.cut.TotalBins(), hist_param.max_cached_hist_node);
hist.AllocateHistograms({0});
auto node_hist = hist[0];

Expand Down Expand Up @@ -265,7 +265,7 @@ TEST(HistEvaluator, Categorical) {
TEST_F(TestCategoricalSplitWithMissing, HistEvaluator) {
BoundedHistCollection hist;
HistMakerTrainParam hist_param;
hist.Reset(cuts_.TotalBins(), hist_param.internal_max_cached_hist_node);
hist.Reset(cuts_.TotalBins(), hist_param.max_cached_hist_node);
hist.AllocateHistograms({0});
auto node_hist = hist[0];
ASSERT_EQ(node_hist.size(), feature_histogram_.size());
Expand Down
2 changes: 1 addition & 1 deletion tests/cpp/tree/hist/test_histogram.cc
Original file line number Diff line number Diff line change
Expand Up @@ -516,7 +516,7 @@ class OverflowTest : public ::testing::TestWithParam<std::tuple<bool, bool>> {
Context ctx;
HistMakerTrainParam hist_param;
if (limit) {
hist_param.Init(Args{{"internal_max_cached_hist_node", "1"}});
hist_param.Init(Args{{"max_cached_hist_node", "1"}});
}

std::shared_ptr<DMatrix> Xy =
Expand Down
2 changes: 1 addition & 1 deletion tests/cpp/tree/test_evaluate_splits.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ class TestPartitionBasedSplit : public ::testing::Test {
cuts_.min_vals_.Resize(1);

HistMakerTrainParam hist_param;
hist_.Reset(cuts_.TotalBins(), hist_param.internal_max_cached_hist_node);
hist_.Reset(cuts_.TotalBins(), hist_param.max_cached_hist_node);
hist_.AllocateHistograms({0});
auto node_hist = hist_[0];

Expand Down