diff --git a/demo/guide-python/external_memory.py b/demo/guide-python/external_memory.py index 2f79111867f8..fa54d184814a 100644 --- a/demo/guide-python/external_memory.py +++ b/demo/guide-python/external_memory.py @@ -75,21 +75,22 @@ def reset(self) -> None: def main(tmpdir: str) -> xgboost.Booster: # generate some random data for demo - files = make_batches(2 ** 16, 17, 31, tmpdir) + files = make_batches(1024, 17, 31, tmpdir) it = Iterator(files) # For non-data arguments, specify it here once instead of passing them by the `next` # method. missing = np.NaN Xy = xgboost.DMatrix(it, missing=missing, enable_categorical=False) - # Other tree methods including ``hist`` and ``gpu_hist`` also work, see tutorial in - # doc for details. + # Other tree methods including ``approx``, ``hist``, and ``gpu_hist`` are supported, + # see tutorial in doc for details. booster = xgboost.train( - {"tree_method": "gpu_hist", "max_depth": 6, "sampling_method": "gradient_based", "subsample": 0.5}, + {"tree_method": "hist", "max_depth": 4}, Xy, evals=[(Xy, "Train")], - num_boost_round=2, + num_boost_round=10, ) + return booster if __name__ == "__main__": diff --git a/doc/tutorials/external_memory.rst b/doc/tutorials/external_memory.rst index bfa173384c8d..31d2cf8657ac 100644 --- a/doc/tutorials/external_memory.rst +++ b/doc/tutorials/external_memory.rst @@ -10,8 +10,9 @@ not supported by ``exact`` tree method. .. warning:: - The implementation of external memory uses ``mmap`` and is not tested against errors - like disconnected network devices. (`SIGBUS`) + The implementation of external memory uses ``mmap`` and is not tested against system + errors like disconnected network devices (`SIGBUS`). In addition, Windows is not yet + supported. .. note:: diff --git a/src/common/io.cc b/src/common/io.cc index a560c94a1d89..f9756c63c42d 100644 --- a/src/common/io.cc +++ b/src/common/io.cc @@ -170,8 +170,7 @@ void* PrivateMmapStream::Open(StringView path, bool read_only, std::size_t offse #if defined(__linux__) || defined(__GLIBC__) ptr = reinterpret_cast(mmap64(nullptr, length, prot, MAP_PRIVATE, fd_, offset)); #elif defined(_MSC_VER) - // fixme: not yet implemented - ptr = reinterpret_cast(mmap(nullptr, length, prot, MAP_PRIVATE, fd_, offset)); + LOG(FATAL) << "External memory is not implemented for Windows."; #else CHECK_LE(offset, std::numeric_limits::max()) << "File size has exceeded the limit on the current system."; diff --git a/src/data/sparse_page_source.cc b/src/data/sparse_page_source.cc new file mode 100644 index 000000000000..0cc34900f067 --- /dev/null +++ b/src/data/sparse_page_source.cc @@ -0,0 +1,20 @@ +/** + * Copyright 2023, XGBoost Contributors + */ +#include "sparse_page_source.h" + +#include // for getpagesize + +namespace xgboost::data { +std::size_t PadPageForMMAP(std::size_t file_bytes, dmlc::Stream* fo) { + decltype(file_bytes) page_size = getpagesize(); + CHECK(page_size != 0 && page_size % 2 == 0) << "Failed to get page size on the current system."; + CHECK_NE(file_bytes, 0) << "Empty page encountered."; + auto n = file_bytes / page_size; + auto padded = (n + !!(file_bytes % page_size != 0)) * page_size; + auto padding = padded - file_bytes; + std::vector padding_bytes(padding, 0); + fo->Write(padding_bytes.data(), padding_bytes.size()); + return padded; +} +} // namespace xgboost::data diff --git a/src/data/sparse_page_source.h b/src/data/sparse_page_source.h index b07178c496f3..e27c9e918bdc 100644 --- a/src/data/sparse_page_source.h +++ b/src/data/sparse_page_source.h @@ -5,8 +5,6 @@ #ifndef XGBOOST_DATA_SPARSE_PAGE_SOURCE_H_ #define XGBOOST_DATA_SPARSE_PAGE_SOURCE_H_ -#include // for getpagesize - #include // for min #include #include @@ -34,6 +32,16 @@ inline void TryDeleteCacheFile(const std::string& file) { } } +/** + * @brief Pad the output file for a page to make it mmap compatible. + * + * @param file_bytes The size of the output file + * @param fo Stream used to write the file. + * + * @return The file size after being padded. + */ +std::size_t PadPageForMMAP(std::size_t file_bytes, dmlc::Stream* fo); + struct Cache { // whether the write to the cache is complete bool written; @@ -41,7 +49,6 @@ struct Cache { std::string format; // offset into binary cache file. std::vector offset; - std::vector bytes; Cache(bool w, std::string n, std::string fmt) : written{w}, name{std::move(n)}, format{std::move(fmt)} { @@ -57,7 +64,6 @@ struct Cache { return ShardName(this->name, this->format); } void Push(std::size_t n_bytes) { - bytes.push_back(n_bytes); offset.push_back(n_bytes); } @@ -139,7 +145,7 @@ class SparsePageSourceImpl : public BatchIteratorImpl { auto n = self->cache_info_->ShardName(); std::uint64_t offset = self->cache_info_->offset.at(fetch_it); - std::uint64_t length = self->cache_info_->bytes.at(fetch_it); + std::uint64_t length = self->cache_info_->offset.at(fetch_it + 1) - offset; auto fi = std::make_unique(n, true, offset, length); CHECK(fmt->Read(page.get(), fi.get())); @@ -151,6 +157,7 @@ class SparsePageSourceImpl : public BatchIteratorImpl { n_prefetch_batches) << "Sparse DMatrix assumes forward iteration."; page_ = (*ring_)[count_].get(); + CHECK(!(*ring_)[count_].valid()); return true; } @@ -169,18 +176,9 @@ class SparsePageSourceImpl : public BatchIteratorImpl { } auto bytes = fmt->Write(*page_, fo.get()); - - // align for mmap - decltype(bytes) page_size = getpagesize(); - CHECK(page_size != 0 && page_size % 2 == 0) << "Failed to get page size on the current system."; - auto n = bytes / page_size; - auto padded = (n + 1) * page_size; - auto padding = padded - bytes; - std::vector padding_bytes(padding, 0); - fo->Write(padding_bytes.data(), padding_bytes.size()); + auto padded = PadPageForMMAP(bytes, fo.get()); timer.Stop(); - LOG(INFO) << static_cast(bytes) / 1024.0 / 1024.0 << " MB written in " << timer.ElapsedSeconds() << " seconds."; cache_info_->Push(padded); @@ -280,6 +278,7 @@ class SparsePageSource : public SparsePageSourceImpl { } if (at_end_) { + CHECK_EQ(cache_info_->offset.size(), n_batches_ + 1); cache_info_->Commit(); if (n_batches_ != 0) { CHECK_EQ(count_, n_batches_);