From 61809fe5d6c35d47b9991e5e2e245348acc62ccb Mon Sep 17 00:00:00 2001 From: sdong Date: Wed, 25 May 2022 12:43:48 -0700 Subject: [PATCH] FindObsoleteFiles() to directly check whether candidate files are live (#10040) Summary: Right now, in FindObsoleteFiles() we build a list of all live SST files from all existing Versions. This is all done in DB mutex, and is O(m*n) where m is number of versions and n is number of files. In some extereme cases, it can take very long. The list is used to see whether a candidate file still shows up in a version. With this commit, every candidate file is directly check against all the versions for file existance. This operation would be O(m*k) where k is number of candidate files. Since is usually small (except perhaps full compaction in universal compaction), this is usually much faster than the previous solution. Pull Request resolved: https://github.com/facebook/rocksdb/pull/10040 Test Plan: TBD Reviewed By: riversand963 Differential Revision: D36613391 fbshipit-source-id: 3f13b090f755d9b3ae417faec62cd6e798bac1eb Signed-off-by: tabokie --- HISTORY.md | 3 + db/db_impl/db_impl_files.cc | 16 ++++- db/version_set.cc | 52 ++++++++++++++++ db/version_set.h | 117 +++++++++++++++++++++++------------- 4 files changed, 142 insertions(+), 46 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 523ef4b8113..8964e1eca25 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -5,6 +5,9 @@ * Remove DBOptions::preserved_deletes and DB::SetPreserveDeletesSequenceNumber(). * Removed timestamp from WriteOptions. Accordingly, added to DB APIs Put, Delete, SingleDelete, etc. accepting an additional argument 'timestamp'. Added Put, Delete, SingleDelete, etc to WriteBatch accepting an additional argument 'timestamp'. Removed WriteBatch::AssignTimestamps(vector) API. Renamed WriteBatch::AssignTimestamp() to WriteBatch::UpdateTimestamps() with clarified comments. +### Performance Improvements +* Reduce DB mutex holding time when finding obsolete files to delete. When a file is trivial moved to another level, the internal files will be referenced twice internally and sometimes opened twice too. If a deletion candidate file is not the last reference, we need to destroy the reference and close the file but not deleting the file. Right now we determine it by building a set of all live files. With the improvement, we check the file against all live LSM-tree versions instead. + ## New Features * Improved the SstDumpTool to read the comparator from table properties and use it to read the SST File. diff --git a/db/db_impl/db_impl_files.cc b/db/db_impl/db_impl_files.cc index 1790ed836f4..b6dffb157d8 100644 --- a/db/db_impl/db_impl_files.cc +++ b/db/db_impl/db_impl_files.cc @@ -166,8 +166,8 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force, job_context->log_number = MinLogNumberToKeep(); job_context->prev_log_number = versions_->prev_log_number(); - versions_->AddLiveFiles(&job_context->sst_live, &job_context->blob_live); if (doing_the_full_scan) { + versions_->AddLiveFiles(&job_context->sst_live, &job_context->blob_live); InfoLogPrefix info_log_prefix(!immutable_db_options_.db_log_dir.empty(), dbname_); std::set paths; @@ -242,6 +242,14 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force, log_file, immutable_db_options_.db_log_dir); } } + } else { + // Instead of filling ob_context->sst_live and job_context->blob_live, + // directly remove files that show up in any Version. This is because + // candidate files tend to be a small percentage of all files, so it is + // usually cheaper to check them against every version, compared to + // building a map for all files. + versions_->RemoveLiveFiles(job_context->sst_delete_files, + job_context->blob_delete_files); } // logs_ is empty when called during recovery, in which case there can't yet @@ -395,8 +403,10 @@ void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) { state.manifest_delete_files.size()); // We may ignore the dbname when generating the file names. for (auto& file : state.sst_delete_files) { - candidate_files.emplace_back( - MakeTableFileName(file.metadata->fd.GetNumber()), file.path); + if (!file.only_delete_metadata) { + candidate_files.emplace_back( + MakeTableFileName(file.metadata->fd.GetNumber()), file.path); + } if (file.metadata->table_reader_handle) { table_cache_->Release(file.metadata->table_reader_handle); } diff --git a/db/version_set.cc b/db/version_set.cc index 53239d8b267..7feac9767f7 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -3913,6 +3913,26 @@ void Version::AddLiveFiles(std::vector* live_table_files, } } +void Version::RemoveLiveFiles( + std::vector& sst_delete_candidates, + std::vector& blob_delete_candidates) const { + for (ObsoleteFileInfo& fi : sst_delete_candidates) { + if (!fi.only_delete_metadata && + storage_info()->GetFileLocation(fi.metadata->fd.GetNumber()) != + VersionStorageInfo::FileLocation::Invalid()) { + fi.only_delete_metadata = true; + } + } + + blob_delete_candidates.erase( + std::remove_if( + blob_delete_candidates.begin(), blob_delete_candidates.end(), + [this](ObsoleteBlobFileInfo& x) { + return storage_info()->GetBlobFileMetaData(x.GetBlobFileNumber()); + }), + blob_delete_candidates.end()); +} + std::string Version::DebugString(bool hex, bool print_stats) const { std::string r; for (int level = 0; level < storage_info_.num_levels_; level++) { @@ -5674,6 +5694,38 @@ uint64_t VersionSet::ApproximateSize(Version* v, const FdWithKeyRange& f, v->GetMutableCFOptions().prefix_extractor); } +void VersionSet::RemoveLiveFiles( + std::vector& sst_delete_candidates, + std::vector& blob_delete_candidates) const { + assert(column_family_set_); + for (auto cfd : *column_family_set_) { + assert(cfd); + if (!cfd->initialized()) { + continue; + } + + auto* current = cfd->current(); + bool found_current = false; + + Version* const dummy_versions = cfd->dummy_versions(); + assert(dummy_versions); + + for (Version* v = dummy_versions->next_; v != dummy_versions; + v = v->next_) { + v->RemoveLiveFiles(sst_delete_candidates, blob_delete_candidates); + if (v == current) { + found_current = true; + } + } + + if (!found_current && current != nullptr) { + // Should never happen unless it is a bug. + assert(false); + current->RemoveLiveFiles(sst_delete_candidates, blob_delete_candidates); + } + } +} + void VersionSet::AddLiveFiles(std::vector* live_table_files, std::vector* live_blob_files) const { assert(live_table_files); diff --git a/db/version_set.h b/db/version_set.h index 7c64d67b708..223c8042520 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -354,6 +354,21 @@ class VersionStorageInfo { using BlobFiles = std::map>; const BlobFiles& GetBlobFiles() const { return blob_files_; } + // REQUIRES: This version has been saved (see VersionBuilder::SaveTo) + std::shared_ptr GetBlobFileMetaData( + uint64_t blob_file_number) const { + const auto it = blob_files_.find(blob_file_number); + + if (it == blob_files_.end()) { + return nullptr; + } + + const auto& meta = it->second; + assert(meta); + + return meta; + } + uint64_t GetTotalBlobFileSize() const { uint64_t total_blob_bytes = 0; @@ -658,6 +673,55 @@ class VersionStorageInfo { friend class VersionSet; }; +struct ObsoleteFileInfo { + FileMetaData* metadata; + std::string path; + // If true, the FileMataData should be destroyed but the file should + // not be deleted. This is because another FileMetaData still references + // the file, usually because the file is trivial moved so two FileMetadata + // is managing the file. + bool only_delete_metadata = false; + + ObsoleteFileInfo() noexcept + : metadata(nullptr), only_delete_metadata(false) {} + ObsoleteFileInfo(FileMetaData* f, const std::string& file_path) + : metadata(f), path(file_path), only_delete_metadata(false) {} + + ObsoleteFileInfo(const ObsoleteFileInfo&) = delete; + ObsoleteFileInfo& operator=(const ObsoleteFileInfo&) = delete; + + ObsoleteFileInfo(ObsoleteFileInfo&& rhs) noexcept : ObsoleteFileInfo() { + *this = std::move(rhs); + } + + ObsoleteFileInfo& operator=(ObsoleteFileInfo&& rhs) noexcept { + path = std::move(rhs.path); + metadata = rhs.metadata; + rhs.metadata = nullptr; + only_delete_metadata = rhs.only_delete_metadata; + rhs.only_delete_metadata = false; + + return *this; + } + void DeleteMetadata() { + delete metadata; + metadata = nullptr; + } +}; + +class ObsoleteBlobFileInfo { + public: + ObsoleteBlobFileInfo(uint64_t blob_file_number, std::string path) + : blob_file_number_(blob_file_number), path_(std::move(path)) {} + + uint64_t GetBlobFileNumber() const { return blob_file_number_; } + const std::string& GetPath() const { return path_; } + + private: + uint64_t blob_file_number_; + std::string path_; +}; + using MultiGetRange = MultiGetContext::Range; // A column family's version consists of the table and blob files owned by // the column family at a certain point in time. @@ -759,6 +823,11 @@ class Version { void AddLiveFiles(std::vector* live_table_files, std::vector* live_blob_files) const; + // Remove live files that are in the delete candidate lists. + void RemoveLiveFiles( + std::vector& sst_delete_candidates, + std::vector& blob_delete_candidates) const; + // Return a human readable string that describes this version's contents. std::string DebugString(bool hex = false, bool print_stats = false) const; @@ -888,49 +957,6 @@ class Version { void operator=(const Version&) = delete; }; -struct ObsoleteFileInfo { - FileMetaData* metadata; - std::string path; - - ObsoleteFileInfo() noexcept : metadata(nullptr) {} - ObsoleteFileInfo(FileMetaData* f, const std::string& file_path) - : metadata(f), path(file_path) {} - - ObsoleteFileInfo(const ObsoleteFileInfo&) = delete; - ObsoleteFileInfo& operator=(const ObsoleteFileInfo&) = delete; - - ObsoleteFileInfo(ObsoleteFileInfo&& rhs) noexcept : - ObsoleteFileInfo() { - *this = std::move(rhs); - } - - ObsoleteFileInfo& operator=(ObsoleteFileInfo&& rhs) noexcept { - path = std::move(rhs.path); - metadata = rhs.metadata; - rhs.metadata = nullptr; - - return *this; - } - - void DeleteMetadata() { - delete metadata; - metadata = nullptr; - } -}; - -class ObsoleteBlobFileInfo { - public: - ObsoleteBlobFileInfo(uint64_t blob_file_number, std::string path) - : blob_file_number_(blob_file_number), path_(std::move(path)) {} - - uint64_t GetBlobFileNumber() const { return blob_file_number_; } - const std::string& GetPath() const { return path_; } - - private: - uint64_t blob_file_number_; - std::string path_; -}; - class BaseReferencedVersionBuilder; class AtomicGroupReadBuffer { @@ -1245,6 +1271,11 @@ class VersionSet { void AddLiveFiles(std::vector* live_table_files, std::vector* live_blob_files) const; + // Remove live files that are in the delete candidate lists. + void RemoveLiveFiles( + std::vector& sst_delete_candidates, + std::vector& blob_delete_candidates) const; + // Return the approximate size of data to be scanned for range [start, end) // in levels [start_level, end_level). If end_level == -1 it will search // through all non-empty levels