From 532583315cc5ca417211d4182155e5ee51ebfb06 Mon Sep 17 00:00:00 2001 From: Eric Sheng Date: Tue, 14 May 2024 16:44:32 -0700 Subject: [PATCH] [BACKPORT 2024.1][#21580] docdb: Filter intent SST files only retained for CDC Summary: Original commit: fb7c86c5e37ea7224eddf7edfe8664e662cb00dd / D33131 When CDC is lagging behind, there may be many SST files in intentsdb which only consist of applied transactions, but which we cannot yet delete, since CDC has not streamed the changes yet. These SST files impact performance of reading from intentsdb, even though we don't actually care about them in most cases (since all changes in them have already been applied). This diff adds a hybrid time filter on intent iterators for read path, conflict resolution, and intent apply, to skip all SST files before min running hybrid time. This is gated behind the newly added `docdb_ht_filter_intents` gflag (default on in debug). `docdb_ht_filter_intents` to be set to default on after CDC stress tests with D31900 / 559b2b06244ae9bc27ea5cca1d60fe36e3b8819e changes enabled as well. Jira: DB-10466 Test Plan: Jenkins. Reviewers: sergei, mbautin, rthallam Reviewed By: rthallam Subscribers: rthallam, bogdan, ybase, yql Tags: #jenkins-ready Differential Revision: https://phorge.dev.yugabyte.com/D34746 --- src/yb/ash/wait_state.cc | 4 ++-- src/yb/client/ql-stress-test.cc | 4 ++-- src/yb/docdb/conflict_resolution.cc | 21 ++++++++++----------- src/yb/docdb/doc_ql_filefilter.cc | 12 ++++++++++++ src/yb/docdb/doc_ql_filefilter.h | 5 +++++ src/yb/docdb/docdb_rocksdb_util.cc | 22 ++++++++++++++++++++++ src/yb/docdb/docdb_rocksdb_util.h | 7 +++++++ src/yb/docdb/intent_aware_iterator.cc | 16 ++++------------ src/yb/docdb/intent_iterator.cc | 15 +++------------ src/yb/docdb/rocksdb_writer.cc | 10 ++++++++-- src/yb/docdb/rocksdb_writer.h | 2 ++ src/yb/tablet/tablet.cc | 11 ++++++++--- src/yb/tablet/transaction_participant.cc | 6 +++--- src/yb/util/debug-util.h | 9 +-------- src/yb/util/debug.h | 23 +++++++++++++++++++++++ src/yb/util/stack_trace_tracker.cc | 4 ++-- 16 files changed, 114 insertions(+), 57 deletions(-) create mode 100644 src/yb/util/debug.h diff --git a/src/yb/ash/wait_state.cc b/src/yb/ash/wait_state.cc index d281e9c9c5f5..25730f5af409 100644 --- a/src/yb/ash/wait_state.cc +++ b/src/yb/ash/wait_state.cc @@ -53,9 +53,9 @@ DEFINE_RUNTIME_PG_PREVIEW_FLAG(bool, yb_enable_ash, false, "and various background activities. This does nothing if " "ysql_yb_enable_ash_infra is disabled."); -DEFINE_test_flag(bool, export_wait_state_names, yb::IsDebug(), +DEFINE_test_flag(bool, export_wait_state_names, yb::kIsDebug, "Exports wait-state name as a human understandable string."); -DEFINE_test_flag(bool, trace_ash_wait_code_updates, yb::IsDebug(), +DEFINE_test_flag(bool, trace_ash_wait_code_updates, yb::kIsDebug, "Add a trace line whenever the wait state code is updated."); DEFINE_test_flag(uint32, yb_ash_sleep_at_wait_state_ms, 0, "How long to sleep/delay when entering a particular wait state."); diff --git a/src/yb/client/ql-stress-test.cc b/src/yb/client/ql-stress-test.cc index 216e34c54487..c01eea5409a6 100644 --- a/src/yb/client/ql-stress-test.cc +++ b/src/yb/client/ql-stress-test.cc @@ -54,7 +54,7 @@ #include "yb/tserver/ts_tablet_manager.h" #include "yb/util/backoff_waiter.h" -#include "yb/util/debug-util.h" +#include "yb/util/debug.h" #include "yb/util/format.h" #include "yb/util/metrics.h" #include "yb/util/random_util.h" @@ -830,7 +830,7 @@ void QLStressTest::AddWriter( } void QLStressTest::TestWriteRejection() { - constexpr int kWriters = IsDebug() ? 10 : 20; + constexpr int kWriters = kIsDebug ? 10 : 20; constexpr int kKeyBase = 10000; std::array, kWriters> keys; diff --git a/src/yb/docdb/conflict_resolution.cc b/src/yb/docdb/conflict_resolution.cc index f6ab3fea6b6b..92bc12b8ae0b 100644 --- a/src/yb/docdb/conflict_resolution.cc +++ b/src/yb/docdb/conflict_resolution.cc @@ -219,7 +219,9 @@ class ConflictResolver : public std::enable_shared_from_this { // Reads conflicts for specified intent from DB. Status ReadIntentConflicts(IntentTypeSet type, KeyBytes* intent_key_prefix) { - EnsureIntentIteratorCreated(); + if (!CreateIntentIteratorIfNecessary()) { + return Status::OK(); + } const auto conflicting_intent_types = kIntentTypeSetConflicts[type.ToUIntPtr()]; @@ -303,17 +305,12 @@ class ConflictResolver : public std::enable_shared_from_this { return intent_iter_.status(); } - void EnsureIntentIteratorCreated() { + bool CreateIntentIteratorIfNecessary() { if (!intent_iter_.Initialized()) { - intent_iter_ = CreateRocksDBIterator( - doc_db_.intents, - doc_db_.key_bounds, - BloomFilterMode::DONT_USE_BLOOM_FILTER, - boost::none /* user_key_for_filter */, - rocksdb::kDefaultQueryId, - nullptr /* file_filter */, - &intent_key_upperbound_); + intent_iter_ = CreateIntentsIteratorWithHybridTimeFilter( + doc_db_.intents, &status_manager(), doc_db_.key_bounds, &intent_key_upperbound_); } + return intent_iter_.Initialized(); } Result GetLockStatusInfo() { @@ -1147,7 +1144,9 @@ class TransactionConflictResolverContext : public ConflictResolverContextBase { // This is to prevent the case when we create an iterator on the regular DB where a // provisional record has not yet been applied, and then create an iterator the intents // DB where the provisional record has already been removed. - resolver->EnsureIntentIteratorCreated(); + // Even in the case where there are no intents to iterate over, the following loop must be + // run, so we cannot return early if the following call returns false. + resolver->CreateIntentIteratorIfNecessary(); for (const auto& i : container) { const Slice intent_key = i.first.AsSlice(); diff --git a/src/yb/docdb/doc_ql_filefilter.cc b/src/yb/docdb/doc_ql_filefilter.cc index 2c70a4ad175d..f299f6827e7b 100644 --- a/src/yb/docdb/doc_ql_filefilter.cc +++ b/src/yb/docdb/doc_ql_filefilter.cc @@ -21,6 +21,11 @@ #include "yb/rocksdb/db/compaction.h" +#include "yb/util/debug.h" + +DEFINE_RUNTIME_bool(docdb_ht_filter_intents, yb::kIsDebug, + "Use hybrid time SST filter when scanning intents."); + namespace yb::docdb { rocksdb::UserBoundaryTag TagForRangeComponent(size_t index); @@ -147,4 +152,11 @@ std::shared_ptr CreateHybridTimeFileFilter(HybridTime m return std::make_shared(min_hybrid_time); } +std::shared_ptr CreateIntentHybridTimeFileFilter( + HybridTime min_running_ht) { + return GetAtomicFlag(&FLAGS_docdb_ht_filter_intents) && min_running_ht != HybridTime::kMin + ? std::make_shared(min_running_ht) + : nullptr; +} + } // namespace yb::docdb diff --git a/src/yb/docdb/doc_ql_filefilter.h b/src/yb/docdb/doc_ql_filefilter.h index f18c024b28a4..8a358008f89b 100644 --- a/src/yb/docdb/doc_ql_filefilter.h +++ b/src/yb/docdb/doc_ql_filefilter.h @@ -16,6 +16,7 @@ #pragma once #include "yb/common/hybrid_time.h" +#include "yb/common/transaction.h" #include "yb/qlexpr/qlexpr_fwd.h" @@ -25,5 +26,9 @@ namespace yb::docdb { std::shared_ptr CreateFileFilter(const qlexpr::YQLScanSpec& scan_spec); std::shared_ptr CreateHybridTimeFileFilter(HybridTime min_hybrid_Time); +// Create a file filter for intentsdb using the given min running hybrid time. Filtering is done +// based on intent hybrid time stored in the intent key, not commit time of the transaction. +std::shared_ptr CreateIntentHybridTimeFileFilter( + HybridTime min_running_ht); } // namespace yb::docdb diff --git a/src/yb/docdb/docdb_rocksdb_util.cc b/src/yb/docdb/docdb_rocksdb_util.cc index 7cad9b1eda49..e9ed28c6410d 100644 --- a/src/yb/docdb/docdb_rocksdb_util.cc +++ b/src/yb/docdb/docdb_rocksdb_util.cc @@ -316,6 +316,28 @@ unique_ptr CreateIntentAwareIterator( statistics ? statistics->IntentsDBStatistics() : nullptr); } +BoundedRocksDbIterator CreateIntentsIteratorWithHybridTimeFilter( + rocksdb::DB* intentsdb, + const TransactionStatusManager* status_manager, + const KeyBounds* docdb_key_bounds, + const Slice* iterate_upper_bound, + rocksdb::Statistics* statistics) { + auto min_running_ht = status_manager->MinRunningHybridTime(); + if (min_running_ht == HybridTime::kMax) { + VLOG(4) << "No transactions running"; + return {}; + } + return CreateRocksDBIterator( + intentsdb, + docdb_key_bounds, + docdb::BloomFilterMode::DONT_USE_BLOOM_FILTER, + boost::none /* user_key_for_filter */, + rocksdb::kDefaultQueryId, + CreateIntentHybridTimeFileFilter(min_running_ht), + iterate_upper_bound, + statistics); +} + namespace { std::mutex rocksdb_flags_mutex; diff --git a/src/yb/docdb/docdb_rocksdb_util.h b/src/yb/docdb/docdb_rocksdb_util.h index c69df190b950..0ff6eb332ed0 100644 --- a/src/yb/docdb/docdb_rocksdb_util.h +++ b/src/yb/docdb/docdb_rocksdb_util.h @@ -69,6 +69,13 @@ std::unique_ptr CreateIntentAwareIterator( const Slice* iterate_upper_bound = nullptr, const DocDBStatistics* statistics = nullptr); +BoundedRocksDbIterator CreateIntentsIteratorWithHybridTimeFilter( + rocksdb::DB* intentsdb, + const TransactionStatusManager* status_manager, + const KeyBounds* docdb_key_bounds, + const Slice* iterate_upper_bound = nullptr, + rocksdb::Statistics* statistics = nullptr); + std::shared_ptr CreateRocksDBPriorityThreadPoolMetrics( scoped_refptr entity); diff --git a/src/yb/docdb/intent_aware_iterator.cc b/src/yb/docdb/intent_aware_iterator.cc index 8d2c2765803f..2f356b09a213 100644 --- a/src/yb/docdb/intent_aware_iterator.cc +++ b/src/yb/docdb/intent_aware_iterator.cc @@ -19,6 +19,7 @@ #include "yb/common/hybrid_time.h" #include "yb/common/transaction.h" +#include "yb/docdb/doc_ql_filefilter.h" #include "yb/docdb/docdb_fwd.h" #include "yb/docdb/conflict_resolution.h" #include "yb/docdb/docdb-internal.h" @@ -139,18 +140,9 @@ IntentAwareIterator::IntentAwareIterator( << ", txn_op_context: " << txn_op_context_; if (txn_op_context) { - if (txn_op_context.txn_status_manager->MinRunningHybridTime() != HybridTime::kMax) { - intent_iter_ = docdb::CreateRocksDBIterator(doc_db.intents, - doc_db.key_bounds, - docdb::BloomFilterMode::DONT_USE_BLOOM_FILTER, - boost::none, - rocksdb::kDefaultQueryId, - nullptr /* file_filter */, - &intent_upperbound_, - intentsdb_statistics); - } else { - VLOG(4) << "No transactions running"; - } + intent_iter_ = docdb::CreateIntentsIteratorWithHybridTimeFilter( + doc_db.intents, txn_op_context.txn_status_manager, doc_db.key_bounds, &intent_upperbound_, + intentsdb_statistics); } // WARNING: Is is important for regular DB iterator to be created after intents DB iterator, // otherwise consistency could break, for example in following scenario: diff --git a/src/yb/docdb/intent_iterator.cc b/src/yb/docdb/intent_iterator.cc index d57eb9e20a93..e278f624c51c 100644 --- a/src/yb/docdb/intent_iterator.cc +++ b/src/yb/docdb/intent_iterator.cc @@ -14,6 +14,7 @@ #include "yb/docdb/intent_iterator.h" #include "yb/docdb/conflict_resolution.h" +#include "yb/docdb/doc_ql_filefilter.h" #include "yb/docdb/docdb-internal.h" #include "yb/docdb/docdb_rocksdb_util.h" #include "yb/docdb/iter_util.h" @@ -74,18 +75,8 @@ IntentIterator::IntentIterator( VLOG(4) << "IntentIterator, read_time: " << read_time << ", txn_op_context: " << txn_op_context_; if (txn_op_context) { - if (txn_op_context.txn_status_manager->MinRunningHybridTime() != HybridTime::kMax) { - intent_iter_ = docdb::CreateRocksDBIterator( - intents_db, - docdb_key_bounds, - docdb::BloomFilterMode::DONT_USE_BLOOM_FILTER, - boost::none, - rocksdb::kDefaultQueryId, - nullptr /* file_filter */, - &upperbound_); - } else { - VLOG(4) << "No transactions running"; - } + intent_iter_ = docdb::CreateIntentsIteratorWithHybridTimeFilter( + intents_db, txn_op_context.txn_status_manager, docdb_key_bounds, &upperbound_); } VTRACE(2, "Created intent iterator - initialized? - $0", intent_iter_.Initialized()); } diff --git a/src/yb/docdb/rocksdb_writer.cc b/src/yb/docdb/rocksdb_writer.cc index 212185d28aa1..913ee1eb3881 100644 --- a/src/yb/docdb/rocksdb_writer.cc +++ b/src/yb/docdb/rocksdb_writer.cc @@ -16,6 +16,7 @@ #include "yb/common/row_mark.h" #include "yb/docdb/conflict_resolution.h" +#include "yb/docdb/doc_ql_filefilter.h" #include "yb/docdb/docdb.messages.h" #include "yb/docdb/docdb_compaction_context.h" #include "yb/docdb/docdb_rocksdb_util.h" @@ -437,15 +438,18 @@ IntentsWriterContext::IntentsWriterContext(const TransactionId& transaction_id) } IntentsWriter::IntentsWriter(const Slice& start_key, + HybridTime file_filter_ht, rocksdb::DB* intents_db, IntentsWriterContext* context) : start_key_(start_key), intents_db_(intents_db), context_(*context) { AppendTransactionKeyPrefix(context_.transaction_id(), &txn_reverse_index_prefix_); txn_reverse_index_prefix_.AppendKeyEntryType(dockv::KeyEntryType::kMaxByte); reverse_index_upperbound_ = txn_reverse_index_prefix_.AsSlice(); + reverse_index_iter_ = CreateRocksDBIterator( intents_db_, &KeyBounds::kNoBounds, BloomFilterMode::DONT_USE_BLOOM_FILTER, boost::none, - rocksdb::kDefaultQueryId, nullptr /* read_filter */, &reverse_index_upperbound_); + rocksdb::kDefaultQueryId, CreateIntentHybridTimeFileFilter(file_filter_ht), + &reverse_index_upperbound_); } Status IntentsWriter::Apply(rocksdb::DirectWriteHandler* handler) { @@ -502,6 +506,7 @@ ApplyIntentsContext::ApplyIntentsContext( const SubtxnSet& aborted, HybridTime commit_ht, HybridTime log_ht, + HybridTime file_filter_ht, const KeyBounds* key_bounds, SchemaPackingProvider* schema_packing_provider, rocksdb::DB* intents_db) @@ -519,7 +524,8 @@ ApplyIntentsContext::ApplyIntentsContext( key_bounds_(key_bounds), intent_iter_(CreateRocksDBIterator( intents_db, key_bounds, BloomFilterMode::DONT_USE_BLOOM_FILTER, boost::none, - rocksdb::kDefaultQueryId)) { + rocksdb::kDefaultQueryId, + CreateIntentHybridTimeFileFilter(file_filter_ht))) { } Result ApplyIntentsContext::StoreApplyState( diff --git a/src/yb/docdb/rocksdb_writer.h b/src/yb/docdb/rocksdb_writer.h index 0293db5c4c28..5f555f7af435 100644 --- a/src/yb/docdb/rocksdb_writer.h +++ b/src/yb/docdb/rocksdb_writer.h @@ -177,6 +177,7 @@ class IntentsWriterContext { class IntentsWriter : public rocksdb::DirectWriter { public: IntentsWriter(const Slice& start_key, + HybridTime file_filter_ht, rocksdb::DB* intents_db, IntentsWriterContext* context); @@ -219,6 +220,7 @@ class ApplyIntentsContext : public IntentsWriterContext, public FrontierSchemaVe const SubtxnSet& aborted, HybridTime commit_ht, HybridTime log_ht, + HybridTime file_filter_ht, const KeyBounds* key_bounds, SchemaPackingProvider* schema_packing_provider, rocksdb::DB* intents_db); diff --git a/src/yb/tablet/tablet.cc b/src/yb/tablet/tablet.cc index ad1b7f637bef..723d4d8e20b1 100644 --- a/src/yb/tablet/tablet.cc +++ b/src/yb/tablet/tablet.cc @@ -2025,6 +2025,8 @@ Status Tablet::ImportData(const std::string& source_dir) { Result Tablet::ApplyIntents(const TransactionApplyData& data) { VLOG_WITH_PREFIX(4) << __func__ << ": " << data.transaction_id; + HybridTime min_running_ht = transaction_participant_->MinRunningHybridTime(); + // This flag enables tests to induce a situation where a transaction has committed but its intents // haven't yet moved to regular db for a sufficiently long period. For example, it can help a test // to reliably assert that conflict resolution/ concurrency control with a conflicting committed @@ -2033,9 +2035,10 @@ Result Tablet::ApplyIntents(const TransactionApply AtomicFlagSleepMs(&FLAGS_TEST_inject_sleep_before_applying_intents_ms); docdb::ApplyIntentsContext context( data.transaction_id, data.apply_state, data.aborted, data.commit_ht, data.log_ht, - &key_bounds_, metadata_.get(), intents_db_.get()); + min_running_ht, &key_bounds_, metadata_.get(), intents_db_.get()); docdb::IntentsWriter intents_writer( - data.apply_state ? data.apply_state->key : Slice(), intents_db_.get(), &context); + data.apply_state ? data.apply_state->key : Slice(), min_running_ht, + intents_db_.get(), &context); rocksdb::WriteBatch regular_write_batch; regular_write_batch.SetDirectWriter(&intents_writer); // data.hybrid_time contains transaction commit time. @@ -2057,12 +2060,14 @@ Status Tablet::RemoveIntentsImpl( RETURN_NOT_OK(scoped_read_operation); rocksdb::WriteBatch intents_write_batch; + HybridTime min_running_ht = CHECK_NOTNULL(transaction_participant_)->MinRunningHybridTime(); for (const auto& id : ids) { boost::optional apply_state; for (;;) { docdb::RemoveIntentsContext context(id, static_cast(reason)); docdb::IntentsWriter writer( - apply_state ? apply_state->key : Slice(), intents_db_.get(), &context); + apply_state ? apply_state->key : Slice(), min_running_ht, + intents_db_.get(), &context); intents_write_batch.SetDirectWriter(&writer); docdb::ConsensusFrontiers frontiers; auto frontiers_ptr = InitFrontiers(data, &frontiers); diff --git a/src/yb/tablet/transaction_participant.cc b/src/yb/tablet/transaction_participant.cc index 336e3cba874b..8a49540a6a3c 100644 --- a/src/yb/tablet/transaction_participant.cc +++ b/src/yb/tablet/transaction_participant.cc @@ -52,7 +52,7 @@ #include "yb/util/async_util.h" #include "yb/util/callsite_profiling.h" #include "yb/util/countdown_latch.h" -#include "yb/util/debug-util.h" +#include "yb/util/debug.h" #include "yb/util/flags.h" #include "yb/util/format.h" #include "yb/util/logging.h" @@ -105,11 +105,11 @@ DEFINE_NON_RUNTIME_int32(wait_queue_poll_interval_ms, 100, "active blockers."); // TODO: this should be turned into an autoflag. -DEFINE_RUNTIME_bool(cdc_write_post_apply_metadata, yb::IsDebug(), +DEFINE_RUNTIME_bool(cdc_write_post_apply_metadata, yb::kIsDebug, "Write post-apply transaction metadata to intentsdb for transaction that have been applied but " " have not yet been streamed by CDC."); -DEFINE_RUNTIME_bool(cdc_immediate_transaction_cleanup, yb::IsDebug(), +DEFINE_RUNTIME_bool(cdc_immediate_transaction_cleanup, yb::kIsDebug, "Clean up transactions from memory after apply, even if its changes have not yet been " "streamed by CDC."); diff --git a/src/yb/util/debug-util.h b/src/yb/util/debug-util.h index 95fbcc9928a3..c1f19b54c89c 100644 --- a/src/yb/util/debug-util.h +++ b/src/yb/util/debug-util.h @@ -39,6 +39,7 @@ #include "yb/gutil/strings/fastmem.h" +#include "yb/util/debug.h" #include "yb/util/enums.h" #include "yb/util/slice.h" #include "yb/util/stack_trace.h" @@ -103,14 +104,6 @@ std::string GetLogFormatStackTraceHex(); // may invoke the dynamic loader. void HexStackTraceToString(char* buf, size_t size); -constexpr bool IsDebug() { -#ifdef NDEBUG - return false; -#else - return true; -#endif -} - class NODISCARD_CLASS ScopeLogger { public: ScopeLogger(const std::string& msg, std::function on_scope_bounds); diff --git a/src/yb/util/debug.h b/src/yb/util/debug.h new file mode 100644 index 000000000000..09f366d2a4e4 --- /dev/null +++ b/src/yb/util/debug.h @@ -0,0 +1,23 @@ +// Copyright (c) YugaByte, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software distributed under the License +// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +// or implied. See the License for the specific language governing permissions and limitations +// under the License. +// +#pragma once + +namespace yb { + +#ifdef NDEBUG +constexpr bool kIsDebug = false; +#else +constexpr bool kIsDebug = true; +#endif + +} // namespace yb diff --git a/src/yb/util/stack_trace_tracker.cc b/src/yb/util/stack_trace_tracker.cc index c21b771db21c..70233216c797 100644 --- a/src/yb/util/stack_trace_tracker.cc +++ b/src/yb/util/stack_trace_tracker.cc @@ -20,12 +20,12 @@ #include "yb/gutil/thread_annotations.h" #include "yb/util/atomic.h" -#include "yb/util/debug-util.h" +#include "yb/util/debug.h" #include "yb/util/flags.h" #include "yb/util/stack_trace.h" #include "yb/util/unique_lock.h" -DEFINE_RUNTIME_bool(track_stack_traces, yb::IsDebug(), "Whether to enable stack trace tracking"); +DEFINE_RUNTIME_bool(track_stack_traces, yb::kIsDebug, "Whether to enable stack trace tracking"); namespace yb {