Skip to content

Commit

Permalink
chore: change how we track memory_budget during evictions (#3457)
Browse files Browse the repository at this point in the history
* chore: change how we track memory_budget during evictions

We compared memory_budget vs 0 before inserting a new item in DbSlice,
and retired cool pages if we are low on memory.

The problem - when we decide whether we allow growing a table, we estimate the possible object size increase due to the future table growth.
And the memory check described before was not consistent with the actual logic that rejected the insertion.

Moreover, the memory_budget tracking interaction with EvictionPolicy was over-complicated: we passed the memory_budget counter to the evp object
and then read it back, even though evp did not track object deletions memory impact during objects evictions.

Now, we remove the responsibility from evp to update memory_budget_ so it's solely updated by DbSlice.
We also update memory_budget_ during deletions, and when we pass it to evp, we add cool memory size as potential memory resource to avoid
rejections in case we have lots of cool memory.

Fixes #3456
---------

Signed-off-by: Roman Gershman <[email protected]>
  • Loading branch information
romange authored Aug 7, 2024
1 parent 41be819 commit 7c84b8e
Show file tree
Hide file tree
Showing 4 changed files with 51 additions and 42 deletions.
3 changes: 2 additions & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@ jobs:
df -h
echo "-----------------------------"
ninja src/all
- name: PostFail
if: failure()
run: |
Expand Down Expand Up @@ -155,7 +156,7 @@ jobs:
timeout 5m ./multi_test --multi_exec_mode=1
timeout 5m ./multi_test --multi_exec_mode=3
timeout 5m ./json_family_test --jsonpathv2=false
timeout 5m ./tiered_storage_test --vmodule=db_slice=2 --logtostderr
- name: Upload unit logs on failure
if: failure()
uses: actions/upload-artifact@v4
Expand Down
85 changes: 47 additions & 38 deletions src/server/db_slice.cc
Original file line number Diff line number Diff line change
Expand Up @@ -73,10 +73,12 @@ class PrimeEvictionPolicy {
static constexpr bool can_evict = true; // we implement eviction functionality.
static constexpr bool can_gc = true;

PrimeEvictionPolicy(const DbContext& cntx, bool can_evict, ssize_t mem_budget, ssize_t soft_limit,
// mem_offset - memory_offset that we should account for in addition to DbSlice::memory_budget.
// May be negative.
PrimeEvictionPolicy(const DbContext& cntx, bool can_evict, ssize_t mem_offset, ssize_t soft_limit,
DbSlice* db_slice, bool apply_memory_limit)
: db_slice_(db_slice),
mem_budget_(mem_budget),
mem_offset_(mem_offset),
soft_limit_(soft_limit),
cntx_(cntx),
can_evict_(can_evict),
Expand All @@ -85,7 +87,6 @@ class PrimeEvictionPolicy {

// A hook function that is called every time a segment is full and requires splitting.
void RecordSplit(PrimeTable::Segment_t* segment) {
mem_budget_ -= PrimeTable::kSegBytes;
DVLOG(2) << "split: " << segment->SlowSize() << "/" << segment->capacity();
}

Expand All @@ -94,10 +95,6 @@ class PrimeEvictionPolicy {
unsigned GarbageCollect(const PrimeTable::HotspotBuckets& eb, PrimeTable* me);
unsigned Evict(const PrimeTable::HotspotBuckets& eb, PrimeTable* me);

ssize_t mem_budget() const {
return mem_budget_;
}

unsigned evicted() const {
return evicted_;
}
Expand All @@ -108,7 +105,7 @@ class PrimeEvictionPolicy {

private:
DbSlice* db_slice_;
ssize_t mem_budget_;
ssize_t mem_offset_;
ssize_t soft_limit_ = 0;
const DbContext cntx_;

Expand Down Expand Up @@ -136,7 +133,8 @@ class PrimeBumpPolicy {
};

bool PrimeEvictionPolicy::CanGrow(const PrimeTable& tbl) const {
if (!apply_memory_limit_ || mem_budget_ > soft_limit_)
ssize_t mem_available = db_slice_->memory_budget() + mem_offset_;
if (!apply_memory_limit_ || mem_available > soft_limit_)
return true;

DCHECK_LE(tbl.size(), tbl.capacity());
Expand All @@ -145,11 +143,11 @@ bool PrimeEvictionPolicy::CanGrow(const PrimeTable& tbl) const {
// we estimate how much memory we will take with the current capacity
// even though we may currently use less memory.
// see https://github.com/dragonflydb/dragonfly/issues/256#issuecomment-1227095503
size_t new_available = (tbl.capacity() - tbl.size()) + PrimeTable::kSegCapacity;
bool res =
mem_budget_ > int64_t(PrimeTable::kSegBytes + db_slice_->bytes_per_object() * new_available *
GetFlag(FLAGS_table_growth_margin));
VLOG(2) << "available: " << new_available << ", res: " << res;
size_t table_free_items = (tbl.capacity() - tbl.size()) + PrimeTable::kSegCapacity;
size_t obj_bytes_estimation =
db_slice_->bytes_per_object() * table_free_items * GetFlag(FLAGS_table_growth_margin);
bool res = mem_available > int64_t(PrimeTable::kSegBytes + obj_bytes_estimation);
VLOG(2) << "available: " << table_free_items << ", res: " << res;

return res;
}
Expand Down Expand Up @@ -575,12 +573,21 @@ OpResult<DbSlice::AddOrFindResult> DbSlice::AddOrFindInternal(const Context& cnt
// It's a new entry.
CallChangeCallbacks(cntx.db_index, key, {key});

ssize_t memory_offset = -key.size();

// If we are low on memory due to cold storage, free some memory.
if (memory_budget_ < ssize_t(key.size()) && owner_->tiered_storage()) {
if (owner_->tiered_storage()) {
// At least 40KB bytes to cover potential segment split.
size_t goal = std::max<size_t>(key.size() * 2 - memory_budget_, 40_KB);
size_t reclaimed = owner_->tiered_storage()->ReclaimMemory(goal);
memory_budget_ += reclaimed;
ssize_t red_line = std::max<size_t>(key.size() * 2, 40_KB);
if (memory_budget_ < red_line) {
size_t goal = red_line - memory_budget_;
size_t reclaimed = owner_->tiered_storage()->ReclaimMemory(goal);
memory_budget_ += reclaimed;
}

// CoolMemoryUsage is the memory that we can always reclaim, like in the block above,
// therefore we include it for PrimeEvictionPolicy considerations.
memory_offset += owner_->tiered_storage()->CoolMemoryUsage();
}

// In case we are loading from rdb file or replicating we want to disable conservative memory
Expand All @@ -594,37 +601,36 @@ OpResult<DbSlice::AddOrFindResult> DbSlice::AddOrFindInternal(const Context& cnt
bool apply_memory_limit =
!owner_->IsReplica() && !(ServerState::tlocal()->gstate() == GlobalState::LOADING);

PrimeEvictionPolicy evp{cntx,
(bool(caching_mode_) && !owner_->IsReplica()),
int64_t(memory_budget_ - key.size()),
ssize_t(soft_budget_limit_),
this,
apply_memory_limit};

// If we are over limit in non-cache scenario, just be conservative and throw.
if (apply_memory_limit && !caching_mode_ && evp.mem_budget() < 0) {
VLOG(2) << "AddOrFind: over limit, budget: " << evp.mem_budget();
if (apply_memory_limit && !caching_mode_ && memory_budget_ + memory_offset < 0) {
VLOG(2) << "AddOrFind: over limit, budget: " << memory_budget_;
events_.insertion_rejections++;
return OpStatus::OUT_OF_MEMORY;
}

PrimeEvictionPolicy evp{cntx, (bool(caching_mode_) && !owner_->IsReplica()),
memory_offset, ssize_t(soft_budget_limit_),
this, apply_memory_limit};

// Fast-path if change_cb_ is empty so we Find or Add using
// the insert operation: twice more efficient.
CompactObj co_key{key};
PrimeIterator it;

size_t table_before = db.table_memory();
ssize_t table_before = db.prime.mem_usage();

try {
it = db.prime.InsertNew(std::move(co_key), PrimeValue{}, evp);
} catch (bad_alloc& e) {
VLOG(2) << "AddOrFind2: bad alloc exception, budget: " << evp.mem_budget();
VLOG(2) << "AddOrFind: bad alloc exception, budget: " << memory_budget_ + memory_offset;
events_.insertion_rejections++;
return OpStatus::OUT_OF_MEMORY;
}

size_t evicted_obj_bytes = 0;
if (evp.mem_budget() < 0 && apply_memory_limit) {
ssize_t table_increase = db.prime.mem_usage() - table_before;
memory_budget_ -= table_increase;

if (memory_budget_ < 0 && apply_memory_limit) {
// We may reach the state when our memory usage is below the limit even if we
// do not add new segments. For example, we have half full segments
// and we add new objects or update the existing ones and our memory usage grows.
Expand All @@ -636,12 +642,11 @@ OpResult<DbSlice::AddOrFindResult> DbSlice::AddOrFindInternal(const Context& cnt
#if 0
size_t evict_goal = std::max<size_t>(512, (-evp.mem_budget()) / 32);
auto [items, bytes] = FreeMemWithEvictionStep(cntx.db_index, it.segment_id(), evict_goal);
evicted_obj_bytes = bytes;
events_.hard_evictions += items;
#endif
}

table_memory_ += (db.table_memory() - table_before);
table_memory_ += table_increase;
entries_count_++;

db.stats.inline_keys += it->first.IsInline();
Expand All @@ -654,8 +659,6 @@ OpResult<DbSlice::AddOrFindResult> DbSlice::AddOrFindInternal(const Context& cnt
events_.stash_unloaded = db.prime.stash_unloaded();
events_.evicted_keys += evp.evicted();
events_.garbage_checked += evp.checked();

memory_budget_ = evp.mem_budget() + evicted_obj_bytes;
if (cluster::IsClusterEnabled()) {
cluster::SlotId sid = cluster::KeySlot(key);
db.slots_stats[sid].key_count += 1;
Expand Down Expand Up @@ -1470,9 +1473,9 @@ void DbSlice::PerformDeletion(Iterator del_it, ExpIterator exp_it, DbTable* tabl
shard_owner()->tiered_storage()->Delete(table->index, &del_it->second);
}

size_t value_heap_size = pv.MallocUsed();
ssize_t value_heap_size = pv.MallocUsed(), key_size_used = del_it->first.MallocUsed();
stats.inline_keys -= del_it->first.IsInline();
AccountObjectMemory(del_it.key(), del_it->first.ObjType(), -del_it->first.MallocUsed(),
AccountObjectMemory(del_it.key(), del_it->first.ObjType(), -key_size_used,
table); // Key
AccountObjectMemory(del_it.key(), pv.ObjType(), -value_heap_size, table); // Value
if (pv.ObjType() == OBJ_HASH && pv.Encoding() == kEncodingListPack) {
Expand All @@ -1487,8 +1490,14 @@ void DbSlice::PerformDeletion(Iterator del_it, ExpIterator exp_it, DbTable* tabl
}

table->prime.Erase(del_it.GetInnerIt());
table_memory_ += (table->table_memory() - table_before);

// Note, currently we do not shrink our tables upon deletion.
// This DCHECK ensures that if we decide to do so, we will have to update table_memory_
// accordingly.
DCHECK_EQ(table->table_memory(), table_before);

--entries_count_;
memory_budget_ += (value_heap_size + key_size_used);

SendInvalidationTrackingMessage(del_it.key());
}
Expand Down
2 changes: 1 addition & 1 deletion src/server/db_slice.h
Original file line number Diff line number Diff line change
Expand Up @@ -575,7 +575,7 @@ class DbSlice {
bool expire_allowed_ = true;

uint64_t version_ = 1; // Used to version entries in the PrimeTable.
ssize_t memory_budget_ = SSIZE_MAX;
ssize_t memory_budget_ = SSIZE_MAX / 2;
size_t bytes_per_object_ = 0;
size_t soft_budget_limit_ = 0;
size_t table_memory_ = 0;
Expand Down
3 changes: 1 addition & 2 deletions src/server/tiered_storage_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -310,8 +310,7 @@ TEST_F(TieredStorageTest, MemoryPressure) {
resp = Run({"INFO", "ALL"});
ASSERT_FALSE(true) << i << "\nInfo ALL:\n" << resp.GetString();
}
// TODO: to remove it once used_mem_current is updated frequently.
ThisFiber::SleepFor(300us);
ThisFiber::SleepFor(100us);
}

EXPECT_LT(used_mem_peak.load(), 20_MB);
Expand Down

0 comments on commit 7c84b8e

Please sign in to comment.