Skip to content

Commit

Permalink
KUDU-2001 Refactor rowset size estimates
Browse files Browse the repository at this point in the history
Currently, Rowset::EstimateOnDiskSize() serves two purposes:
1. An estimate of the total size of the rowset, which is
exposed when rolled into the tablet's on-disk size metric.
2. An estimate of the benefit of compaction.
These two purposes conflict: the compaction size counts only
base data and redo deltas that are relevant for compaction, so
e.g. undo deltas are omitted from the estimate.

This patch separates these two purposes. EstimateOnDiskSize()
remains the method for purpose #1, while a new method
EstimateCompactionSize() is introduced for purpose #2.
EstimateOnDiskSize now includes undo deltas, and so is more
accurate than before (however, there's more work to do: see
KUDU-1755).

There should be no changes to compaction policy as a result of
this patch.

Change-Id: I59001adadb9a768a464e7b2cf0f0a5df0ef5393a
Reviewed-on: http://gerrit.cloudera.org:8080/6850
Tested-by: Kudu Jenkins
Reviewed-by: Todd Lipcon <[email protected]>
  • Loading branch information
Will Berkeley authored and wdberkeley committed May 18, 2017
1 parent 068767b commit 1aa0ebb
Show file tree
Hide file tree
Showing 12 changed files with 101 additions and 15 deletions.
2 changes: 1 addition & 1 deletion src/kudu/tablet/compaction_policy-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ TEST(TestCompactionPolicy, TestYcsbCompaction) {
LOG(INFO) << "quality=" << quality;
int total_size = 0;
for (const auto* rs : picked) {
total_size += rs->EstimateOnDiskSize() / 1024 / 1024;
total_size += rs->EstimateCompactionSize() / 1024 / 1024;
}
ASSERT_LE(total_size, budget_mb);
qualities.push_back(quality);
Expand Down
12 changes: 12 additions & 0 deletions src/kudu/tablet/delta_tracker.cc
Original file line number Diff line number Diff line change
Expand Up @@ -735,6 +735,18 @@ size_t DeltaTracker::CountRedoDeltaStores() const {
}

uint64_t DeltaTracker::EstimateOnDiskSize() const {
shared_lock<rw_spinlock> lock(component_lock_);
uint64_t size = 0;
for (const shared_ptr<DeltaStore>& ds : redo_delta_stores_) {
size += ds->EstimateSize();
}
for (const shared_ptr<DeltaStore>& ds : undo_delta_stores_) {
size += ds->EstimateSize();
}
return size;
}

uint64_t DeltaTracker::EstimateRedoDeltaOnDiskSize() const {
shared_lock<rw_spinlock> lock(component_lock_);
uint64_t size = 0;
for (const shared_ptr<DeltaStore>& ds : redo_delta_stores_) {
Expand Down
4 changes: 4 additions & 0 deletions src/kudu/tablet/delta_tracker.h
Original file line number Diff line number Diff line change
Expand Up @@ -216,8 +216,12 @@ class DeltaTracker {
// Return the number of redo delta stores, not including the DeltaMemStore.
size_t CountRedoDeltaStores() const;

// Estimate the number of bytes on disk of all delta blocks.
uint64_t EstimateOnDiskSize() const;

// Estimate the number of bytes on disk of REDO deltas.
uint64_t EstimateRedoDeltaOnDiskSize() const;

// Retrieves the list of column indexes that currently have updates.
void GetColumnIdsWithUpdates(std::vector<ColumnId>* col_ids) const;

Expand Down
40 changes: 40 additions & 0 deletions src/kudu/tablet/diskrowset-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -569,5 +569,45 @@ TEST_F(TestRowSet, TestGCAncientStores) {
ASSERT_EQ(0, dt->CountRedoDeltaStores());
}

// Checks how the size estimates of a DiskRowSet and its delta tracker relate
// to each other in three successive states: base data plus REDO deltas only,
// base data plus UNDO deltas only (after a major delta compaction), and base
// data with both REDO and UNDO deltas. The overall on-disk estimates are
// expected to account for UNDO deltas, while the REDO-only estimates are not.
TEST_F(TestRowSet, TestDiskSizeEstimation) {
// Force the files to be opened so the stats are read.
FLAGS_cfile_lazy_open = false;

// Write a rowset.
WriteTestRowSet();
shared_ptr<DiskRowSet> rs;
ASSERT_OK(OpenTestRowSet(&rs));

// Write a first delta file.
UpdateExistingRows(rs.get(), FLAGS_update_fraction, nullptr);
ASSERT_OK(rs->FlushDeltas());

// The rowset consists of base data and REDO deltas, so
// 1. the delta tracker's on-disk estimate should equal its on-disk estimate for REDOs.
// 2. the rowset's on-disk estimate should equal the sum of the base data and REDO estimates.
ASSERT_EQ(rs->delta_tracker()->EstimateOnDiskSize(),
rs->delta_tracker()->EstimateRedoDeltaOnDiskSize());
ASSERT_EQ(rs->EstimateOnDiskSize(),
rs->EstimateBaseDataDiskSize() + rs->EstimateRedoDeltaDiskSize());

// Convert the REDO delta to an UNDO delta.
// REDO size should be zero, but there should be UNDOs, so the on-disk size of the rowset
// should be larger than the base data.
ASSERT_OK(rs->MajorCompactDeltaStores(HistoryGcOpts::Disabled()));
ASSERT_EQ(0, rs->EstimateRedoDeltaDiskSize());
ASSERT_GT(rs->EstimateOnDiskSize(), rs->EstimateBaseDataDiskSize());

// Write a second delta file.
UpdateExistingRows(rs.get(), FLAGS_update_fraction, nullptr);
ASSERT_OK(rs->FlushDeltas());

// There's base data, REDOs, and UNDOs, so the delta tracker and rowset's sizes should be larger
// than estimates counting only base data and REDOs.
ASSERT_GT(rs->delta_tracker()->EstimateOnDiskSize(),
rs->delta_tracker()->EstimateRedoDeltaOnDiskSize());
ASSERT_GT(rs->EstimateOnDiskSize(),
rs->EstimateBaseDataDiskSize() + rs->EstimateRedoDeltaDiskSize());
}

} // namespace tablet
} // namespace kudu
12 changes: 9 additions & 3 deletions src/kudu/tablet/diskrowset.cc
Original file line number Diff line number Diff line change
Expand Up @@ -680,10 +680,10 @@ uint64_t DiskRowSet::EstimateBaseDataDiskSize() const {
return base_data_->EstimateOnDiskSize();
}

uint64_t DiskRowSet::EstimateDeltaDiskSize() const {
uint64_t DiskRowSet::EstimateRedoDeltaDiskSize() const {
DCHECK(open_);
shared_lock<rw_spinlock> l(component_lock_);
return delta_tracker_->EstimateOnDiskSize();
return delta_tracker_->EstimateRedoDeltaOnDiskSize();
}

uint64_t DiskRowSet::EstimateOnDiskSize() const {
Expand All @@ -692,6 +692,12 @@ uint64_t DiskRowSet::EstimateOnDiskSize() const {
return base_data_->EstimateOnDiskSize() + delta_tracker_->EstimateOnDiskSize();
}

// Estimates the number of bytes of this rowset that are relevant for
// compaction: the base data estimate plus the REDO delta estimate.
uint64_t DiskRowSet::EstimateCompactionSize() const {
  DCHECK(open_);
  // Hold the component lock in shared mode while both components are read.
  shared_lock<rw_spinlock> l(component_lock_);

  const auto base_bytes = base_data_->EstimateOnDiskSize();
  const auto redo_bytes = delta_tracker_->EstimateRedoDeltaOnDiskSize();
  return base_bytes + redo_bytes;
}

size_t DiskRowSet::DeltaMemStoreSize() const {
DCHECK(open_);
return delta_tracker_->DeltaMemStoreSize();
Expand Down Expand Up @@ -739,7 +745,7 @@ double DiskRowSet::DeltaStoresCompactionPerfImprovementScore(DeltaCompactionType
delta_tracker_->GetColumnIdsWithUpdates(&col_ids_with_updates);
// If we have files but no updates, we don't want to major compact.
if (!col_ids_with_updates.empty()) {
double ratio = static_cast<double>(EstimateDeltaDiskSize()) / base_data_size;
double ratio = static_cast<double>(EstimateRedoDeltaDiskSize()) / base_data_size;
if (ratio >= FLAGS_tablet_delta_store_major_compact_min_ratio) {
perf_improv = ratio;
}
Expand Down
10 changes: 6 additions & 4 deletions src/kudu/tablet/diskrowset.h
Original file line number Diff line number Diff line change
Expand Up @@ -331,13 +331,15 @@ class DiskRowSet : public RowSet {
// Estimate the number of bytes on-disk for the base data.
uint64_t EstimateBaseDataDiskSize() const;

// Estimate the number of bytes on-disk for the delta stores.
uint64_t EstimateDeltaDiskSize() const;
// Estimate the number of bytes on-disk of REDO deltas.
uint64_t EstimateRedoDeltaDiskSize() const;

// Estimate the total number of bytes on-disk, excluding the bloom files and the ad hoc index.
// TODO Offer a version that has the real total disk space usage.
// Estimate the total number of bytes on-disk. Excludes the bloom files and the ad hoc index.
// TODO(wdberkeley) Offer a version that has the real total disk space usage. See KUDU-1755.
uint64_t EstimateOnDiskSize() const OVERRIDE;

uint64_t EstimateCompactionSize() const OVERRIDE;

size_t DeltaMemStoreSize() const OVERRIDE;

bool DeltaMemStoreEmpty() const OVERRIDE;
Expand Down
4 changes: 4 additions & 0 deletions src/kudu/tablet/memrowset.h
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,10 @@ class MemRowSet : public RowSet,
return 0;
}

// Always reports zero bytes as this rowset's compaction size estimate.
uint64_t EstimateCompactionSize() const OVERRIDE {
return 0;
}

// Returns a pointer to this rowset's compact_flush_lock_ member, the mutex
// used when including the rowset in a compaction or flush.
std::mutex *compact_flush_lock() OVERRIDE {
return &compact_flush_lock_;
}
Expand Down
8 changes: 8 additions & 0 deletions src/kudu/tablet/mock-rowsets.h
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,10 @@ class MockRowSet : public RowSet {
LOG(FATAL) << "Unimplemented";
return 0;
}
// Stub: logs a fatal "Unimplemented" message when called. Mocks that need a
// usable compaction size estimate must override this.
virtual uint64_t EstimateCompactionSize() const OVERRIDE {
LOG(FATAL) << "Unimplemented";
// Gives the function a return value on every path.
return 0;
}
virtual std::mutex *compact_flush_lock() OVERRIDE {
LOG(FATAL) << "Unimplemented";
return NULL;
Expand Down Expand Up @@ -164,6 +168,10 @@ class MockDiskRowSet : public MockRowSet {
return size_;
}

// Reports the mock's size_ member as its compaction size estimate.
virtual uint64_t EstimateCompactionSize() const OVERRIDE {
return size_;
}

virtual std::string ToString() const OVERRIDE {
return strings::Substitute("mock[$0, $1]",
Slice(first_key_).ToDebugString(),
Expand Down
10 changes: 9 additions & 1 deletion src/kudu/tablet/rowset.cc
Original file line number Diff line number Diff line change
Expand Up @@ -208,11 +208,19 @@ Status DuplicatingRowSet::GetBounds(string* min_encoded_key,
}

uint64_t DuplicatingRowSet::EstimateOnDiskSize() const {
uint64_t size = 0;
for (const shared_ptr<RowSet> &rs : new_rowsets_) {
size += rs->EstimateOnDiskSize();
}
return size;
}

uint64_t DuplicatingRowSet::EstimateCompactionSize() const {
// The actual value of this doesn't matter, since it won't be selected
// for compaction.
uint64_t size = 0;
for (const shared_ptr<RowSet> &rs : new_rowsets_) {
size += rs->EstimateOnDiskSize();
size += rs->EstimateCompactionSize();
}
return size;
}
Expand Down
5 changes: 5 additions & 0 deletions src/kudu/tablet/rowset.h
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,9 @@ class RowSet {
// Estimate the number of bytes on-disk
virtual uint64_t EstimateOnDiskSize() const = 0;

// Estimate the number of bytes relevant for compaction.
virtual uint64_t EstimateCompactionSize() const = 0;

// Return the lock used for including this DiskRowSet in a compaction.
// This prevents multiple compactions and flushes from trying to include
// the same rowset.
Expand Down Expand Up @@ -328,6 +331,8 @@ class DuplicatingRowSet : public RowSet {

uint64_t EstimateOnDiskSize() const OVERRIDE;

uint64_t EstimateCompactionSize() const OVERRIDE;

string ToString() const OVERRIDE;

virtual Status DebugDump(vector<string> *lines = NULL) OVERRIDE;
Expand Down
2 changes: 1 addition & 1 deletion src/kudu/tablet/rowset_info.cc
Original file line number Diff line number Diff line change
Expand Up @@ -254,7 +254,7 @@ void RowSetInfo::CollectOrdered(const RowSetTree& tree,

RowSetInfo::RowSetInfo(RowSet* rs, double init_cdf)
: rowset_(rs),
size_bytes_(rs->EstimateOnDiskSize()),
size_bytes_(rs->EstimateCompactionSize()),
size_mb_(std::max(implicit_cast<int>(size_bytes_ / 1024 / 1024), kMinSizeMb)),
cdf_min_key_(init_cdf),
cdf_max_key_(init_cdf) {
Expand Down
7 changes: 2 additions & 5 deletions src/kudu/tablet/tablet.cc
Original file line number Diff line number Diff line change
Expand Up @@ -146,12 +146,9 @@ METRIC_DEFINE_gauge_size(tablet, memrowset_size, "MemRowSet Memory Usage",
"Size of this tablet's memrowset");
METRIC_DEFINE_gauge_size(tablet, on_disk_size, "Tablet Size On Disk",
kudu::MetricUnit::kBytes,
"Size of this tablet on disk.");
"Space used by this tablet's data blocks.");

using base::subtle::Barrier_AtomicIncrement;
using kudu::MaintenanceManager;
using kudu::consensus::OpId;
using kudu::consensus::MaximumOpId;
using kudu::log::LogAnchorRegistry;
using kudu::server::HybridClock;
using std::shared_ptr;
Expand Down Expand Up @@ -1470,7 +1467,7 @@ Status Tablet::DoMergeCompactionOrFlush(const RowSetsInCompaction &input,
if (input.num_rowsets() > 1) {
MAYBE_FAULT(FLAGS_fault_crash_before_flush_tablet_meta_after_compaction);
} else if (input.num_rowsets() == 1 &&
input.rowsets()[0]->EstimateOnDiskSize() == 0) {
input.rowsets()[0]->EstimateCompactionSize() == 0) {
MAYBE_FAULT(FLAGS_fault_crash_before_flush_tablet_meta_after_flush_mrs);
}

Expand Down

0 comments on commit 1aa0ebb

Please sign in to comment.