Skip to content

Commit

Permalink
Fix the cost estimate of SORT, s.t. the pubchem queries get the corre…
Browse files Browse the repository at this point in the history
…ct query plans.

Signed-off-by: Johannes Kalmbach <[email protected]>
  • Loading branch information
joka921 committed Dec 5, 2024
1 parent ec806f0 commit 07aa566
Show file tree
Hide file tree
Showing 3 changed files with 51 additions and 12 deletions.
47 changes: 47 additions & 0 deletions src/engine/Sort.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@

#include "engine/CallFixedSize.h"
#include "engine/Engine.h"
#include "engine/Filter.h"
#include "engine/IndexScan.h"
#include "engine/QueryExecutionTree.h"
#include "global/RuntimeParameters.h"

Expand Down Expand Up @@ -74,3 +76,48 @@ ProtoResult Sort::computeResult([[maybe_unused]] bool requestLaziness) {
LOG(DEBUG) << "Sort result computation done." << endl;
return {std::move(idTable), resultSortedOn(), subRes->getSharedLocalVocab()};
}

// _____________________________________________________________________________
size_t Sort::getCostEstimate() {
size_t size = getSizeEstimateBeforeLimit();
size_t logSize =
size < 4 ? 2 : static_cast<size_t>(logb(static_cast<double>(size)));
size_t nlogn = size * logSize;
size_t subcost = subtree_->getCostEstimate();
// Return at least 1, s.t. the query planner will never emit an unnecessary
// sort of an empty `IndexScan`. This makes the testing of the query
// planner much easier.

// Don't return plain `n log n` but also incorporate the number of columns and
// a constant multiplicator for the inherent complexity of sorting.
auto result = std::max(1UL, 20 * getResultWidth() * (nlogn + subcost));

// Determine if the subtree is a FILTER of an INDEX SCAN. This case can be
// useful if the FILTER can be applied via binary search and the result is
// then so small that the SORT doesn't hurt anymore. But in case the FILTER
// doesn't filter out much, and the result size is beyond a configurable
// threshold, we want to heavily discourage the plan with the binary filter +
// sorting, because it breaks the lazy evaluation.
auto sizeEstimateOfFilteredScan = [&]() -> size_t {
if (auto filter =
dynamic_cast<const Filter*>(subtree_->getRootOperation().get())) {
if (dynamic_cast<const IndexScan*>(
filter->getSubtree()->getRootOperation().get())) {
return subtree_->getSizeEstimate();
}
}
return 0;
}();
size_t maxSizeFilteredScan =
RuntimeParameters()
.get<"max-materialization-size-filtered-scan">()
.getBytes() /
sizeof(Id) / subtree_->getResultWidth();
if (sizeEstimateOfFilteredScan > maxSizeFilteredScan) {
// If the filtered result is larger than the defined threshold, make the
// cost estimate much larger, s.t. the query planner will prefer a plan
// without the `SORT`.
result *= 10'000;
}

Check warning on line 121 in src/engine/Sort.cpp

View check run for this annotation

Codecov / codecov/patch

src/engine/Sort.cpp#L120-L121

Added lines #L120 - L121 were not covered by tests
return result;
}
12 changes: 1 addition & 11 deletions src/engine/Sort.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,17 +44,7 @@ class Sort : public Operation {

std::shared_ptr<QueryExecutionTree> getSubtree() const { return subtree_; }

virtual size_t getCostEstimate() override {
size_t size = getSizeEstimateBeforeLimit();
size_t logSize =
size < 4 ? 2 : static_cast<size_t>(logb(static_cast<double>(size)));
size_t nlogn = size * logSize;
size_t subcost = subtree_->getCostEstimate();
// Return at least 1, s.t. the query planner will never emit an unnecessary
// sort of an empty `IndexScan`. This makes the testing of the query
// planner much easier.
return std::max(1UL, nlogn + subcost);
}
size_t getCostEstimate() override;

virtual bool knownEmptyResult() override {
return subtree_->knownEmptyResult();
Expand Down
4 changes: 3 additions & 1 deletion src/global/RuntimeParameters.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,9 @@ inline auto& RuntimeParameters() {
Bool<"throw-on-unbound-variables">{false},
// Control up until which size lazy results should be cached. Caching
// does cause significant overhead for this case.
MemorySizeParameter<"lazy-result-max-cache-size">{5_MB}};
MemorySizeParameter<"lazy-result-max-cache-size">{5_MB},
MemorySizeParameter<"max-materialization-size-filtered-scan">{100_MB},
};
}();
return params;
}
Expand Down

0 comments on commit 07aa566

Please sign in to comment.