Skip to content

Commit

Permalink
bloom filter for ngramms (ydb-platform#12893)
Browse files Browse the repository at this point in the history
  • Loading branch information
ivanmorozov333 authored and zverevgeny committed Jan 8, 2025
1 parent 0cc5b1a commit 40de793
Show file tree
Hide file tree
Showing 16 changed files with 809 additions and 305 deletions.
196 changes: 132 additions & 64 deletions ydb/core/kqp/ut/olap/indexes_ut.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -302,9 +302,8 @@ Y_UNIT_TEST_SUITE(KqpOlapIndexes) {
}

AFL_VERIFY(updatesCount + 6 ==
(ui64)csController->GetActualizationRefreshSchemeCount().Val())(
"updates", updatesCount)("count",
csController->GetActualizationRefreshSchemeCount().Val());
(ui64)csController->GetActualizationRefreshSchemeCount().Val())("updates", updatesCount)(
"count", csController->GetActualizationRefreshSchemeCount().Val());
}

class TTestIndexesScenario {
Expand All @@ -313,14 +312,38 @@ Y_UNIT_TEST_SUITE(KqpOlapIndexes) {
std::unique_ptr<TKikimrRunner> Kikimr;
YDB_ACCESSOR(TString, StorageId, "__DEFAULT");

ui64 SkipStart = 0;
ui64 NoDataStart = 0;
ui64 ApproveStart = 0;

template <class TController>
void ResetZeroLevel(TController& g) {
SkipStart = g->GetIndexesSkippingOnSelect().Val();
ApproveStart = g->GetIndexesApprovedOnSelect().Val();
NoDataStart = g->GetIndexesSkippedNoData().Val();
}

void ExecuteSQL(const TString& text, const TString& expectedResult) const {
auto tableClient = Kikimr->GetTableClient();
auto it = tableClient.StreamExecuteScanQuery(text).GetValueSync();
UNIT_ASSERT_C(it.IsSuccess(), it.GetIssues().ToString());
TString result = StreamResultToYson(it);
AFL_WARN(NKikimrServices::TX_COLUMNSHARD)("result", result)("expected", expectedResult);
auto* controller = NYDBTest::TControllers::GetControllerAs<NOlap::TWaitCompactionController>();
AFL_WARN(NKikimrServices::TX_COLUMNSHARD)("skip", controller->GetIndexesSkippingOnSelect().Val() - SkipStart)(
"check", controller->GetIndexesApprovedOnSelect().Val() - ApproveStart)(
"no_data", controller->GetIndexesSkippedNoData().Val() - NoDataStart);
CompareYson(result, expectedResult);
}

public:
TTestIndexesScenario& Initialize() {
Settings = TKikimrSettings().SetWithSampleTables(false);
Kikimr = std::make_unique<TKikimrRunner>(Settings);
return *this;
}

void Execute() const {
void Execute() {
auto csController = NYDBTest::TControllers::RegisterCSControllerGuard<NOlap::TWaitCompactionController>();
csController->SetOverrideReduceMemoryIntervalLimit(1LLU << 30);
csController->SetOverrideMemoryLimitForPortionReading(1e+10);
Expand All @@ -340,6 +363,17 @@ Y_UNIT_TEST_SUITE(KqpOlapIndexes) {
auto alterResult = session.ExecuteSchemeQuery(alterQuery).GetValueSync();
UNIT_ASSERT_VALUES_EQUAL_C(alterResult.GetStatus(), NYdb::EStatus::SUCCESS, alterResult.GetIssues().ToString());
}
{
auto alterQuery =
TStringBuilder() << Sprintf(
R"(ALTER OBJECT `/Root/olapStore` (TYPE TABLESTORE) SET (ACTION=UPSERT_INDEX, NAME=index_ngramm_uid, TYPE=BLOOM_NGRAMM_FILTER,
FEATURES=`{"column_name" : "resource_id", "ngramm_size" : 3, "hashes_count" : 2, "filter_size_bytes" : 64024}`);
)",
StorageId.data());
auto session = tableClient.CreateSession().GetValueSync().GetSession();
auto alterResult = session.ExecuteSchemeQuery(alterQuery).GetValueSync();
UNIT_ASSERT_VALUES_EQUAL_C(alterResult.GetStatus(), NYdb::EStatus::SUCCESS, alterResult.GetIssues().ToString());
}
{
auto alterQuery =
TStringBuilder() << Sprintf(
Expand Down Expand Up @@ -384,22 +418,7 @@ Y_UNIT_TEST_SUITE(KqpOlapIndexes) {
filler(3000000, 100000000, 110000);
}

{
auto it = tableClient
.StreamExecuteScanQuery(R"(
--!syntax_v1
SELECT
COUNT(*)
FROM `/Root/olapStore/olapTable`
)")
.GetValueSync();

UNIT_ASSERT_C(it.IsSuccess(), it.GetIssues().ToString());
TString result = StreamResultToYson(it);
Cout << result << Endl;
CompareYson(result, R"([[230000u;]])");
}
ExecuteSQL(R"(SELECT COUNT(*) FROM `/Root/olapStore/olapTable`)", "[[230000u;]]");

AFL_VERIFY(csController->GetIndexesSkippingOnSelect().Val() == 0);
AFL_VERIFY(csController->GetIndexesApprovedOnSelect().Val() == 0);
Expand All @@ -417,54 +436,103 @@ Y_UNIT_TEST_SUITE(KqpOlapIndexes) {
AFL_VERIFY(csController->GetCompactionStartedCounter().Val() == 21)("count", csController->GetCompactionStartedCounter().Val());

{
auto it = tableClient
.StreamExecuteScanQuery(R"(
--!syntax_v1
ExecuteSQL(R"(SELECT COUNT(*)
FROM `/Root/olapStore/olapTable`
WHERE resource_id LIKE '%110a151' AND resource_id LIKE '110a%' AND resource_id LIKE '%dd%')", "[[0u;]]");
AFL_VERIFY(!csController->GetIndexesApprovedOnSelect().Val());
AFL_VERIFY(csController->GetIndexesSkippingOnSelect().Val());
}
{
ResetZeroLevel(csController);
ExecuteSQL(R"(SELECT COUNT(*)
FROM `/Root/olapStore/olapTable`
WHERE resource_id LIKE '%110a151%')", "[[0u;]]");
AFL_VERIFY(!csController->GetIndexesApprovedOnSelect().Val());
AFL_VERIFY(csController->GetIndexesSkippingOnSelect().Val() - SkipStart);
}
{
ResetZeroLevel(csController);
ExecuteSQL(R"(SELECT COUNT(*)
FROM `/Root/olapStore/olapTable`
WHERE ((resource_id = '2' AND level = 222222) OR (resource_id = '1' AND level = 111111) OR (resource_id LIKE '%11dd%')) AND uid = '222')", "[[0u;]]");

SELECT
COUNT(*)
FROM `/Root/olapStore/olapTable`
WHERE ((resource_id = '2' AND level = 222222) OR (resource_id = '1' AND level = 111111) OR (resource_id LIKE '%11dd%')) AND uid = '222'
)")
.GetValueSync();

UNIT_ASSERT_C(it.IsSuccess(), it.GetIssues().ToString());
TString result = StreamResultToYson(it);
AFL_WARN(NKikimrServices::TX_COLUMNSHARD)("result", result);
AFL_WARN(NKikimrServices::TX_COLUMNSHARD)("skip", csController->GetIndexesSkippingOnSelect().Val())(
"check", csController->GetIndexesApprovedOnSelect().Val());
CompareYson(result, R"([[0u;]])");
if (StorageId == "__LOCAL_METADATA") {
AFL_VERIFY(csController->GetIndexesSkippedNoData().Val());
} else {
AFL_VERIFY(csController->GetIndexesSkippedNoData().Val() == 0)("val", csController->GetIndexesSkippedNoData().Val());
AFL_VERIFY(csController->GetIndexesSkippedNoData().Val() == 0)("val", csController->GetIndexesSkippedNoData().Val());
AFL_VERIFY(csController->GetIndexesApprovedOnSelect().Val() - ApproveStart < csController->GetIndexesSkippingOnSelect().Val() - SkipStart);
}
{
ResetZeroLevel(csController);
ui32 requestsCount = 100;
for (ui32 i = 0; i < requestsCount; ++i) {
const ui32 idx = RandomNumber<ui32>(uids.size());
const auto query = [](const TString& res, const TString& uid, const ui32 level) {
TStringBuilder sb;
sb << "SELECT COUNT(*) FROM `/Root/olapStore/olapTable`" << Endl;
sb << "WHERE(" << Endl;
sb << "resource_id = '" << res << "' AND" << Endl;
sb << "uid= '" << uid << "' AND" << Endl;
sb << "level= " << level << Endl;
sb << ")";
return sb;
};
ExecuteSQL(query(resourceIds[idx], uids[idx], levels[idx]), "[[1u;]]");
}
AFL_VERIFY(csController->GetIndexesApprovedOnSelect().Val() < csController->GetIndexesSkippingOnSelect().Val());
AFL_VERIFY((csController->GetIndexesApprovedOnSelect().Val() - ApproveStart) * 5 < csController->GetIndexesSkippingOnSelect().Val() - SkipStart)
("approved", csController->GetIndexesApprovedOnSelect().Val() - ApproveStart)(
"skipped", csController->GetIndexesSkippingOnSelect().Val() - SkipStart);
}
ui32 requestsCount = 100;
for (ui32 i = 0; i < requestsCount; ++i) {
const ui32 idx = RandomNumber<ui32>(uids.size());
const auto query = [](const TString& res, const TString& uid, const ui32 level) {
TStringBuilder sb;
sb << "SELECT COUNT(*) FROM `/Root/olapStore/olapTable`" << Endl;
sb << "WHERE(" << Endl;
sb << "resource_id = '" << res << "' AND" << Endl;
sb << "uid= '" << uid << "' AND" << Endl;
sb << "level= " << level << Endl;
sb << ")";
return sb;
};
auto it = tableClient.StreamExecuteScanQuery(query(resourceIds[idx], uids[idx], levels[idx])).GetValueSync();

UNIT_ASSERT_C(it.IsSuccess(), it.GetIssues().ToString());
TString result = StreamResultToYson(it);
Cout << csController->GetIndexesSkippingOnSelect().Val() << " / " << csController->GetIndexesApprovedOnSelect().Val() << " / "
<< csController->GetIndexesSkippedNoData().Val() << Endl;
CompareYson(result, R"([[1u;]])");
{
ResetZeroLevel(csController);
ui32 requestsCount = 100;
for (ui32 i = 0; i < requestsCount; ++i) {
const ui32 idx = RandomNumber<ui32>(uids.size());
const auto query = [](const TString& res, const TString& uid, const ui32 level) {
TStringBuilder sb;
sb << "SELECT COUNT(*) FROM `/Root/olapStore/olapTable`" << Endl;
sb << "WHERE" << Endl;
sb << "resource_id LIKE '%" << res << "%'" << Endl;
return sb;
};
ExecuteSQL(query(resourceIds[idx], uids[idx], levels[idx]), "[[1u;]]");
}
AFL_VERIFY(csController->GetIndexesSkippingOnSelect().Val() - SkipStart > 1)("approved", csController->GetIndexesApprovedOnSelect().Val() - ApproveStart)(
"skipped", csController->GetIndexesSkippingOnSelect().Val() - SkipStart);
}
{
ResetZeroLevel(csController);
ui32 requestsCount = 100;
for (ui32 i = 0; i < requestsCount; ++i) {
const ui32 idx = RandomNumber<ui32>(uids.size());
const auto query = [](const TString& res, const TString& uid, const ui32 level) {
TStringBuilder sb;
sb << "SELECT COUNT(*) FROM `/Root/olapStore/olapTable`" << Endl;
sb << "WHERE" << Endl;
sb << "resource_id LIKE '" << res << "%'" << Endl;
return sb;
};
ExecuteSQL(query(resourceIds[idx], uids[idx], levels[idx]), "[[1u;]]");
}
AFL_VERIFY(csController->GetIndexesSkippingOnSelect().Val() - SkipStart > 1)(
"approved", csController->GetIndexesApprovedOnSelect().Val() - ApproveStart)(
"skipped", csController->GetIndexesSkippingOnSelect().Val() - SkipStart);
}
{
ResetZeroLevel(csController);
ui32 requestsCount = 100;
for (ui32 i = 0; i < requestsCount; ++i) {
const ui32 idx = RandomNumber<ui32>(uids.size());
const auto query = [](const TString& res, const TString& uid, const ui32 level) {
TStringBuilder sb;
sb << "SELECT COUNT(*) FROM `/Root/olapStore/olapTable`" << Endl;
sb << "WHERE" << Endl;
sb << "resource_id LIKE '%" << res << "'" << Endl;
return sb;
};
ExecuteSQL(query(resourceIds[idx], uids[idx], levels[idx]), "[[1u;]]");
}
AFL_VERIFY(csController->GetIndexesSkippingOnSelect().Val() - SkipStart > 1)(
"approved", csController->GetIndexesApprovedOnSelect().Val() - ApproveStart)(
"skipped", csController->GetIndexesSkippingOnSelect().Val() - SkipStart);
}

AFL_VERIFY(csController->GetIndexesApprovedOnSelect().Val() * 5 < csController->GetIndexesSkippingOnSelect().Val())
("approved", csController->GetIndexesApprovedOnSelect().Val())("skipped", csController->GetIndexesSkippingOnSelect().Val());
}
};

Expand Down
16 changes: 16 additions & 0 deletions ydb/core/protos/flat_scheme_op.proto
Original file line number Diff line number Diff line change
Expand Up @@ -469,6 +469,13 @@ message TRequestedBloomFilter {
repeated string ColumnNames = 3;
}

message TRequestedBloomNGrammFilter {
optional uint32 NGrammSize = 1;
optional uint32 FilterSizeBytes = 2;
optional uint32 HashesCount = 3;
optional string ColumnName = 4;
}

message TRequestedMaxIndex {
optional string ColumnName = 1;
}
Expand All @@ -488,6 +495,7 @@ message TOlapIndexRequested {
TRequestedBloomFilter BloomFilter = 40;
TRequestedMaxIndex MaxIndex = 41;
TRequestedCountMinSketch CountMinSketch = 42;
TRequestedBloomNGrammFilter BloomNGrammFilter = 43;
}
}

Expand All @@ -497,6 +505,13 @@ message TBloomFilter {
repeated uint32 ColumnIds = 3;
}

message TBloomNGrammFilter {
optional uint32 NGrammSize = 1;
optional uint32 FilterSizeBytes = 2;
optional uint32 HashesCount = 3;
optional uint32 ColumnId = 4;
}

message TMaxIndex {
optional uint32 ColumnId = 1;
}
Expand All @@ -519,6 +534,7 @@ message TOlapIndexDescription {
TBloomFilter BloomFilter = 41;
TMaxIndex MaxIndex = 42;
TCountMinSketch CountMinSketch = 43;
TBloomNGrammFilter BloomNGrammFilter = 44;
}
}

Expand Down
Loading

0 comments on commit 40de793

Please sign in to comment.