From d893bd239fde881a1fb2546be1616cb8f71f570b Mon Sep 17 00:00:00 2001 From: ivanmorozov333 Date: Wed, 25 Dec 2024 14:01:03 +0300 Subject: [PATCH 01/11] bloom ngramms speed up --- .../engines/storage/indexes/bloom/checker.h | 22 ++++ .../storage/indexes/bloom_ngramm/meta.cpp | 121 ++++++++++++------ .../storage/indexes/bloom_ngramm/meta.h | 20 --- 3 files changed, 104 insertions(+), 59 deletions(-) diff --git a/ydb/core/tx/columnshard/engines/storage/indexes/bloom/checker.h b/ydb/core/tx/columnshard/engines/storage/indexes/bloom/checker.h index 740af9f1720d..23d46e21b557 100644 --- a/ydb/core/tx/columnshard/engines/storage/indexes/bloom/checker.h +++ b/ydb/core/tx/columnshard/engines/storage/indexes/bloom/checker.h @@ -11,6 +11,28 @@ class TFixStringBitsStorage { : Data(data) {} + TFixStringBitsStorage(const std::vector& bitsVector) + : TFixStringBitsStorage(bitsVector.size()) { + ui32 byteIdx = 0; + ui8 byteCurrent = 0; + ui8 shiftCurrent = 0; + for (ui32 i = 0; i < bitsVector.size(); ++i) { + if (i && i % 8 == 0) { + Data[byteIdx] = (char)byteCurrent; + byteCurrent = 0; + shiftCurrent = 1; + ++byteIdx; + } + if (bitsVector[i]) { + byteCurrent += shiftCurrent; + } + shiftCurrent = (shiftCurrent << 1); + } + if (byteCurrent) { + Data[byteIdx] = (char)byteCurrent; + } + } + ui32 GetSizeBits() const { return Data.size() * 8; } diff --git a/ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/meta.cpp b/ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/meta.cpp index af139065b9cf..9a4157421a33 100644 --- a/ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/meta.cpp +++ b/ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/meta.cpp @@ -15,8 +15,70 @@ namespace NKikimr::NOlap::NIndexes::NBloomNGramm { class TNGrammBuilder { private: - NArrow::NHash::NXX64::TStreamStringHashCalcer HashCalcer; TBuffer Zeros; + const ui32 HashesCount; + + static const ui64 HashesConstructorP = 9223372036854775783; + static const ui64 HashesConstructorA = 1; + + template + class THashesBuilder { + public: + template + static void Build(const ui64 originalHash, const TActor& actor) { + actor((HashesConstructorA * originalHash + HashIdx) % HashesConstructorP); + } + }; + + template <> + class THashesBuilder<0> { + public: + template + static void Build(const ui64 /*originalHash*/, const TActor& /*actor*/) { + } + }; + + template + void BuildHashesSet(const ui64 originalHash, const TActor& actor) const { + if (HashesCount == 1) { + THashesBuilder<1>::Build(originalHash, actor); + } else if (HashesCount == 2) { + THashesBuilder<2>::Build(originalHash, actor); + } else if (HashesCount == 3) { + THashesBuilder<3>::Build(originalHash, actor); + } else if (HashesCount == 4) { + THashesBuilder<4>::Build(originalHash, actor); + } else if (HashesCount == 5) { + THashesBuilder<5>::Build(originalHash, actor); + } else if (HashesCount == 6) { + THashesBuilder<6>::Build(originalHash, actor); + } else if (HashesCount == 7) { + THashesBuilder<7>::Build(originalHash, actor); + } else if (HashesCount == 8) { + THashesBuilder<8>::Build(originalHash, actor); + } else { + for (ui32 b = 1; b <= HashesCount; ++b) { + const ui64 hash = (HashesConstructorA * originalHash + b) % HashesConstructorP; + actor(hash); + } + } + } + + ui64 CalcHash(const char* data, const ui32 size) const { + if (size == 3) { + return ((ui64)data[0]) | (((ui64)data[1]) << 8) | (((ui64)data[2]) << 16); + } else if (size == 4) { + return *(ui32*)&data[0]; + } else { + uint64_t h = 2166136261; + for (size_t i = 0; i < size; i++) { + h = h ^ uint64_t(data[i]); + h = h * 16777619; + } + return h; + } + } + template void BuildNGramms(const char* data, const ui32 dataSize, const std::optional op, const ui32 nGrammSize, const TAction& pred) const { @@ -28,24 +90,26 @@ class TNGrammBuilder { if (fakeStart.size() < nGrammSize) { fakeStart.Append(Zeros.data(), nGrammSize - fakeStart.size()); } - pred(fakeStart.data()); + BuildHashesSet(CalcHash(fakeStart.data(), nGrammSize), pred); } } for (ui32 c = 0; c < dataSize; ++c) { if (c + nGrammSize <= dataSize) { - pred(data + c); + pred(CalcHash(data + c, nGrammSize)); } else if (!op || op == NRequest::TLikePart::EOperation::EndsWith) { TBuffer fakeStart; fakeStart.Append(data + c, dataSize - c); fakeStart.Append(Zeros.data(), nGrammSize - fakeStart.size()); - pred(fakeStart.data()); + BuildHashesSet(CalcHash(fakeStart.data(), nGrammSize), pred); } } } public: - TNGrammBuilder() - : HashCalcer(0) { + TNGrammBuilder(const ui32 hashesCount) + : HashesCount(hashesCount) + { + AFL_VERIFY((ui64)HashesCount < HashesConstructorP); Zeros.Fill('\0', 1024); } @@ -64,15 +128,7 @@ class TNGrammBuilder { } if constexpr (arrow::has_string_view()) { auto value = typedArray.GetView(row); - if (value.size() < nGrammSize) { - continue; - } - const auto pred = [&](const char* data) { - HashCalcer.Start(); - HashCalcer.Update((const ui8*)data, nGrammSize); - fillData(HashCalcer.Finish()); - }; - BuildNGramms(value.data(), value.size(), {}, nGrammSize, pred); + BuildNGramms(value.data(), value.size(), {}, nGrammSize, fillData); } else { AFL_VERIFY(false); } @@ -83,33 +139,23 @@ class TNGrammBuilder { template void FillNGrammHashes(const ui32 nGrammSize, const NRequest::TLikePart::EOperation op, const TString& userReq, const TFiller& fillData) { - const auto pred = [&](const char* value) { - HashCalcer.Start(); - HashCalcer.Update((const ui8*)value, nGrammSize); - fillData(HashCalcer.Finish()); - }; - BuildNGramms(userReq.data(), userReq.size(), op, nGrammSize, pred); + BuildNGramms(userReq.data(), userReq.size(), op, nGrammSize, fillData); } }; TString TIndexMeta::DoBuildIndexImpl(TChunkedBatchReader& reader) const { AFL_VERIFY(reader.GetColumnsCount() == 1)("count", reader.GetColumnsCount()); - TNGrammBuilder builder; + TNGrammBuilder builder(HashesCount); - TFixStringBitsStorage bits(FilterSizeBytes * 8); - - const auto pred = [&](const ui64 hash) { - const auto predSet = [&](const ui64 hashSecondary) { - bits.Set(true, hashSecondary % bits.GetSizeBits()); - }; - BuildHashesSet(hash, predSet); + std::vector bitsVector(FilterSizeBytes * 8, false); + const auto predSet = [&](const ui64 hashSecondary) { + bitsVector[hashSecondary % (FilterSizeBytes * 8)] = true; }; for (reader.Start(); reader.IsCorrect();) { - builder.FillNGrammHashes(NGrammSize, reader.begin()->GetCurrentChunk(), pred); + builder.FillNGrammHashes(NGrammSize, reader.begin()->GetCurrentChunk(), predSet); reader.ReadNext(reader.begin()->GetCurrentChunk()->length()); } - - return bits.GetData(); + return TFixStringBitsStorage(bitsVector).GetData(); } void TIndexMeta::DoFillIndexCheckers( @@ -133,16 +179,13 @@ void TIndexMeta::DoFillIndexCheckers( } std::set hashes; - const auto pred = [&](const ui64 hash) { - const auto predSet = [&](const ui64 hashSecondary) { - hashes.emplace(hashSecondary); - }; - BuildHashesSet(hash, predSet); + const auto predSet = [&](const ui64 hashSecondary) { + hashes.emplace(hashSecondary); }; - TNGrammBuilder builder; + TNGrammBuilder builder(HashesCount); for (auto&& c : foundColumns) { for (auto&& ls : c.second.GetLikeSequences()) { - builder.FillNGrammHashes(NGrammSize, ls.second.GetOperation(), ls.second.GetValue(), pred); + builder.FillNGrammHashes(NGrammSize, ls.second.GetOperation(), ls.second.GetValue(), predSet); } } branch->MutableIndexes().emplace_back(std::make_shared(GetIndexId(), std::move(hashes))); diff --git a/ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/meta.h b/ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/meta.h index 98af4556a5a5..c548b958de55 100644 --- a/ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/meta.h +++ b/ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/meta.h @@ -23,26 +23,6 @@ class TIndexMeta: public TIndexByColumns { AFL_VERIFY(NGrammSize > 2); } - static const ui64 HashesConstructorP = ((ui64)2 << 31) - 1; - static const ui64 HashesConstructorA = (ui64)2 << 16; - - template - void BuildHashesSet(const ui64 originalHash, const TActor& actor) const { - AFL_VERIFY(HashesCount < HashesConstructorP); - for (ui32 b = 1; b <= HashesCount; ++b) { - const ui64 hash = (HashesConstructorA * originalHash + b) % HashesConstructorP; - actor(hash); - } - } - - template - void BuildHashesSet(const TContainer& originalHashes, const TActor& actor) const { - AFL_VERIFY(HashesCount < HashesConstructorP); - for (auto&& hOriginal : originalHashes) { - BuildHashesSet(hOriginal, actor); - } - } - protected: virtual TConclusionStatus DoCheckModificationCompatibility(const IIndexMeta& /*newMeta*/) const override { return TConclusionStatus::Fail("not supported"); From d967cfcf361fedaa0f0b211a2e9c72eb420c3216 Mon Sep 17 00:00:00 2001 From: ivanmorozov333 Date: Wed, 25 Dec 2024 14:23:54 +0300 Subject: [PATCH 02/11] improve test --- ydb/core/kqp/ut/olap/indexes_ut.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/ydb/core/kqp/ut/olap/indexes_ut.cpp b/ydb/core/kqp/ut/olap/indexes_ut.cpp index 014b5c6abe3e..be8e4e7eae8c 100644 --- a/ydb/core/kqp/ut/olap/indexes_ut.cpp +++ b/ydb/core/kqp/ut/olap/indexes_ut.cpp @@ -482,7 +482,7 @@ Y_UNIT_TEST_SUITE(KqpOlapIndexes) { } { ResetZeroLevel(csController); - ui32 requestsCount = 100; + ui32 requestsCount = 500; for (ui32 i = 0; i < requestsCount; ++i) { const ui32 idx = RandomNumber(uids.size()); const auto query = [](const TString& res, const TString& uid, const ui32 level) { @@ -494,12 +494,12 @@ Y_UNIT_TEST_SUITE(KqpOlapIndexes) { }; ExecuteSQL(query(resourceIds[idx], uids[idx], levels[idx]), "[[1u;]]"); } - AFL_VERIFY(csController->GetIndexesSkippingOnSelect().Val() - SkipStart > 1)("approved", csController->GetIndexesApprovedOnSelect().Val() - ApproveStart)( + AFL_VERIFY(csController->GetIndexesSkippingOnSelect().Val() - SkipStart)("approved", csController->GetIndexesApprovedOnSelect().Val() - ApproveStart)( "skipped", csController->GetIndexesSkippingOnSelect().Val() - SkipStart); } { ResetZeroLevel(csController); - ui32 requestsCount = 100; + ui32 requestsCount = 500; for (ui32 i = 0; i < requestsCount; ++i) { const ui32 idx = RandomNumber(uids.size()); const auto query = [](const TString& res, const TString& uid, const ui32 level) { @@ -511,13 +511,13 @@ Y_UNIT_TEST_SUITE(KqpOlapIndexes) { }; ExecuteSQL(query(resourceIds[idx], uids[idx], levels[idx]), "[[1u;]]"); } - AFL_VERIFY(csController->GetIndexesSkippingOnSelect().Val() - SkipStart > 1)( + AFL_VERIFY(csController->GetIndexesSkippingOnSelect().Val() - SkipStart)( "approved", csController->GetIndexesApprovedOnSelect().Val() - ApproveStart)( "skipped", csController->GetIndexesSkippingOnSelect().Val() - SkipStart); } { ResetZeroLevel(csController); - ui32 requestsCount = 100; + ui32 requestsCount = 500; for (ui32 i = 0; i < requestsCount; ++i) { const ui32 idx = RandomNumber(uids.size()); const auto query = [](const TString& res, const TString& uid, const ui32 level) { @@ -529,7 +529,7 @@ Y_UNIT_TEST_SUITE(KqpOlapIndexes) { }; ExecuteSQL(query(resourceIds[idx], uids[idx], levels[idx]), "[[1u;]]"); } - AFL_VERIFY(csController->GetIndexesSkippingOnSelect().Val() - SkipStart > 1)( + AFL_VERIFY(csController->GetIndexesSkippingOnSelect().Val() - SkipStart)( "approved", csController->GetIndexesApprovedOnSelect().Val() - ApproveStart)( "skipped", csController->GetIndexesSkippingOnSelect().Val() - SkipStart); } From 5d3f63b578cfa8b20510a1b02bf6b1ae25328404 Mon Sep 17 00:00:00 2001 From: ivanmorozov333 Date: Wed, 25 Dec 2024 15:06:10 +0300 Subject: [PATCH 03/11] speed up --- .../storage/indexes/bloom_ngramm/meta.cpp | 33 +++++++++++-------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/meta.cpp b/ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/meta.cpp index 9a4157421a33..e23d78eb687f 100644 --- a/ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/meta.cpp +++ b/ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/meta.cpp @@ -82,25 +82,30 @@ class TNGrammBuilder { template void BuildNGramms(const char* data, const ui32 dataSize, const std::optional op, const ui32 nGrammSize, const TAction& pred) const { + TBuffer fakeString; + AFL_VERIFY(nGrammSize >= 3)("value", nGrammSize); if (!op || op == NRequest::TLikePart::EOperation::StartsWith) { for (ui32 c = 1; c <= nGrammSize; ++c) { - TBuffer fakeStart; - fakeStart.Fill('\0', nGrammSize - c); - fakeStart.Append(data, std::min(c, dataSize)); - if (fakeStart.size() < nGrammSize) { - fakeStart.Append(Zeros.data(), nGrammSize - fakeStart.size()); + fakeString.Clear(); + fakeString.Fill('\0', nGrammSize - c); + fakeString.Append(data, std::min(c, dataSize)); + if (fakeString.size() < nGrammSize) { + fakeString.Fill('\0', nGrammSize - fakeString.size()); } - BuildHashesSet(CalcHash(fakeStart.data(), nGrammSize), pred); + BuildHashesSet(CalcHash(fakeString.data(), nGrammSize), pred); } } - for (ui32 c = 0; c < dataSize; ++c) { - if (c + nGrammSize <= dataSize) { - pred(CalcHash(data + c, nGrammSize)); - } else if (!op || op == NRequest::TLikePart::EOperation::EndsWith) { - TBuffer fakeStart; - fakeStart.Append(data + c, dataSize - c); - fakeStart.Append(Zeros.data(), nGrammSize - fakeStart.size()); - BuildHashesSet(CalcHash(fakeStart.data(), nGrammSize), pred); + ui32 c = 0; + for (; c + nGrammSize <= dataSize; ++c) { + pred(CalcHash(data + c, nGrammSize)); + } + + if (!op || op == NRequest::TLikePart::EOperation::EndsWith) { + for (; c < dataSize; ++c) { + fakeString.Clear(); + fakeString.Append(data + c, dataSize - c); + fakeString.Fill('\0', nGrammSize - fakeString.size()); + BuildHashesSet(CalcHash(fakeString.data(), nGrammSize), pred); } } } From 8da475e02accedf657d3e67b4a1eb0af88afd8ea Mon Sep 17 00:00:00 2001 From: ivanmorozov333 Date: Wed, 25 Dec 2024 15:23:22 +0300 Subject: [PATCH 04/11] speed up 0.6% --- .../columnshard/engines/storage/indexes/bloom_ngramm/meta.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/meta.cpp b/ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/meta.cpp index e23d78eb687f..28b158ab3483 100644 --- a/ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/meta.cpp +++ b/ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/meta.cpp @@ -153,8 +153,9 @@ TString TIndexMeta::DoBuildIndexImpl(TChunkedBatchReader& reader) const { TNGrammBuilder builder(HashesCount); std::vector bitsVector(FilterSizeBytes * 8, false); + bool* memAccessor = &bitsVector[0]; const auto predSet = [&](const ui64 hashSecondary) { - bitsVector[hashSecondary % (FilterSizeBytes * 8)] = true; + memAccessor[hashSecondary % (FilterSizeBytes * 8)] = true; }; for (reader.Start(); reader.IsCorrect();) { builder.FillNGrammHashes(NGrammSize, reader.begin()->GetCurrentChunk(), predSet); From fafb00dcfde713be261f21a78663d3f49325e91e Mon Sep 17 00:00:00 2001 From: ivanmorozov333 Date: Wed, 25 Dec 2024 16:38:53 +0300 Subject: [PATCH 05/11] speed up --- .../engines/storage/indexes/bloom_ngramm/meta.cpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/meta.cpp b/ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/meta.cpp index 28b158ab3483..514f1fc9dff7 100644 --- a/ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/meta.cpp +++ b/ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/meta.cpp @@ -66,9 +66,14 @@ class TNGrammBuilder { ui64 CalcHash(const char* data, const ui32 size) const { if (size == 3) { - return ((ui64)data[0]) | (((ui64)data[1]) << 8) | (((ui64)data[2]) << 16); + return (*(const ui32*)data) & 0x00FFFFFF; +// TStringBuilder sb; +// sb << res << "/" << (ui32)((ui8*)&res)[0] << "/" << (ui32)((ui8*)&res)[1] << "/" << (ui32)((ui8*)&res)[2] << "/" +// << (ui32)((ui8*)&res)[3] << " vs " << (ui64)data[0] << "/" << (((ui64)data[1])) << "/" << (((ui64)data[2])) << Endl; +// Cerr << sb; +// return (ui64(*(const ui32*)data)) >> 8; } else if (size == 4) { - return *(ui32*)&data[0]; + return *(const ui32*)data; } else { uint64_t h = 2166136261; for (size_t i = 0; i < size; i++) { From ea682c9a6d1ead5a95ec19a529478b56260240ae Mon Sep 17 00:00:00 2001 From: ivanmorozov333 Date: Wed, 25 Dec 2024 16:41:10 +0300 Subject: [PATCH 06/11] correct signals --- .../engines/reader/common_reader/iterator/fetching.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ydb/core/tx/columnshard/engines/reader/common_reader/iterator/fetching.h b/ydb/core/tx/columnshard/engines/reader/common_reader/iterator/fetching.h index 14ca43ec9960..bd972ec728ff 100644 --- a/ydb/core/tx/columnshard/engines/reader/common_reader/iterator/fetching.h +++ b/ydb/core/tx/columnshard/engines/reader/common_reader/iterator/fetching.h @@ -27,12 +27,12 @@ class TFetchingStepSignals: public NColumnShard::TCommonCountersOwner { public: TFetchingStepSignals(NColumnShard::TCommonCountersOwner&& owner) : TBase(std::move(owner)) - , DurationCounter(TBase::GetDeriviative("duration_ms")) - , BytesCounter(TBase::GetDeriviative("bytes_ms")) { + , DurationCounter(TBase::GetDeriviative("Duration/Us")) + , BytesCounter(TBase::GetDeriviative("Bytes/Count")) { } void AddDuration(const TDuration d) const { - DurationCounter->Add(d.MilliSeconds()); + DurationCounter->Add(d.MicroSeconds()); } void AddBytes(const ui32 v) const { From d10ed7735f75946225721c032957b70db604ff51 Mon Sep 17 00:00:00 2001 From: ivanmorozov333 Date: Wed, 25 Dec 2024 16:45:33 +0300 Subject: [PATCH 07/11] correction --- .../engines/reader/common_reader/iterator/fetching.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ydb/core/tx/columnshard/engines/reader/common_reader/iterator/fetching.h b/ydb/core/tx/columnshard/engines/reader/common_reader/iterator/fetching.h index bd972ec728ff..34b60a608f21 100644 --- a/ydb/core/tx/columnshard/engines/reader/common_reader/iterator/fetching.h +++ b/ydb/core/tx/columnshard/engines/reader/common_reader/iterator/fetching.h @@ -56,7 +56,7 @@ class TFetchingStepsSignalsCollection: public NColumnShard::TCommonCountersOwner public: TFetchingStepsSignalsCollection() - : TBase("scan_steps") { + : TBase("ScanSteps") { } static TFetchingStepSignals GetSignals(const TString& name) { From 9e3fc4381a3b0ed48a5f4491ad21ad139384efca Mon Sep 17 00:00:00 2001 From: ivanmorozov333 Date: Thu, 26 Dec 2024 11:16:26 +0300 Subject: [PATCH 08/11] correct hashes independency --- ydb/core/formats/arrow/permutations.cpp | 1 - .../engines/changes/compaction/merger.cpp | 2 +- .../engines/portions/read_with_blobs.cpp | 2 +- .../columnshard/engines/scheme/index_info.cpp | 4 +- .../columnshard/engines/scheme/index_info.h | 6 +- .../engines/scheme/indexes/abstract/meta.h | 7 +- .../engines/storage/indexes/bloom/checker.h | 7 +- .../engines/storage/indexes/bloom/meta.cpp | 34 +++---- .../engines/storage/indexes/bloom/meta.h | 2 +- .../storage/indexes/bloom_ngramm/const.cpp | 19 ++++ .../storage/indexes/bloom_ngramm/const.h | 31 +++++++ .../indexes/bloom_ngramm/constructor.cpp | 29 +++--- .../storage/indexes/bloom_ngramm/meta.cpp | 89 +++++++------------ .../storage/indexes/bloom_ngramm/meta.h | 2 +- .../storage/indexes/bloom_ngramm/ya.make | 1 + .../storage/indexes/count_min_sketch/meta.cpp | 2 +- .../storage/indexes/count_min_sketch/meta.h | 2 +- .../engines/storage/indexes/max/meta.cpp | 2 +- .../engines/storage/indexes/max/meta.h | 2 +- .../engines/storage/indexes/portions/meta.cpp | 8 +- .../engines/storage/indexes/portions/meta.h | 5 +- ydb/core/tx/columnshard/splitter/chunks.h | 3 +- ydb/library/formats/arrow/hash/xx_hash.cpp | 6 +- ydb/library/formats/arrow/hash/xx_hash.h | 2 +- 24 files changed, 146 insertions(+), 122 deletions(-) create mode 100644 ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/const.cpp create mode 100644 ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/const.h diff --git a/ydb/core/formats/arrow/permutations.cpp b/ydb/core/formats/arrow/permutations.cpp index 47e8037b8600..2a7804c918ef 100644 --- a/ydb/core/formats/arrow/permutations.cpp +++ b/ydb/core/formats/arrow/permutations.cpp @@ -11,7 +11,6 @@ #include #include -#include namespace NKikimr::NArrow { diff --git a/ydb/core/tx/columnshard/engines/changes/compaction/merger.cpp b/ydb/core/tx/columnshard/engines/changes/compaction/merger.cpp index 5de37fa61545..9df83a6cb2d4 100644 --- a/ydb/core/tx/columnshard/engines/changes/compaction/merger.cpp +++ b/ydb/core/tx/columnshard/engines/changes/compaction/merger.cpp @@ -147,7 +147,7 @@ std::vector TMerger::Execute(const std::shared for (auto&& i : packs) { TGeneralSerializedSlice slicePrimary(std::move(i)); auto dataWithSecondary = resultFiltered->GetIndexInfo() - .AppendIndexes(slicePrimary.GetPortionChunksToHash(), SaverContext.GetStoragesManager()) + .AppendIndexes(slicePrimary.GetPortionChunksToHash(), SaverContext.GetStoragesManager(), slicePrimary.GetRecordsCount()) .DetachResult(); TGeneralSerializedSlice slice(dataWithSecondary.GetExternalData(), schemaDetails, Context.Counters.SplitterCounters); diff --git a/ydb/core/tx/columnshard/engines/portions/read_with_blobs.cpp b/ydb/core/tx/columnshard/engines/portions/read_with_blobs.cpp index a79207c07acd..6c0409216080 100644 --- a/ydb/core/tx/columnshard/engines/portions/read_with_blobs.cpp +++ b/ydb/core/tx/columnshard/engines/portions/read_with_blobs.cpp @@ -124,7 +124,7 @@ std::optional TReadPortionInfoWithBlobs::SyncP TIndexInfo::TSecondaryData secondaryData; secondaryData.MutableExternalData() = entityChunksNew; for (auto&& i : to->GetIndexInfo().GetIndexes()) { - to->GetIndexInfo().AppendIndex(entityChunksNew, i.first, storages, secondaryData).Validate(); + to->GetIndexInfo().AppendIndex(entityChunksNew, i.first, storages, source.PortionInfo.GetPortionInfo().GetRecordsCount(), secondaryData).Validate(); } const NSplitter::TEntityGroups groups = source.PortionInfo.GetPortionInfo().GetEntityGroupsByStorageId(targetTier, *storages, to->GetIndexInfo()); diff --git a/ydb/core/tx/columnshard/engines/scheme/index_info.cpp b/ydb/core/tx/columnshard/engines/scheme/index_info.cpp index 29d40e0032ee..18fce9cf0624 100644 --- a/ydb/core/tx/columnshard/engines/scheme/index_info.cpp +++ b/ydb/core/tx/columnshard/engines/scheme/index_info.cpp @@ -412,11 +412,11 @@ std::shared_ptr TIndexInfo::GetColumnExternalDefaultValueVerified } NKikimr::TConclusionStatus TIndexInfo::AppendIndex(const THashMap>>& originalData, - const ui32 indexId, const std::shared_ptr& operators, TSecondaryData& result) const { + const ui32 indexId, const std::shared_ptr& operators, const ui32 recordsCount, TSecondaryData& result) const { auto it = Indexes.find(indexId); AFL_VERIFY(it != Indexes.end()); auto& index = it->second; - std::shared_ptr chunk = index->BuildIndex(originalData, *this); + std::shared_ptr chunk = index->BuildIndex(originalData, recordsCount, *this); auto opStorage = operators->GetOperatorVerified(index->GetStorageId()); if ((i64)chunk->GetPackedSize() > opStorage->GetBlobSplitSettings().GetMaxBlobSize()) { return TConclusionStatus::Fail("blob size for secondary data (" + ::ToString(indexId) + ") bigger than limit (" + diff --git a/ydb/core/tx/columnshard/engines/scheme/index_info.h b/ydb/core/tx/columnshard/engines/scheme/index_info.h index e4b3534cfd14..eaf4f63eb2ea 100644 --- a/ydb/core/tx/columnshard/engines/scheme/index_info.h +++ b/ydb/core/tx/columnshard/engines/scheme/index_info.h @@ -313,11 +313,11 @@ struct TIndexInfo: public IIndexInfo { }; [[nodiscard]] TConclusion AppendIndexes(const THashMap>>& primaryData, - const std::shared_ptr& operators) const { + const std::shared_ptr& operators, const ui32 recordsCount) const { TSecondaryData result; result.MutableExternalData() = primaryData; for (auto&& i : Indexes) { - auto conclusion = AppendIndex(primaryData, i.first, operators, result); + auto conclusion = AppendIndex(primaryData, i.first, operators, recordsCount, result); if (conclusion.IsFail()) { return conclusion; } @@ -329,7 +329,7 @@ struct TIndexInfo: public IIndexInfo { std::shared_ptr GetIndexMetaCountMinSketch(const std::set& columnIds) const; [[nodiscard]] TConclusionStatus AppendIndex(const THashMap>>& originalData, - const ui32 indexId, const std::shared_ptr& operators, TSecondaryData& result) const; + const ui32 indexId, const std::shared_ptr& operators, const ui32 recordsCount, TSecondaryData& result) const; /// Returns an id of the column located by name. The name should exists in the schema. ui32 GetColumnIdVerified(const std::string& name) const; diff --git a/ydb/core/tx/columnshard/engines/scheme/indexes/abstract/meta.h b/ydb/core/tx/columnshard/engines/scheme/indexes/abstract/meta.h index d5185cbca236..55c25ace4869 100644 --- a/ydb/core/tx/columnshard/engines/scheme/indexes/abstract/meta.h +++ b/ydb/core/tx/columnshard/engines/scheme/indexes/abstract/meta.h @@ -31,7 +31,8 @@ class IIndexMeta { YDB_READONLY(ui32, IndexId, 0); YDB_READONLY(TString, StorageId, IStoragesManager::DefaultStorageId); protected: - virtual std::shared_ptr DoBuildIndex(const THashMap>>& data, const TIndexInfo& indexInfo) const = 0; + virtual std::shared_ptr DoBuildIndex(const THashMap>>& data, + const ui32 recordsCount, const TIndexInfo& indexInfo) const = 0; virtual void DoFillIndexCheckers(const std::shared_ptr& info, const NSchemeShard::TOlapSchema& schema) const = 0; virtual bool DoDeserializeFromProto(const NKikimrSchemeOp::TOlapIndexDescription& proto) = 0; virtual void DoSerializeToProto(NKikimrSchemeOp::TOlapIndexDescription& proto) const = 0; @@ -67,8 +68,8 @@ class IIndexMeta { virtual ~IIndexMeta() = default; - std::shared_ptr BuildIndex(const THashMap>>& data, const TIndexInfo& indexInfo) const { - return DoBuildIndex(data, indexInfo); + std::shared_ptr BuildIndex(const THashMap>>& data, const ui32 recordsCount, const TIndexInfo& indexInfo) const { + return DoBuildIndex(data, recordsCount, indexInfo); } void FillIndexCheckers(const std::shared_ptr& info, const NSchemeShard::TOlapSchema& schema) const { diff --git a/ydb/core/tx/columnshard/engines/storage/indexes/bloom/checker.h b/ydb/core/tx/columnshard/engines/storage/indexes/bloom/checker.h index 23d46e21b557..38fc1031085e 100644 --- a/ydb/core/tx/columnshard/engines/storage/indexes/bloom/checker.h +++ b/ydb/core/tx/columnshard/engines/storage/indexes/bloom/checker.h @@ -11,6 +11,11 @@ class TFixStringBitsStorage { : Data(data) {} + static ui32 GrowBitsCountToByte(const ui32 bitsCount) { + const ui32 bytesCount = bitsCount / 8; + return (bytesCount + ((bitsCount % 8) ? 1 : 0)) * 8; + } + TFixStringBitsStorage(const std::vector& bitsVector) : TFixStringBitsStorage(bitsVector.size()) { ui32 byteIdx = 0; @@ -38,7 +43,7 @@ class TFixStringBitsStorage { } TFixStringBitsStorage(const ui32 sizeBits) - : Data(sizeBits / 8 + ((sizeBits % 8) ? 1 : 0), '\0') { + : Data(GrowBitsCountToByte(sizeBits) / 8, '\0') { } void Set(const bool val, const ui32 idx) { diff --git a/ydb/core/tx/columnshard/engines/storage/indexes/bloom/meta.cpp b/ydb/core/tx/columnshard/engines/storage/indexes/bloom/meta.cpp index e3e3cf7d4281..c0f0e565779a 100644 --- a/ydb/core/tx/columnshard/engines/storage/indexes/bloom/meta.cpp +++ b/ydb/core/tx/columnshard/engines/storage/indexes/bloom/meta.cpp @@ -10,26 +10,21 @@ namespace NKikimr::NOlap::NIndexes { -TString TBloomIndexMeta::DoBuildIndexImpl(TChunkedBatchReader& reader) const { - std::set hashes; - { - NArrow::NHash::NXX64::TStreamStringHashCalcer hashCalcer(0); +TString TBloomIndexMeta::DoBuildIndexImpl(TChunkedBatchReader& reader, const ui32 recordsCount) const { + const ui32 bitsCount = TFixStringBitsStorage::GrowBitsCountToByte(HashesCount * recordsCount / std::log(2)); + std::vector filterBits(bitsCount, false); + for (ui32 i = 0; i < HashesCount; ++i) { + NArrow::NHash::NXX64::TStreamStringHashCalcer hashCalcer(i); for (reader.Start(); reader.IsCorrect(); reader.ReadNext()) { hashCalcer.Start(); for (auto&& i : reader) { NArrow::NHash::TXX64::AppendField(i.GetCurrentChunk(), i.GetCurrentRecordIndex(), hashCalcer); } - hashes.emplace(hashCalcer.Finish()); + filterBits[hashCalcer.Finish() % bitsCount] = true; } } - const ui32 bitsCount = HashesCount * hashes.size() / std::log(2); - TFixStringBitsStorage bits(bitsCount); - const auto pred = [&bits](const ui64 hash) { - bits.Set(true, hash % bits.GetSizeBits()); - }; - BuildHashesSet(hashes, pred); - return bits.GetData(); + return TFixStringBitsStorage(filterBits).GetData(); } void TBloomIndexMeta::DoFillIndexCheckers(const std::shared_ptr& info, const NSchemeShard::TOlapSchema& schema) const { @@ -51,15 +46,14 @@ void TBloomIndexMeta::DoFillIndexCheckers(const std::shared_ptr hashes; - const auto pred = [&hashes](const ui64 hash) { - hashes.emplace(hash); - }; - NArrow::NHash::NXX64::TStreamStringHashCalcer calcer(0); - calcer.Start(); - for (auto&& i : foundColumns) { - NArrow::NHash::TXX64::AppendField(i.second, calcer); + for (ui32 i = 0; i < HashesCount; ++i) { + NArrow::NHash::NXX64::TStreamStringHashCalcer calcer(i); + calcer.Start(); + for (auto&& i : foundColumns) { + NArrow::NHash::TXX64::AppendField(i.second, calcer); + } + hashes.emplace(calcer.Finish()); } - BuildHashesSet(calcer.Finish(), pred); branch->MutableIndexes().emplace_back(std::make_shared(GetIndexId(), std::move(hashes))); } } diff --git a/ydb/core/tx/columnshard/engines/storage/indexes/bloom/meta.h b/ydb/core/tx/columnshard/engines/storage/indexes/bloom/meta.h index ac07cd4793de..cfd9bc85cf20 100644 --- a/ydb/core/tx/columnshard/engines/storage/indexes/bloom/meta.h +++ b/ydb/core/tx/columnshard/engines/storage/indexes/bloom/meta.h @@ -52,7 +52,7 @@ class TBloomIndexMeta: public TIndexByColumns { } virtual void DoFillIndexCheckers(const std::shared_ptr& info, const NSchemeShard::TOlapSchema& schema) const override; - virtual TString DoBuildIndexImpl(TChunkedBatchReader& reader) const override; + virtual TString DoBuildIndexImpl(TChunkedBatchReader& reader, const ui32 recordsCount) const override; virtual bool DoDeserializeFromProto(const NKikimrSchemeOp::TOlapIndexDescription& proto) override { AFL_VERIFY(TBase::DoDeserializeFromProto(proto)); diff --git a/ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/const.cpp b/ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/const.cpp new file mode 100644 index 000000000000..c6b4378157b1 --- /dev/null +++ b/ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/const.cpp @@ -0,0 +1,19 @@ +#include "const.h" + +#include + +namespace NKikimr::NOlap::NIndexes::NBloomNGramm { + +TString TConstants::GetHashesCountIntervalString() { + return TStringBuilder() << "[" << MinHashesCount << ", " << MaxHashesCount << "]"; +} + +TString TConstants::GetFilterSizeBytesIntervalString() { + return TStringBuilder() << "[" << MinFilterSizeBytes << ", " << MaxFilterSizeBytes << "]"; +} + +TString TConstants::GetNGrammSizeIntervalString() { + return TStringBuilder() << "[" << MinNGrammSize << ", " << MaxNGrammSize << "]"; +} + +} // namespace NKikimr::NOlap::NIndexes::NBloomNGramm diff --git a/ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/const.h b/ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/const.h new file mode 100644 index 000000000000..058cabb6588c --- /dev/null +++ b/ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/const.h @@ -0,0 +1,31 @@ +#pragma once +#include +namespace NKikimr::NOlap::NIndexes::NBloomNGramm { + +class TConstants { +public: + static const ui32 MinNGrammSize = 3; + static const ui32 MaxNGrammSize = 8; + static const ui32 MinHashesCount = 1; + static const ui32 MaxHashesCount = 8; + static const ui32 MinFilterSizeBytes = 128; + static const ui32 MaxFilterSizeBytes = 1 << 20; + + static bool CheckNGrammSize(const ui32 value) { + return MinNGrammSize <= value && value <= MaxNGrammSize; + } + + static bool CheckHashesCount(const ui32 value) { + return MinHashesCount <= value && value <= MaxHashesCount; + } + + static bool CheckFilterSizeBytes(const ui32 value) { + return MinFilterSizeBytes <= value && value <= MaxFilterSizeBytes; + } + + static TString GetHashesCountIntervalString(); + static TString GetFilterSizeBytesIntervalString(); + static TString GetNGrammSizeIntervalString(); +}; + +} // namespace NKikimr::NOlap::NIndexes::NBloomNGramm diff --git a/ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/constructor.cpp b/ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/constructor.cpp index e0068eeb21f5..5d43b0500dfb 100644 --- a/ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/constructor.cpp +++ b/ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/constructor.cpp @@ -1,3 +1,4 @@ +#include "const.h" #include "constructor.h" #include "meta.h" @@ -17,7 +18,7 @@ std::shared_ptr TIndexConstructor::DoCreateIndexMeta( HashesCount, FilterSizeBytes, NGrammSize); } -NKikimr::TConclusionStatus TIndexConstructor::DoDeserializeFromJson(const NJson::TJsonValue& jsonInfo) { +TConclusionStatus TIndexConstructor::DoDeserializeFromJson(const NJson::TJsonValue& jsonInfo) { if (!jsonInfo.Has("column_name")) { return TConclusionStatus::Fail("column_name have to be in bloom ngramm filter features"); } @@ -32,24 +33,26 @@ NKikimr::TConclusionStatus TIndexConstructor::DoDeserializeFromJson(const NJson: return TConclusionStatus::Fail("ngramm_size have to be in bloom filter features as uint field"); } NGrammSize = jsonInfo["ngramm_size"].GetUInteger(); - if (NGrammSize < 3 || NGrammSize > 10) { - return TConclusionStatus::Fail("ngramm_size have to be in bloom ngramm filter in interval [3, 10]"); + if (!TConstants::CheckNGrammSize(NGrammSize)) { + return TConclusionStatus::Fail("ngramm_size have to be in bloom ngramm filter in interval " + TConstants::GetNGrammSizeIntervalString()); } if (!jsonInfo["filter_size_bytes"].IsUInteger()) { return TConclusionStatus::Fail("filter_size_bytes have to be in bloom filter features as uint field"); } FilterSizeBytes = jsonInfo["filter_size_bytes"].GetUInteger(); - if (FilterSizeBytes < 128 || FilterSizeBytes > (1 << 20)) { - return TConclusionStatus::Fail("filter_size_bytes have to be in bloom ngramm filter in interval [128, 1Mb]"); + if (!TConstants::CheckFilterSizeBytes(FilterSizeBytes)) { + return TConclusionStatus::Fail( + "filter_size_bytes have to be in bloom ngramm filter in interval " + TConstants::GetFilterSizeBytesIntervalString()); } if (!jsonInfo["hashes_count"].IsUInteger()) { return TConclusionStatus::Fail("hashes_count have to be in bloom filter features as uint field"); } HashesCount = jsonInfo["hashes_count"].GetUInteger(); - if (HashesCount < 1 || HashesCount > 10) { - return TConclusionStatus::Fail("hashes_count have to be in bloom ngramm filter in interval [1, 10]"); + if (!TConstants::CheckHashesCount(HashesCount)) { + return TConclusionStatus::Fail( + "hashes_count have to be in bloom ngramm filter in interval " + TConstants::GetHashesCountIntervalString()); } return TConclusionStatus::Success(); } @@ -62,16 +65,16 @@ NKikimr::TConclusionStatus TIndexConstructor::DoDeserializeFromProto(const NKiki } auto& bFilter = proto.GetBloomNGrammFilter(); NGrammSize = bFilter.GetNGrammSize(); - if (NGrammSize < 3 || NGrammSize > 10) { - return TConclusionStatus::Fail("NGrammSize have to be in [3, 10]"); + if (!TConstants::CheckNGrammSize(NGrammSize)) { + return TConclusionStatus::Fail("NGrammSize have to be in " + TConstants::GetNGrammSizeIntervalString()); } FilterSizeBytes = bFilter.GetFilterSizeBytes(); - if (FilterSizeBytes < 128 || FilterSizeBytes > (1 << 20)) { - return TConclusionStatus::Fail("FilterSizeBytes have to be in [128, 1Mb]"); + if (!TConstants::CheckFilterSizeBytes(FilterSizeBytes)) { + return TConclusionStatus::Fail("FilterSizeBytes have to be in " + TConstants::GetFilterSizeBytesIntervalString()); } HashesCount = bFilter.GetHashesCount(); - if (HashesCount < 1 || HashesCount > 10) { - return TConclusionStatus::Fail("HashesCount size have to be in [3, 10]"); + if (!TConstants::CheckHashesCount(HashesCount)) { + return TConclusionStatus::Fail("HashesCount size have to be in " + TConstants::GetHashesCountIntervalString()); } ColumnName = bFilter.GetColumnName(); if (!ColumnName) { diff --git a/ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/meta.cpp b/ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/meta.cpp index 514f1fc9dff7..29829c9ee1ff 100644 --- a/ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/meta.cpp +++ b/ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/meta.cpp @@ -15,18 +15,16 @@ namespace NKikimr::NOlap::NIndexes::NBloomNGramm { class TNGrammBuilder { private: - TBuffer Zeros; const ui32 HashesCount; - static const ui64 HashesConstructorP = 9223372036854775783; - static const ui64 HashesConstructorA = 1; - template class THashesBuilder { public: template - static void Build(const ui64 originalHash, const TActor& actor) { - actor((HashesConstructorA * originalHash + HashIdx) % HashesConstructorP); + static void Build(const ui8* data, ui64& h, const TActor& actor) { + h = h ^ uint64_t(*data); + h = h * 16777619; + THashesBuilder::Build(data + 1, h, actor); } }; @@ -34,59 +32,39 @@ class TNGrammBuilder { class THashesBuilder<0> { public: template - static void Build(const ui64 /*originalHash*/, const TActor& /*actor*/) { + static void Build(const ui8* /*data*/, ui64& hash, const TActor& actor) { + actor(hash); } }; template - void BuildHashesSet(const ui64 originalHash, const TActor& actor) const { - if (HashesCount == 1) { - THashesBuilder<1>::Build(originalHash, actor); - } else if (HashesCount == 2) { - THashesBuilder<2>::Build(originalHash, actor); - } else if (HashesCount == 3) { - THashesBuilder<3>::Build(originalHash, actor); - } else if (HashesCount == 4) { - THashesBuilder<4>::Build(originalHash, actor); - } else if (HashesCount == 5) { - THashesBuilder<5>::Build(originalHash, actor); - } else if (HashesCount == 6) { - THashesBuilder<6>::Build(originalHash, actor); - } else if (HashesCount == 7) { - THashesBuilder<7>::Build(originalHash, actor); - } else if (HashesCount == 8) { - THashesBuilder<8>::Build(originalHash, actor); - } else { - for (ui32 b = 1; b <= HashesCount; ++b) { - const ui64 hash = (HashesConstructorA * originalHash + b) % HashesConstructorP; - actor(hash); - } - } - } - - ui64 CalcHash(const char* data, const ui32 size) const { - if (size == 3) { - return (*(const ui32*)data) & 0x00FFFFFF; -// TStringBuilder sb; -// sb << res << "/" << (ui32)((ui8*)&res)[0] << "/" << (ui32)((ui8*)&res)[1] << "/" << (ui32)((ui8*)&res)[2] << "/" -// << (ui32)((ui8*)&res)[3] << " vs " << (ui64)data[0] << "/" << (((ui64)data[1])) << "/" << (((ui64)data[2])) << Endl; -// Cerr << sb; -// return (ui64(*(const ui32*)data)) >> 8; - } else if (size == 4) { - return *(const ui32*)data; - } else { - uint64_t h = 2166136261; - for (size_t i = 0; i < size; i++) { - h = h ^ uint64_t(data[i]); - h = h * 16777619; + void BuildHashesSet(const ui8* data, const ui32 dataSize, const TActor& actor) { + for (ui32 i = 1; i <= HashesCount; ++i) { + ui64 hash = 2166136261 * i; + if (dataSize == 3) { + THashesBuilder<3>::Build(data, hash, actor); + } else if (dataSize == 4) { + THashesBuilder<4>::Build(data, hash, actor); + } else if (dataSize == 5) { + THashesBuilder<5>::Build(data, hash, actor); + } else if (dataSize == 6) { + THashesBuilder<6>::Build(data, hash, actor); + } else if (dataSize == 7) { + THashesBuilder<7>::Build(data, hash, actor); + } else if (dataSize == 8) { + THashesBuilder<8>::Build(data, hash, actor); + } else { + NArrow::NHash::NXX64::TStreamStringHashCalcer calcer(i); + calcer.Start(); + calcer.Update(data, dataSize); + actor(calcer.Finish()); } - return h; } } template void BuildNGramms(const char* data, const ui32 dataSize, const std::optional op, const ui32 nGrammSize, - const TAction& pred) const { + const TAction& pred) { TBuffer fakeString; AFL_VERIFY(nGrammSize >= 3)("value", nGrammSize); if (!op || op == NRequest::TLikePart::EOperation::StartsWith) { @@ -97,12 +75,12 @@ class TNGrammBuilder { if (fakeString.size() < nGrammSize) { fakeString.Fill('\0', nGrammSize - fakeString.size()); } - BuildHashesSet(CalcHash(fakeString.data(), nGrammSize), pred); + BuildHashesSet((const ui8*)fakeString.data(), nGrammSize, pred); } } ui32 c = 0; for (; c + nGrammSize <= dataSize; ++c) { - pred(CalcHash(data + c, nGrammSize)); + BuildHashesSet((const ui8*)(data + c), nGrammSize, pred); } if (!op || op == NRequest::TLikePart::EOperation::EndsWith) { @@ -110,17 +88,14 @@ class TNGrammBuilder { fakeString.Clear(); fakeString.Append(data + c, dataSize - c); fakeString.Fill('\0', nGrammSize - fakeString.size()); - BuildHashesSet(CalcHash(fakeString.data(), nGrammSize), pred); + BuildHashesSet((const ui8*)fakeString.data(), nGrammSize, pred); } } } public: TNGrammBuilder(const ui32 hashesCount) - : HashesCount(hashesCount) - { - AFL_VERIFY((ui64)HashesCount < HashesConstructorP); - Zeros.Fill('\0', 1024); + : HashesCount(hashesCount) { } template @@ -153,7 +128,7 @@ class TNGrammBuilder { } }; -TString TIndexMeta::DoBuildIndexImpl(TChunkedBatchReader& reader) const { +TString TIndexMeta::DoBuildIndexImpl(TChunkedBatchReader& reader, const ui32 /*recordsCount*/) const { AFL_VERIFY(reader.GetColumnsCount() == 1)("count", reader.GetColumnsCount()); TNGrammBuilder builder(HashesCount); diff --git a/ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/meta.h b/ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/meta.h index c548b958de55..562c0471f1a7 100644 --- a/ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/meta.h +++ b/ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/meta.h @@ -29,7 +29,7 @@ class TIndexMeta: public TIndexByColumns { } virtual void DoFillIndexCheckers(const std::shared_ptr& info, const NSchemeShard::TOlapSchema& schema) const override; - virtual TString DoBuildIndexImpl(TChunkedBatchReader& reader) const override; + virtual TString DoBuildIndexImpl(TChunkedBatchReader& reader, const ui32 recordsCount) const override; virtual bool DoDeserializeFromProto(const NKikimrSchemeOp::TOlapIndexDescription& proto) override { AFL_VERIFY(TBase::DoDeserializeFromProto(proto)); diff --git a/ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/ya.make b/ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/ya.make index bcba53e477ae..ef15149eb3ba 100644 --- a/ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/ya.make +++ b/ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/ya.make @@ -4,6 +4,7 @@ SRCS( GLOBAL constructor.cpp GLOBAL meta.cpp GLOBAL checker.cpp + const.cpp ) PEERDIR( diff --git a/ydb/core/tx/columnshard/engines/storage/indexes/count_min_sketch/meta.cpp b/ydb/core/tx/columnshard/engines/storage/indexes/count_min_sketch/meta.cpp index 9e78232d00fc..c48b99969640 100644 --- a/ydb/core/tx/columnshard/engines/storage/indexes/count_min_sketch/meta.cpp +++ b/ydb/core/tx/columnshard/engines/storage/indexes/count_min_sketch/meta.cpp @@ -11,7 +11,7 @@ namespace NKikimr::NOlap::NIndexes::NCountMinSketch { -TString TIndexMeta::DoBuildIndexImpl(TChunkedBatchReader& reader) const { +TString TIndexMeta::DoBuildIndexImpl(TChunkedBatchReader& reader, const ui32 /*recordsCount*/) const { auto sketch = std::unique_ptr(TCountMinSketch::Create()); for (auto& colReader : reader) { diff --git a/ydb/core/tx/columnshard/engines/storage/indexes/count_min_sketch/meta.h b/ydb/core/tx/columnshard/engines/storage/indexes/count_min_sketch/meta.h index 2c23af1fefdb..cb7abe56c614 100644 --- a/ydb/core/tx/columnshard/engines/storage/indexes/count_min_sketch/meta.h +++ b/ydb/core/tx/columnshard/engines/storage/indexes/count_min_sketch/meta.h @@ -25,7 +25,7 @@ class TIndexMeta: public TIndexByColumns { virtual void DoFillIndexCheckers(const std::shared_ptr& info, const NSchemeShard::TOlapSchema& schema) const override; - virtual TString DoBuildIndexImpl(TChunkedBatchReader& reader) const override; + virtual TString DoBuildIndexImpl(TChunkedBatchReader& reader, const ui32 recordsCount) const override; virtual bool DoDeserializeFromProto(const NKikimrSchemeOp::TOlapIndexDescription& proto) override { AFL_VERIFY(TBase::DoDeserializeFromProto(proto)); diff --git a/ydb/core/tx/columnshard/engines/storage/indexes/max/meta.cpp b/ydb/core/tx/columnshard/engines/storage/indexes/max/meta.cpp index ac004a9ebabe..20cd31857c7a 100644 --- a/ydb/core/tx/columnshard/engines/storage/indexes/max/meta.cpp +++ b/ydb/core/tx/columnshard/engines/storage/indexes/max/meta.cpp @@ -9,7 +9,7 @@ namespace NKikimr::NOlap::NIndexes::NMax { -TString TIndexMeta::DoBuildIndexImpl(TChunkedBatchReader& reader) const { +TString TIndexMeta::DoBuildIndexImpl(TChunkedBatchReader& reader, const ui32 /*recordsCount*/) const { std::shared_ptr result; AFL_VERIFY(reader.GetColumnsCount() == 1)("count", reader.GetColumnsCount()); { diff --git a/ydb/core/tx/columnshard/engines/storage/indexes/max/meta.h b/ydb/core/tx/columnshard/engines/storage/indexes/max/meta.h index 8d760e184283..4c2705bc672c 100644 --- a/ydb/core/tx/columnshard/engines/storage/indexes/max/meta.h +++ b/ydb/core/tx/columnshard/engines/storage/indexes/max/meta.h @@ -19,7 +19,7 @@ class TIndexMeta: public TIndexByColumns { virtual void DoFillIndexCheckers( const std::shared_ptr& info, const NSchemeShard::TOlapSchema& schema) const override; - virtual TString DoBuildIndexImpl(TChunkedBatchReader& reader) const override; + virtual TString DoBuildIndexImpl(TChunkedBatchReader& reader, const ui32 recordsCount) const override; virtual bool DoDeserializeFromProto(const NKikimrSchemeOp::TOlapIndexDescription& proto) override { AFL_VERIFY(TBase::DoDeserializeFromProto(proto)); diff --git a/ydb/core/tx/columnshard/engines/storage/indexes/portions/meta.cpp b/ydb/core/tx/columnshard/engines/storage/indexes/portions/meta.cpp index 6fe0e1f2ef11..00c024063584 100644 --- a/ydb/core/tx/columnshard/engines/storage/indexes/portions/meta.cpp +++ b/ydb/core/tx/columnshard/engines/storage/indexes/portions/meta.cpp @@ -6,7 +6,7 @@ namespace NKikimr::NOlap::NIndexes { std::shared_ptr TIndexByColumns::DoBuildIndex( - const THashMap>>& data, const TIndexInfo& indexInfo) const { + const THashMap>>& data, const ui32 recordsCount, const TIndexInfo& indexInfo) const { AFL_VERIFY(Serializer); AFL_VERIFY(data.size()); std::vector columnReaders; @@ -15,12 +15,8 @@ std::shared_ptr TIndexByColumns::DoBuildIndex AFL_VERIFY(it != data.end()); columnReaders.emplace_back(it->second, indexInfo.GetColumnLoaderVerified(i)); } - ui32 recordsCount = 0; - for (auto&& i : data.begin()->second) { - recordsCount += i->GetRecordsCountVerified(); - } TChunkedBatchReader reader(std::move(columnReaders)); - const TString indexData = DoBuildIndexImpl(reader); + const TString indexData = DoBuildIndexImpl(reader, recordsCount); return std::make_shared(TChunkAddress(GetIndexId(), 0), recordsCount, indexData.size(), indexData); } diff --git a/ydb/core/tx/columnshard/engines/storage/indexes/portions/meta.h b/ydb/core/tx/columnshard/engines/storage/indexes/portions/meta.h index 5356d5c4302d..f8e601c80fca 100644 --- a/ydb/core/tx/columnshard/engines/storage/indexes/portions/meta.h +++ b/ydb/core/tx/columnshard/engines/storage/indexes/portions/meta.h @@ -13,9 +13,10 @@ class TIndexByColumns: public IIndexMeta { protected: std::set ColumnIds; - virtual TString DoBuildIndexImpl(TChunkedBatchReader& reader) const = 0; + virtual TString DoBuildIndexImpl(TChunkedBatchReader& reader, const ui32 recordsCount) const = 0; - virtual std::shared_ptr DoBuildIndex(const THashMap>>& data, const TIndexInfo& indexInfo) const override final; + virtual std::shared_ptr DoBuildIndex(const THashMap>>& data, + const ui32 recordsCount, const TIndexInfo& indexInfo) const override final; virtual bool DoDeserializeFromProto(const NKikimrSchemeOp::TOlapIndexDescription& proto) override; TConclusionStatus CheckSameColumnsForModification(const IIndexMeta& newMeta) const; diff --git a/ydb/core/tx/columnshard/splitter/chunks.h b/ydb/core/tx/columnshard/splitter/chunks.h index 35063d0ae808..315fc18604e8 100644 --- a/ydb/core/tx/columnshard/splitter/chunks.h +++ b/ydb/core/tx/columnshard/splitter/chunks.h @@ -129,8 +129,7 @@ class TChunkedBatchReader { bool IsCorrectFlag = true; public: TChunkedBatchReader(const std::vector& columnReaders) - : Columns(columnReaders) - { + : Columns(columnReaders) { AFL_VERIFY(Columns.size()); for (auto&& i : Columns) { AFL_VERIFY(i.IsCorrect()); diff --git a/ydb/library/formats/arrow/hash/xx_hash.cpp b/ydb/library/formats/arrow/hash/xx_hash.cpp index bc69a160f535..7f13ff13658c 100644 --- a/ydb/library/formats/arrow/hash/xx_hash.cpp +++ b/ydb/library/formats/arrow/hash/xx_hash.cpp @@ -3,15 +3,15 @@ namespace NKikimr::NArrow::NHash::NXX64 { void TStreamStringHashCalcer::Start() { - XXH64_reset(&HashState, Seed); + XXH3_64bits_reset_withSeed(&HashState, Seed); } void TStreamStringHashCalcer::Update(const ui8* data, const ui32 size) { - XXH64_update(&HashState, data, size); + XXH3_64bits_update(&HashState, data, size); } ui64 TStreamStringHashCalcer::Finish() { - return XXH64_digest(&HashState); + return XXH3_64bits_digest(&HashState); } } diff --git a/ydb/library/formats/arrow/hash/xx_hash.h b/ydb/library/formats/arrow/hash/xx_hash.h index 25903f47c6d5..bb82a82c49c4 100644 --- a/ydb/library/formats/arrow/hash/xx_hash.h +++ b/ydb/library/formats/arrow/hash/xx_hash.h @@ -10,7 +10,7 @@ namespace NKikimr::NArrow::NHash::NXX64 { class TStreamStringHashCalcer { private: const ui64 Seed; - XXH64_state_t HashState; + XXH3_state_t HashState; public: TStreamStringHashCalcer(const ui64 seed) : Seed(seed) { From 934720e49d0cdc666cd2073db64e07fd355f86f4 Mon Sep 17 00:00:00 2001 From: ivanmorozov333 Date: Thu, 26 Dec 2024 12:38:23 +0300 Subject: [PATCH 09/11] improve filters --- ydb/core/kqp/ut/olap/indexes_ut.cpp | 6 +- .../storage/indexes/bloom_ngramm/const.h | 12 +- .../storage/indexes/bloom_ngramm/meta.cpp | 154 ++++++++++++------ 3 files changed, 111 insertions(+), 61 deletions(-) diff --git a/ydb/core/kqp/ut/olap/indexes_ut.cpp b/ydb/core/kqp/ut/olap/indexes_ut.cpp index be8e4e7eae8c..56361a050ecf 100644 --- a/ydb/core/kqp/ut/olap/indexes_ut.cpp +++ b/ydb/core/kqp/ut/olap/indexes_ut.cpp @@ -482,7 +482,7 @@ Y_UNIT_TEST_SUITE(KqpOlapIndexes) { } { ResetZeroLevel(csController); - ui32 requestsCount = 500; + ui32 requestsCount = 300; for (ui32 i = 0; i < requestsCount; ++i) { const ui32 idx = RandomNumber(uids.size()); const auto query = [](const TString& res, const TString& uid, const ui32 level) { @@ -499,7 +499,7 @@ Y_UNIT_TEST_SUITE(KqpOlapIndexes) { } { ResetZeroLevel(csController); - ui32 requestsCount = 500; + ui32 requestsCount = 300; for (ui32 i = 0; i < requestsCount; ++i) { const ui32 idx = RandomNumber(uids.size()); const auto query = [](const TString& res, const TString& uid, const ui32 level) { @@ -517,7 +517,7 @@ Y_UNIT_TEST_SUITE(KqpOlapIndexes) { } { ResetZeroLevel(csController); - ui32 requestsCount = 500; + ui32 requestsCount = 300; for (ui32 i = 0; i < requestsCount; ++i) { const ui32 idx = RandomNumber(uids.size()); const auto query = [](const TString& res, const TString& uid, const ui32 level) { diff --git a/ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/const.h b/ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/const.h index 058cabb6588c..1c0e56028806 100644 --- a/ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/const.h +++ b/ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/const.h @@ -4,12 +4,12 @@ namespace NKikimr::NOlap::NIndexes::NBloomNGramm { class TConstants { public: - static const ui32 MinNGrammSize = 3; - static const ui32 MaxNGrammSize = 8; - static const ui32 MinHashesCount = 1; - static const ui32 MaxHashesCount = 8; - static const ui32 MinFilterSizeBytes = 128; - static const ui32 MaxFilterSizeBytes = 1 << 20; + static constexpr ui32 MinNGrammSize = 3; + static constexpr ui32 MaxNGrammSize = 8; + static constexpr ui32 MinHashesCount = 1; + static constexpr ui32 MaxHashesCount = 8; + static constexpr ui32 MinFilterSizeBytes = 128; + static constexpr ui32 MaxFilterSizeBytes = 1 << 20; static bool CheckNGrammSize(const ui32 value) { return MinNGrammSize <= value && value <= MaxNGrammSize; diff --git a/ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/meta.cpp b/ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/meta.cpp index 29829c9ee1ff..4728b269baa4 100644 --- a/ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/meta.cpp +++ b/ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/meta.cpp @@ -1,4 +1,5 @@ #include "checker.h" +#include "const.h" #include "meta.h" #include @@ -17,80 +18,129 @@ class TNGrammBuilder { private: const ui32 HashesCount; - template + template class THashesBuilder { public: - template - static void Build(const ui8* data, ui64& h, const TActor& actor) { + static ui64 Build(const ui8* data, ui64& h) { h = h ^ uint64_t(*data); h = h * 16777619; - THashesBuilder::Build(data + 1, h, actor); + return THashesBuilder::Build(data + 1, h); } }; template <> class THashesBuilder<0> { + public: + static ui64 Build(const ui8* /*data*/, ui64& hash) { + return hash; + } + }; + + template + class THashesCountSelector { public: template - static void Build(const ui8* /*data*/, ui64& hash, const TActor& actor) { - actor(hash); + static void BuildHashes(const ui8* data, const TActor& actor) { + ui64 hash = (ui64)2166136261 * (ui64)HashIdx; + actor(THashesBuilder::Build(data, hash)); + THashesCountSelector::BuildHashes(data, actor); } }; - template - void BuildHashesSet(const ui8* data, const ui32 dataSize, const TActor& actor) { - for (ui32 i = 1; i <= HashesCount; ++i) { - ui64 hash = 2166136261 * i; - if (dataSize == 3) { - THashesBuilder<3>::Build(data, hash, actor); - } else if (dataSize == 4) { - THashesBuilder<4>::Build(data, hash, actor); - } else if (dataSize == 5) { - THashesBuilder<5>::Build(data, hash, actor); - } else if (dataSize == 6) { - THashesBuilder<6>::Build(data, hash, actor); - } else if (dataSize == 7) { - THashesBuilder<7>::Build(data, hash, actor); - } else if (dataSize == 8) { - THashesBuilder<8>::Build(data, hash, actor); - } else { - NArrow::NHash::NXX64::TStreamStringHashCalcer calcer(i); - calcer.Start(); - calcer.Update(data, dataSize); - actor(calcer.Finish()); - } + template + class THashesCountSelector<0, CharsCount> { + public: + template + static void BuildHashes(const ui8* /*data*/, const TActor& /*actor*/) { } - } + }; - template - void BuildNGramms(const char* data, const ui32 dataSize, const std::optional op, const ui32 nGrammSize, - const TAction& pred) { - TBuffer fakeString; - AFL_VERIFY(nGrammSize >= 3)("value", nGrammSize); - if (!op || op == NRequest::TLikePart::EOperation::StartsWith) { - for (ui32 c = 1; c <= nGrammSize; ++c) { - fakeString.Clear(); - fakeString.Fill('\0', nGrammSize - c); - fakeString.Append(data, std::min(c, dataSize)); - if (fakeString.size() < nGrammSize) { - fakeString.Fill('\0', nGrammSize - fakeString.size()); + template + class THashesSelector { + private: + template + static void BuildHashesImpl( + const ui8* data, const ui32 dataSize, const std::optional op, const TActor& actor) { + TBuffer fakeString; + if (!op || op == NRequest::TLikePart::EOperation::StartsWith) { + for (ui32 c = 1; c <= CharsCount; ++c) { + fakeString.Clear(); + fakeString.Fill('\0', CharsCount - c); + fakeString.Append((const char*)data, std::min((ui32)c, dataSize)); + if (fakeString.size() < CharsCount) { + fakeString.Fill('\0', CharsCount - fakeString.size()); + } + THashesCountSelector::BuildHashes((const ui8*)fakeString.data(), actor); + } + } + ui32 c = 0; + for (; c + CharsCount <= dataSize; ++c) { + THashesCountSelector::BuildHashes(data + c, actor); + } + if (!op || op == NRequest::TLikePart::EOperation::EndsWith) { + for (; c < dataSize; ++c) { + fakeString.Clear(); + fakeString.Append((const char*)data + c, dataSize - c); + fakeString.Fill('\0', CharsCount - fakeString.size()); + THashesCountSelector::BuildHashes((const ui8*)fakeString.data(), actor); } - BuildHashesSet((const ui8*)fakeString.data(), nGrammSize, pred); } - } - ui32 c = 0; - for (; c + nGrammSize <= dataSize; ++c) { - BuildHashesSet((const ui8*)(data + c), nGrammSize, pred); } - if (!op || op == NRequest::TLikePart::EOperation::EndsWith) { - for (; c < dataSize; ++c) { - fakeString.Clear(); - fakeString.Append(data + c, dataSize - c); - fakeString.Fill('\0', nGrammSize - fakeString.size()); - BuildHashesSet((const ui8*)fakeString.data(), nGrammSize, pred); + public: + template + static void BuildHashes(const ui8* data, const ui32 dataSize, const ui32 hashesCount, const ui32 nGrammSize, + const std::optional op, const TActor& actor) { + if (HashesCount == hashesCount && CharsCount == nGrammSize) { + BuildHashesImpl(data, dataSize, op, actor); + } else if (HashesCount > hashesCount && CharsCount > nGrammSize) { + THashesSelector::BuildHashes(data, dataSize, hashesCount, nGrammSize, op, actor); + } else if (HashesCount > hashesCount) { + THashesSelector::BuildHashes(data, dataSize, hashesCount, nGrammSize, op, actor); + } else if (CharsCount > nGrammSize) { + THashesSelector::BuildHashes(data, dataSize, hashesCount, nGrammSize, op, actor); + } else { + AFL_VERIFY(false); } } + }; + + + template + class THashesSelector<0, CharsCount> { + public: + template + static void BuildHashes(const ui8* /*data*/, const ui32 /*dataSize*/, const ui32 /*hashesCount*/, const ui32 /*nGrammSize*/, + const std::optional /*op*/, const TActor& /*actor*/) { + AFL_VERIFY(false); + } + }; + + template + class THashesSelector { + public: + template + static void BuildHashes(const ui8* /*data*/, const ui32 /*dataSize*/, const ui32 /*hashesCount*/, const ui32 /*nGrammSize*/, + const std::optional /*op*/, const TActor& /*actor*/) { + AFL_VERIFY(false); + } + }; + + template <> + class THashesSelector<0, 0> { + public: + template + static void BuildHashes(const ui8* /*data*/, const ui32 /*dataSize*/, const ui32 /*hashesCount*/, const ui32 /*nGrammSize*/, + const std::optional /*op*/, const TActor& /*actor*/) { + AFL_VERIFY(false); + } + }; + + template + void BuildNGramms(const char* data, const ui32 dataSize, const std::optional op, const ui32 nGrammSize, + const TAction& pred) { + THashesSelector::BuildHashes( + (const ui8*)data, dataSize, HashesCount, nGrammSize, op, pred); } public: From a86f731990ebbbcf1ab65d8ce5fdc3ca023ec3e2 Mon Sep 17 00:00:00 2001 From: ivanmorozov333 Date: Thu, 26 Dec 2024 13:11:38 +0300 Subject: [PATCH 10/11] fixes --- .../engines/storage/indexes/bloom/meta.cpp | 4 +- .../storage/indexes/bloom_ngramm/meta.cpp | 46 ++++++++++++------- ydb/library/formats/arrow/hash/xx_hash.cpp | 18 ++++++-- ydb/library/formats/arrow/hash/xx_hash.h | 16 ++++++- 4 files changed, 61 insertions(+), 23 deletions(-) diff --git a/ydb/core/tx/columnshard/engines/storage/indexes/bloom/meta.cpp b/ydb/core/tx/columnshard/engines/storage/indexes/bloom/meta.cpp index c0f0e565779a..09b09e21dcf6 100644 --- a/ydb/core/tx/columnshard/engines/storage/indexes/bloom/meta.cpp +++ b/ydb/core/tx/columnshard/engines/storage/indexes/bloom/meta.cpp @@ -14,7 +14,7 @@ TString TBloomIndexMeta::DoBuildIndexImpl(TChunkedBatchReader& reader, const ui3 const ui32 bitsCount = TFixStringBitsStorage::GrowBitsCountToByte(HashesCount * recordsCount / std::log(2)); std::vector filterBits(bitsCount, false); for (ui32 i = 0; i < HashesCount; ++i) { - NArrow::NHash::NXX64::TStreamStringHashCalcer hashCalcer(i); + NArrow::NHash::NXX64::TStreamStringHashCalcer_H3 hashCalcer(i); for (reader.Start(); reader.IsCorrect(); reader.ReadNext()) { hashCalcer.Start(); for (auto&& i : reader) { @@ -47,7 +47,7 @@ void TBloomIndexMeta::DoFillIndexCheckers(const std::shared_ptr hashes; for (ui32 i = 0; i < HashesCount; ++i) { - NArrow::NHash::NXX64::TStreamStringHashCalcer calcer(i); + NArrow::NHash::NXX64::TStreamStringHashCalcer_H3 calcer(i); calcer.Start(); for (auto&& i : foundColumns) { NArrow::NHash::TXX64::AppendField(i.second, calcer); diff --git a/ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/meta.cpp b/ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/meta.cpp index 4728b269baa4..8511a7914a8a 100644 --- a/ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/meta.cpp +++ b/ydb/core/tx/columnshard/engines/storage/indexes/bloom_ngramm/meta.cpp @@ -40,7 +40,7 @@ class TNGrammBuilder { class THashesCountSelector { public: template - static void BuildHashes(const ui8* data, const TActor& actor) { + static void BuildHashes(const ui8* data, TActor& actor) { ui64 hash = (ui64)2166136261 * (ui64)HashIdx; actor(THashesBuilder::Build(data, hash)); THashesCountSelector::BuildHashes(data, actor); @@ -51,7 +51,7 @@ class TNGrammBuilder { class THashesCountSelector<0, CharsCount> { public: template - static void BuildHashes(const ui8* /*data*/, const TActor& /*actor*/) { + static void BuildHashes(const ui8* /*data*/, TActor& /*actor*/) { } }; @@ -60,7 +60,7 @@ class TNGrammBuilder { private: template static void BuildHashesImpl( - const ui8* data, const ui32 dataSize, const std::optional op, const TActor& actor) { + const ui8* data, const ui32 dataSize, const std::optional op, TActor& actor) { TBuffer fakeString; if (!op || op == NRequest::TLikePart::EOperation::StartsWith) { for (ui32 c = 1; c <= CharsCount; ++c) { @@ -90,7 +90,7 @@ class TNGrammBuilder { public: template static void BuildHashes(const ui8* data, const ui32 dataSize, const ui32 hashesCount, const ui32 nGrammSize, - const std::optional op, const TActor& actor) { + const std::optional op, TActor& actor) { if (HashesCount == hashesCount && CharsCount == nGrammSize) { BuildHashesImpl(data, dataSize, op, actor); } else if (HashesCount > hashesCount && CharsCount > nGrammSize) { @@ -105,13 +105,12 @@ class TNGrammBuilder { } }; - template class THashesSelector<0, CharsCount> { public: template static void BuildHashes(const ui8* /*data*/, const ui32 /*dataSize*/, const ui32 /*hashesCount*/, const ui32 /*nGrammSize*/, - const std::optional /*op*/, const TActor& /*actor*/) { + const std::optional /*op*/, TActor& /*actor*/) { AFL_VERIFY(false); } }; @@ -121,7 +120,7 @@ class TNGrammBuilder { public: template static void BuildHashes(const ui8* /*data*/, const ui32 /*dataSize*/, const ui32 /*hashesCount*/, const ui32 /*nGrammSize*/, - const std::optional /*op*/, const TActor& /*actor*/) { + const std::optional /*op*/, TActor& /*actor*/) { AFL_VERIFY(false); } }; @@ -131,14 +130,14 @@ class TNGrammBuilder { public: template static void BuildHashes(const ui8* /*data*/, const ui32 /*dataSize*/, const ui32 /*hashesCount*/, const ui32 /*nGrammSize*/, - const std::optional /*op*/, const TActor& /*actor*/) { + const std::optional /*op*/, TActor& /*actor*/) { AFL_VERIFY(false); } }; template - void BuildNGramms(const char* data, const ui32 dataSize, const std::optional op, const ui32 nGrammSize, - const TAction& pred) { + void BuildNGramms( + const char* data, const ui32 dataSize, const std::optional op, const ui32 nGrammSize, TAction& pred) { THashesSelector::BuildHashes( (const ui8*)data, dataSize, HashesCount, nGrammSize, op, pred); } @@ -149,7 +148,7 @@ class TNGrammBuilder { } template - void FillNGrammHashes(const ui32 nGrammSize, const std::shared_ptr& array, const TFiller& fillData) { + void FillNGrammHashes(const ui32 nGrammSize, const std::shared_ptr& array, TFiller& fillData) { AFL_VERIFY(array->type_id() == arrow::utf8()->id())("id", array->type()->ToString()); NArrow::SwitchType(array->type_id(), [&](const auto& type) { using TWrap = std::decay_t; @@ -173,22 +172,35 @@ class TNGrammBuilder { } template - void FillNGrammHashes(const ui32 nGrammSize, const NRequest::TLikePart::EOperation op, const TString& userReq, const TFiller& fillData) { + void FillNGrammHashes(const ui32 nGrammSize, const NRequest::TLikePart::EOperation op, const TString& userReq, TFiller& fillData) { BuildNGramms(userReq.data(), userReq.size(), op, nGrammSize, fillData); } }; +class TVectorInserter { +private: + bool* Values; + const ui32 Size; + +public: + TVectorInserter(std::vector& values) + : Values(&values[0]) + , Size(values.size()) { + } + + void operator()(const ui64 hash) { + Values[hash % Size] = true; + } +}; + TString TIndexMeta::DoBuildIndexImpl(TChunkedBatchReader& reader, const ui32 /*recordsCount*/) const { AFL_VERIFY(reader.GetColumnsCount() == 1)("count", reader.GetColumnsCount()); TNGrammBuilder builder(HashesCount); std::vector bitsVector(FilterSizeBytes * 8, false); - bool* memAccessor = &bitsVector[0]; - const auto predSet = [&](const ui64 hashSecondary) { - memAccessor[hashSecondary % (FilterSizeBytes * 8)] = true; - }; + TVectorInserter inserter(bitsVector); for (reader.Start(); reader.IsCorrect();) { - builder.FillNGrammHashes(NGrammSize, reader.begin()->GetCurrentChunk(), predSet); + builder.FillNGrammHashes(NGrammSize, reader.begin()->GetCurrentChunk(), inserter); reader.ReadNext(reader.begin()->GetCurrentChunk()->length()); } return TFixStringBitsStorage(bitsVector).GetData(); diff --git a/ydb/library/formats/arrow/hash/xx_hash.cpp b/ydb/library/formats/arrow/hash/xx_hash.cpp index 7f13ff13658c..d6b7b654857c 100644 --- a/ydb/library/formats/arrow/hash/xx_hash.cpp +++ b/ydb/library/formats/arrow/hash/xx_hash.cpp @@ -2,16 +2,28 @@ namespace NKikimr::NArrow::NHash::NXX64 { -void TStreamStringHashCalcer::Start() { +void TStreamStringHashCalcer_H3::Start() { XXH3_64bits_reset_withSeed(&HashState, Seed); } -void TStreamStringHashCalcer::Update(const ui8* data, const ui32 size) { +void TStreamStringHashCalcer_H3::Update(const ui8* data, const ui32 size) { XXH3_64bits_update(&HashState, data, size); } -ui64 TStreamStringHashCalcer::Finish() { +ui64 TStreamStringHashCalcer_H3::Finish() { return XXH3_64bits_digest(&HashState); } +void TStreamStringHashCalcer::Start() { + XXH64_reset(&HashState, Seed); +} + +void TStreamStringHashCalcer::Update(const ui8* data, const ui32 size) { + XXH64_update(&HashState, data, size); +} + +ui64 TStreamStringHashCalcer::Finish() { + return XXH64_digest(&HashState); +} + } diff --git a/ydb/library/formats/arrow/hash/xx_hash.h b/ydb/library/formats/arrow/hash/xx_hash.h index bb82a82c49c4..d0cfed68b549 100644 --- a/ydb/library/formats/arrow/hash/xx_hash.h +++ b/ydb/library/formats/arrow/hash/xx_hash.h @@ -10,7 +10,7 @@ namespace NKikimr::NArrow::NHash::NXX64 { class TStreamStringHashCalcer { private: const ui64 Seed; - XXH3_state_t HashState; + XXH64_state_t HashState; public: TStreamStringHashCalcer(const ui64 seed) : Seed(seed) { @@ -21,4 +21,18 @@ class TStreamStringHashCalcer { ui64 Finish(); }; +class TStreamStringHashCalcer_H3 { +private: + const ui64 Seed; + XXH3_state_t HashState; +public: + TStreamStringHashCalcer_H3(const ui64 seed) + : Seed(seed) { + } + + void Start(); + void Update(const ui8* data, const ui32 size); + ui64 Finish(); +}; + } From ea59201db669d24bf9549609ce42b946cf4a8961 Mon Sep 17 00:00:00 2001 From: ivanmorozov333 Date: Thu, 26 Dec 2024 13:17:24 +0300 Subject: [PATCH 11/11] fixes --- ydb/core/formats/arrow/hash/calcer.cpp | 25 +++++++++++++++++++++++-- ydb/core/formats/arrow/hash/calcer.h | 2 ++ 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/ydb/core/formats/arrow/hash/calcer.cpp b/ydb/core/formats/arrow/hash/calcer.cpp index d5fa4a8dd6a3..8667f5b11107 100644 --- a/ydb/core/formats/arrow/hash/calcer.cpp +++ b/ydb/core/formats/arrow/hash/calcer.cpp @@ -9,7 +9,9 @@ namespace NKikimr::NArrow::NHash { -void TXX64::AppendField(const std::shared_ptr& scalar, NXX64::TStreamStringHashCalcer& hashCalcer) { +namespace { +template +void AppendFieldImpl(const std::shared_ptr& scalar, TStreamCalcer& hashCalcer) { AFL_VERIFY(scalar); NArrow::SwitchType(scalar->type->id(), [&](const auto& type) { using TWrap = std::decay_t; @@ -28,7 +30,8 @@ void TXX64::AppendField(const std::shared_ptr& scalar, NXX64::TSt }); } -void TXX64::AppendField(const std::shared_ptr& array, const int row, NArrow::NHash::NXX64::TStreamStringHashCalcer& hashCalcer) { +template +void AppendFieldImpl(const std::shared_ptr& array, const int row, TStreamCalcer& hashCalcer) { NArrow::SwitchType(array->type_id(), [&](const auto& type) { using TWrap = std::decay_t; using T = typename TWrap::T; @@ -49,6 +52,24 @@ void TXX64::AppendField(const std::shared_ptr& array, const int ro }); } +} // namespace + +void TXX64::AppendField(const std::shared_ptr& scalar, NXX64::TStreamStringHashCalcer& hashCalcer) { + AppendFieldImpl(scalar, hashCalcer); +} + +void TXX64::AppendField(const std::shared_ptr& scalar, NXX64::TStreamStringHashCalcer_H3& hashCalcer) { + AppendFieldImpl(scalar, hashCalcer); +} + +void TXX64::AppendField(const std::shared_ptr& array, const int row, NArrow::NHash::NXX64::TStreamStringHashCalcer& hashCalcer) { + AppendFieldImpl(array, row, hashCalcer); +} + +void TXX64::AppendField(const std::shared_ptr& array, const int row, NArrow::NHash::NXX64::TStreamStringHashCalcer_H3& hashCalcer) { + AppendFieldImpl(array, row, hashCalcer); +} + std::optional> TXX64::Execute(const std::shared_ptr& batch) const { std::vector> columns = GetColumns(batch); if (columns.empty()) { diff --git a/ydb/core/formats/arrow/hash/calcer.h b/ydb/core/formats/arrow/hash/calcer.h index 51dfe7858f8c..490a0e05e366 100644 --- a/ydb/core/formats/arrow/hash/calcer.h +++ b/ydb/core/formats/arrow/hash/calcer.h @@ -67,6 +67,8 @@ class TXX64 { static void AppendField(const std::shared_ptr& array, const int row, NXX64::TStreamStringHashCalcer& hashCalcer); static void AppendField(const std::shared_ptr& scalar, NXX64::TStreamStringHashCalcer& hashCalcer); + static void AppendField(const std::shared_ptr& array, const int row, NXX64::TStreamStringHashCalcer_H3& hashCalcer); + static void AppendField(const std::shared_ptr& scalar, NXX64::TStreamStringHashCalcer_H3& hashCalcer); static ui64 CalcHash(const std::shared_ptr& scalar); std::optional> Execute(const std::shared_ptr& batch) const;