From f6eb5c39e8ec2f2eb4be1c7c46fec7b9f02d4baa Mon Sep 17 00:00:00 2001 From: Wish Date: Wed, 8 Jan 2025 07:15:52 +0800 Subject: [PATCH 1/4] storage: Refine local index for more kinds Signed-off-by: Wish --- .../DeltaMerge/ColumnFile/ColumnFileTiny.cpp | 88 ++++++++-- .../DeltaMerge/ColumnFile/ColumnFileTiny.h | 19 +-- ...cpp => ColumnFileTinyLocalIndexWriter.cpp} | 112 ++++++++----- ...ter.h => ColumnFileTinyLocalIndexWriter.h} | 6 +- .../ColumnFileTinyVectorIndexReader.cpp | 11 +- .../ColumnFileTinyVectorIndexReader.h | 2 +- .../DeltaMerge/Delta/DeltaValueSpace.cpp | 7 +- .../DeltaMergeStore_InternalSegment.cpp | 16 +- .../DeltaMerge/DeltaMergeStore_Statistics.cpp | 2 +- .../src/Storages/DeltaMerge/File/ColumnStat.h | 74 +++++++-- dbms/src/Storages/DeltaMerge/File/DMFile.h | 23 ++- .../DeltaMerge/File/DMFileBlockInputStream.h | 2 +- ...xWriter.cpp => DMFileLocalIndexWriter.cpp} | 156 +++++++++++------- ...IndexWriter.h => DMFileLocalIndexWriter.h} | 4 +- .../src/Storages/DeltaMerge/File/DMFileMeta.h | 6 +- .../Storages/DeltaMerge/File/DMFileMetaV2.cpp | 29 ++-- .../Storages/DeltaMerge/File/DMFileMetaV2.h | 2 +- .../File/DMFileVectorIndexReader.cpp | 11 +- .../DeltaMerge/File/DMFileVectorIndexReader.h | 1 + .../Storages/DeltaMerge/File/DMFileWriter.cpp | 2 +- .../DeltaMerge/Index/LocalIndexInfo.cpp | 22 +-- .../DeltaMerge/Index/LocalIndexInfo.h | 32 ++-- .../DeltaMerge/Index/LocalIndexInfo_fwd.h | 30 ++++ .../Storages/DeltaMerge/Index/VectorIndex.cpp | 38 +---- .../Storages/DeltaMerge/Index/VectorIndex.h | 16 +- .../DeltaMerge/Index/VectorIndexCache.h | 1 + .../DeltaMerge/Index/VectorIndexCache_fwd.h | 25 +++ .../Index/VectorIndexHNSW/Index.cpp | 14 +- .../DeltaMerge/Index/VectorIndexHNSW/Index.h | 8 +- .../DeltaMerge/Index/VectorIndex_fwd.h | 3 - .../DeltaMerge/Remote/Proto/remote.proto | 22 +-- .../Storages/DeltaMerge/Remote/Serializer.cpp | 30 +--- .../Storages/DeltaMerge/SegmentReadTask.cpp | 11 +- .../DeltaMerge/dtpb/column_file.proto | 17 +- .../src/Storages/DeltaMerge/dtpb/dmfile.proto | 17 +- .../Storages/DeltaMerge/dtpb/index_file.proto | 44 +++++ .../dtpb/index_file_deprecated.proto | 34 ++++ .../tests/gtest_column_file_clone.cpp | 32 ++-- ...test_dm_delta_merge_store_vector_index.cpp | 16 +- .../tests/gtest_dm_vector_index.cpp | 38 ++--- .../tests/gtest_dm_vector_index_utils.h | 5 +- .../tests/gtest_local_index_info.cpp | 54 +++--- .../tests/gtest_segment_test_basic.cpp | 13 +- dbms/src/TiDB/Schema/TiDB.cpp | 2 +- dbms/src/TiDB/Schema/TiDB.h | 21 +++ 45 files changed, 706 insertions(+), 412 deletions(-) rename dbms/src/Storages/DeltaMerge/ColumnFile/{ColumnFileTinyVectorIndexWriter.cpp => ColumnFileTinyLocalIndexWriter.cpp} (65%) rename dbms/src/Storages/DeltaMerge/ColumnFile/{ColumnFileTinyVectorIndexWriter.h => ColumnFileTinyLocalIndexWriter.h} (92%) rename dbms/src/Storages/DeltaMerge/File/{DMFileVectorIndexWriter.cpp => DMFileLocalIndexWriter.cpp} (68%) rename dbms/src/Storages/DeltaMerge/File/{DMFileVectorIndexWriter.h => DMFileLocalIndexWriter.h} (96%) create mode 100644 dbms/src/Storages/DeltaMerge/Index/LocalIndexInfo_fwd.h create mode 100644 dbms/src/Storages/DeltaMerge/Index/VectorIndexCache_fwd.h create mode 100644 dbms/src/Storages/DeltaMerge/dtpb/index_file.proto create mode 100644 dbms/src/Storages/DeltaMerge/dtpb/index_file_deprecated.proto diff --git a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileTiny.cpp b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileTiny.cpp index 185cdf79fce..746ee6325d4 100644 --- a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileTiny.cpp +++ b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileTiny.cpp @@ -28,6 +28,45 @@ namespace DB::DM { +namespace details +{ + +inline dtpb::ColumnFileIndexInfo migrateFromIndexInfoV1(const dtpb::ColumnFileIndexInfo & index_pb) +{ + RUNTIME_CHECK(index_pb.has_deprecated_vector_index()); + + auto idx_info = dtpb::ColumnFileIndexInfo{}; + idx_info.set_index_page_id(index_pb.index_page_id()); + auto * idx_props = idx_info.mutable_index_props(); + idx_props->set_kind(dtpb::IndexFileKind::VECTOR_INDEX); + idx_props->set_index_id(index_pb.deprecated_vector_index().index_id()); + idx_props->set_file_size(index_pb.deprecated_vector_index().index_bytes()); + auto * vec_idx = idx_props->mutable_vector_index(); + vec_idx->set_format_version(0); + vec_idx->set_dimensions(index_pb.deprecated_vector_index().dimensions()); + vec_idx->set_distance_metric(index_pb.deprecated_vector_index().distance_metric()); + return idx_info; +} + +inline void integrityCheckIndexInfoV2(const dtpb::ColumnFileIndexInfo & index_info) +{ + RUNTIME_CHECK(index_info.has_index_page_id()); + RUNTIME_CHECK(index_info.index_props().has_file_size()); + RUNTIME_CHECK(index_info.index_props().has_index_id()); + RUNTIME_CHECK(index_info.index_props().has_kind()); + switch (index_info.index_props().kind()) + { + case dtpb::IndexFileKind::VECTOR_INDEX: + RUNTIME_CHECK(index_info.index_props().has_vector_index()); + break; + default: + RUNTIME_CHECK_MSG(false, "Unsupported index kind: {}", magic_enum::enum_name(index_info.index_props().kind())); + } +} + +} // namespace details + + ColumnFileReaderPtr ColumnFileTiny::getReader( const DMContext &, const IColumnFileDataProviderPtr & data_provider, @@ -61,10 +100,12 @@ void ColumnFileTiny::serializeMetadata(dtpb::ColumnFilePersisted * cf_pb, bool s for (const auto & index_info : *index_infos) { + // Just some integrity checks to ensure we are writing correct data. + // These data may come from deserialization, or generated in runtime. + details::integrityCheckIndexInfoV2(index_info); + auto * index_pb = tiny_pb->add_indexes(); - index_pb->set_index_page_id(index_info.index_page_id); - if (index_info.vector_index.has_value()) - index_pb->mutable_vector_index()->CopyFrom(*index_info.vector_index); + index_pb->CopyFrom(index_info); } } @@ -121,10 +162,16 @@ ColumnFilePersistedPtr ColumnFileTiny::deserializeMetadata( index_infos->reserve(cf_pb.indexes().size()); for (const auto & index_pb : cf_pb.indexes()) { - if (index_pb.has_vector_index()) - index_infos->emplace_back(index_pb.index_page_id(), index_pb.vector_index()); - else - index_infos->emplace_back(index_pb.index_page_id(), std::nullopt); + // Old format compatibility + if unlikely (index_pb.has_deprecated_vector_index()) + { + auto idx_info = details::migrateFromIndexInfoV1(index_pb); + index_infos->emplace_back(std::move(idx_info)); + continue; + } + + details::integrityCheckIndexInfoV2(index_pb); + index_infos->emplace_back(index_pb); } return std::make_shared(schema, rows, bytes, data_page_id, context, index_infos); @@ -178,13 +225,13 @@ ColumnFilePersistedPtr ColumnFileTiny::restoreFromCheckpoint( // Write index data page to local ps auto new_index_infos = std::make_shared(); - for (const auto & index : *index_infos) + for (const auto & index_pb : *index_infos) { - auto new_index_page_id = put_remote_page(index.index_page_id); - if (index.vector_index) - new_index_infos->emplace_back(new_index_page_id, index.vector_index); - else - new_index_infos->emplace_back(new_index_page_id, std::nullopt); + details::integrityCheckIndexInfoV2(index_pb); + auto new_index_page_id = put_remote_page(index_pb.index_page_id()); + auto new_index_pb = index_pb; + new_index_pb.set_index_page_id(new_index_page_id); + new_index_infos->emplace_back(std::move(index_pb)); } return std::make_shared(column_file_schema, rows, bytes, new_cf_id, context, new_index_infos); } @@ -235,10 +282,15 @@ std::tuple ColumnFileTiny::createFromCheckpoin index_infos->reserve(cf_pb.indexes().size()); for (const auto & index_pb : cf_pb.indexes()) { - if (index_pb.has_vector_index()) - index_infos->emplace_back(index_pb.index_page_id(), index_pb.vector_index()); - else - index_infos->emplace_back(index_pb.index_page_id(), std::nullopt); + // Old format compatibility + if unlikely (index_pb.has_deprecated_vector_index()) + { + auto idx_info = details::migrateFromIndexInfoV1(index_pb); + index_infos->emplace_back(std::move(idx_info)); + continue; + } + + index_infos->emplace_back(index_pb); } return { @@ -343,7 +395,7 @@ void ColumnFileTiny::removeData(WriteBatches & wbs) const if (index_infos) { for (const auto & index_info : *index_infos) - wbs.removed_log.delPage(index_info.index_page_id); + wbs.removed_log.delPage(index_info.index_page_id()); } } diff --git a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileTiny.h b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileTiny.h index 48b1c161bd2..c1a4a6ce090 100644 --- a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileTiny.h +++ b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileTiny.h @@ -20,7 +20,6 @@ #include #include #include -#include #include namespace DB::DM @@ -37,21 +36,11 @@ class ColumnFileTiny : public ColumnFilePersisted { public: friend class ColumnFileTinyReader; - friend class ColumnFileTinyVectorIndexWriter; + friend class ColumnFileTinyLocalIndexWriter; friend class ColumnFileTinyVectorIndexReader; friend struct Remote::Serializer; - struct IndexInfo - { - IndexInfo(PageIdU64 page_id, std::optional vec_index) - : index_page_id(page_id) - , vector_index(vec_index) - {} - - PageIdU64 index_page_id{}; - std::optional vector_index = std::nullopt; - }; - using IndexInfos = std::vector; + using IndexInfos = std::vector; using IndexInfosPtr = std::shared_ptr; private: @@ -99,9 +88,7 @@ class ColumnFileTiny : public ColumnFilePersisted if (!index_infos) return false; return std::any_of(index_infos->cbegin(), index_infos->cend(), [index_id](const auto & info) { - if (!info.vector_index) - return false; - return info.vector_index->index_id() == index_id; + return info.index_props().index_id() == index_id; }); } diff --git a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileTinyVectorIndexWriter.cpp b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileTinyLocalIndexWriter.cpp similarity index 65% rename from dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileTinyVectorIndexWriter.cpp rename to dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileTinyLocalIndexWriter.cpp index 5903092652f..21dd1260955 100644 --- a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileTinyVectorIndexWriter.cpp +++ b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileTinyLocalIndexWriter.cpp @@ -14,8 +14,8 @@ #include #include +#include #include -#include #include #include @@ -28,7 +28,7 @@ extern const int ABORTED; namespace DB::DM { -ColumnFileTinyVectorIndexWriter::LocalIndexBuildInfo ColumnFileTinyVectorIndexWriter::getLocalIndexBuildInfo( +ColumnFileTinyLocalIndexWriter::LocalIndexBuildInfo ColumnFileTinyLocalIndexWriter::getLocalIndexBuildInfo( const LocalIndexInfosSnapshot & index_infos, const ColumnFilePersistedSetPtr & file_set) { @@ -78,7 +78,7 @@ ColumnFileTinyVectorIndexWriter::LocalIndexBuildInfo ColumnFileTinyVectorIndexWr return build; } -ColumnFileTinyPtr ColumnFileTinyVectorIndexWriter::buildIndexForFile( +ColumnFileTinyPtr ColumnFileTinyLocalIndexWriter::buildIndexForFile( const ColumnDefines & column_defines, const ColumnDefine & del_cd, const ColumnFileTiny * file, @@ -91,17 +91,27 @@ ColumnFileTinyPtr ColumnFileTinyVectorIndexWriter::buildIndexForFile( read_columns->reserve(options.index_infos->size() + 1); read_columns->push_back(del_cd); - std::unordered_map> index_builders; + struct IndexToBuild + { + LocalIndexInfo info; + VectorIndexBuilderPtr builder_vector; + }; + + std::unordered_map> index_builders; - std::unordered_map> col_indexes; for (const auto & index_info : *options.index_infos) { - if (index_info.type != IndexType::Vector) + // Just skip if the index is already built + if (file->hasIndex(index_info.index_id)) continue; - col_indexes[index_info.column_id].emplace_back(index_info); + RUNTIME_CHECK(index_info.def_vector_index != nullptr); + index_builders[index_info.column_id].emplace_back(IndexToBuild{ + .info = index_info, + .builder_vector = {}, + }); } - for (const auto & [col_id, index_infos] : col_indexes) + for (auto & [col_id, indexes] : index_builders) { // Make sure the column_id is in the schema. const auto cd_iter = std::find_if( // @@ -114,14 +124,17 @@ ColumnFileTinyPtr ColumnFileTinyVectorIndexWriter::buildIndexForFile( col_id, file->getDataPageId()); - for (const auto & idx_info : index_infos) + for (auto & index : indexes) { - // Just skip if the index is already built - if (file->hasIndex(idx_info.index_id)) - continue; - - index_builders[col_id].emplace_back( - VectorIndexBuilder::create(idx_info.index_id, idx_info.index_definition)); + switch (index.info.kind) + { + case TiDB::ColumnarIndexKind::Vector: + index.builder_vector = VectorIndexBuilder::create(index.info.def_vector_index); + break; + default: + RUNTIME_CHECK_MSG(false, "Unsupported index kind: {}", magic_enum::enum_name(index.info.kind)); + break; + } } read_columns->push_back(*cd_iter); } @@ -156,9 +169,18 @@ ColumnFileTinyPtr ColumnFileTinyVectorIndexWriter::buildIndexForFile( const auto & col_with_type_and_name = block.safeGetByPosition(col_idx); RUNTIME_CHECK(col_with_type_and_name.column_id == read_columns->at(col_idx).id); const auto & col = col_with_type_and_name.column; - for (const auto & index_builder : index_builders[read_columns->at(col_idx).id]) + for (const auto & index : index_builders[read_columns->at(col_idx).id]) { - index_builder->addBlock(*col, del_mark, should_proceed); + switch (index.info.kind) + { + case TiDB::ColumnarIndexKind::Vector: + RUNTIME_CHECK(index.builder_vector); + index.builder_vector->addBlock(*col, del_mark, should_proceed); + break; + default: + RUNTIME_CHECK_MSG(false, "Unsupported index kind: {}", magic_enum::enum_name(index.info.kind)); + break; + } } } } @@ -168,26 +190,42 @@ ColumnFileTinyPtr ColumnFileTinyVectorIndexWriter::buildIndexForFile( for (size_t col_idx = 1; col_idx < num_cols; ++col_idx) { const auto & cd = read_columns->at(col_idx); - for (const auto & index_builder : index_builders[cd.id]) + for (const auto & index : index_builders[cd.id]) { - auto index_page_id = options.storage_pool->newLogPageId(); - MemoryWriteBuffer write_buf; - CompressedWriteBuffer compressed(write_buf); - index_builder->saveToBuffer(compressed); - compressed.next(); - auto data_size = write_buf.count(); - auto buf = write_buf.tryGetReadBuffer(); - // ColumnFileDataProviderRNLocalPageCache currently does not support read data with fields - options.wbs.log.putPage(index_page_id, 0, buf, data_size, {data_size}); - - dtpb::VectorIndexFileProps vector_index; - vector_index.set_index_id(index_builder->index_id); - vector_index.set_index_bytes(data_size); - vector_index.set_index_kind(tipb::VectorIndexKind_Name(index_builder->definition->kind)); - vector_index.set_distance_metric( - tipb::VectorDistanceMetric_Name(index_builder->definition->distance_metric)); - vector_index.set_dimensions(index_builder->definition->dimension); - index_infos->emplace_back(index_page_id, vector_index); + switch (index.info.kind) + { + case TiDB::ColumnarIndexKind::Vector: + { + RUNTIME_CHECK(index.builder_vector); + auto index_page_id = options.storage_pool->newLogPageId(); + MemoryWriteBuffer write_buf; + CompressedWriteBuffer compressed(write_buf); + index.builder_vector->saveToBuffer(compressed); + compressed.next(); + auto data_size = write_buf.count(); + auto buf = write_buf.tryGetReadBuffer(); + // ColumnFileDataProviderRNLocalPageCache currently does not support read data with fields + options.wbs.log.putPage(index_page_id, 0, buf, data_size, {data_size}); + + auto idx_info = dtpb::ColumnFileIndexInfo{}; + idx_info.set_index_page_id(index_page_id); + auto * idx_props = idx_info.mutable_index_props(); + idx_props->set_kind(dtpb::IndexFileKind::VECTOR_INDEX); + idx_props->set_index_id(index.info.index_id); + idx_props->set_file_size(data_size); + auto * vector_index = idx_props->mutable_vector_index(); + vector_index->set_format_version(0); + vector_index->set_dimensions(index.info.def_vector_index->dimension); + vector_index->set_distance_metric( + tipb::VectorDistanceMetric_Name(index.info.def_vector_index->distance_metric)); + index_infos->emplace_back(std::move(idx_info)); + + break; + } + default: + RUNTIME_CHECK_MSG(false, "Unsupported index kind: {}", magic_enum::enum_name(index.info.kind)); + break; + } } } @@ -200,7 +238,7 @@ ColumnFileTinyPtr ColumnFileTinyVectorIndexWriter::buildIndexForFile( return file->cloneWith(file->getDataPageId(), index_infos); } -ColumnFileTinys ColumnFileTinyVectorIndexWriter::build(ProceedCheckFn should_proceed) const +ColumnFileTinys ColumnFileTinyLocalIndexWriter::build(ProceedCheckFn should_proceed) const { ColumnFileTinys new_files; new_files.reserve(options.files.size()); diff --git a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileTinyVectorIndexWriter.h b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileTinyLocalIndexWriter.h similarity index 92% rename from dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileTinyVectorIndexWriter.h rename to dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileTinyLocalIndexWriter.h index 70e92e1e22e..6a537f5299e 100644 --- a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileTinyVectorIndexWriter.h +++ b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileTinyLocalIndexWriter.h @@ -28,8 +28,8 @@ namespace DB::DM using ColumnFileTinys = std::vector; -// ColumnFileTinyVectorIndexWriter write vector index store in PageStorage for ColumnFileTiny. -class ColumnFileTinyVectorIndexWriter +// ColumnFileTinyLocalIndexWriter write vector index store in PageStorage for ColumnFileTiny. +class ColumnFileTinyLocalIndexWriter { public: struct LocalIndexBuildInfo @@ -54,7 +54,7 @@ class ColumnFileTinyVectorIndexWriter WriteBatches & wbs; // Write index and modify meta in the same batch. }; - explicit ColumnFileTinyVectorIndexWriter(const Options & options) + explicit ColumnFileTinyLocalIndexWriter(const Options & options) : logger(Logger::get()) , options(options) {} diff --git a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileTinyVectorIndexReader.cpp b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileTinyVectorIndexReader.cpp index 59d96035d5c..47066a06688 100644 --- a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileTinyVectorIndexReader.cpp +++ b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileTinyVectorIndexReader.cpp @@ -69,23 +69,20 @@ void ColumnFileTinyVectorIndexReader::loadVectorIndex() auto index_id = ann_query_info->index_id(); const auto index_info_iter = std::find_if(index_infos->cbegin(), index_infos->cend(), [index_id](const auto & info) { - if (!info.vector_index) - return false; - return info.vector_index->index_id() == index_id; + return info.index_props().index_id() == index_id; }); if (index_info_iter == index_infos->cend()) return; - auto vector_index = index_info_iter->vector_index; - if (!vector_index) + if (!index_info_iter->index_props().has_vector_index()) return; - auto index_page_id = index_info_iter->index_page_id; + auto index_page_id = index_info_iter->index_page_id(); auto load_from_page_storage = [&]() { perf_stat.load_from_cache = false; std::vector index_fields = {0}; auto index_page = data_provider->readTinyData(index_page_id, index_fields); ReadBufferFromOwnString read_buf(index_page.data); CompressedReadBuffer compressed(read_buf); - return VectorIndexViewer::load(*vector_index, compressed); + return VectorIndexViewer::load(index_info_iter->index_props().vector_index(), compressed); }; if (vec_index_cache) { diff --git a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileTinyVectorIndexReader.h b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileTinyVectorIndexReader.h index cb6c09c6576..804e4fecb04 100644 --- a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileTinyVectorIndexReader.h +++ b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileTinyVectorIndexReader.h @@ -17,7 +17,7 @@ #include #include #include -#include +#include namespace DB::DM diff --git a/dbms/src/Storages/DeltaMerge/Delta/DeltaValueSpace.cpp b/dbms/src/Storages/DeltaMerge/Delta/DeltaValueSpace.cpp index e062f3bd971..109b2b0bc6a 100644 --- a/dbms/src/Storages/DeltaMerge/Delta/DeltaValueSpace.cpp +++ b/dbms/src/Storages/DeltaMerge/Delta/DeltaValueSpace.cpp @@ -155,8 +155,11 @@ std::vector CloneColumnFilesHelper::clone( for (auto & index_info : *index_infos) { auto new_index_page_id = dm_context.storage_pool->newLogPageId(); - wbs.log.putRefPage(new_index_page_id, index_info.index_page_id); - new_index_infos->emplace_back(new_index_page_id, index_info.vector_index); + wbs.log.putRefPage(new_index_page_id, index_info.index_page_id()); + auto new_index_info = dtpb::ColumnFileIndexInfo{}; + new_index_info.set_index_page_id(new_index_page_id); + new_index_info.mutable_index_props()->CopyFrom(index_info.index_props()); + new_index_infos->emplace_back(std::move(new_index_info)); } auto new_column_file = t->cloneWith(new_data_page_id, new_index_infos); cloned.push_back(new_column_file); diff --git a/dbms/src/Storages/DeltaMerge/DeltaMergeStore_InternalSegment.cpp b/dbms/src/Storages/DeltaMerge/DeltaMergeStore_InternalSegment.cpp index 4592e85d365..dc64510c486 100644 --- a/dbms/src/Storages/DeltaMerge/DeltaMergeStore_InternalSegment.cpp +++ b/dbms/src/Storages/DeltaMerge/DeltaMergeStore_InternalSegment.cpp @@ -18,10 +18,10 @@ #include #include #include -#include +#include #include #include -#include +#include #include #include #include @@ -542,7 +542,7 @@ bool DeltaMergeStore::segmentEnsureStableLocalIndexAsync(const SegmentPtr & segm // No lock is needed, stable meta is immutable. const auto build_info - = DMFileVectorIndexWriter::getLocalIndexBuildInfo(local_index_infos_snap, segment->getStable()->getDMFiles()); + = DMFileLocalIndexWriter::getLocalIndexBuildInfo(local_index_infos_snap, segment->getStable()->getDMFiles()); if (!build_info.indexes_to_build || build_info.indexes_to_build->empty() || build_info.dm_files.empty()) return false; @@ -616,7 +616,7 @@ bool DeltaMergeStore::segmentWaitStableLocalIndexReady(const SegmentPtr & segmen // No lock is needed, stable meta is immutable. auto segment_id = segment->segmentId(); auto build_info - = DMFileVectorIndexWriter::getLocalIndexBuildInfo(local_index_infos_snap, segment->getStable()->getDMFiles()); + = DMFileLocalIndexWriter::getLocalIndexBuildInfo(local_index_infos_snap, segment->getStable()->getDMFiles()); if (!build_info.indexes_to_build || build_info.indexes_to_build->empty()) return true; @@ -737,7 +737,7 @@ void DeltaMergeStore::segmentEnsureStableLocalIndex( DMFile::info(index_build_info.dm_files)); // 2. Build the index. - DMFileVectorIndexWriter iw(DMFileVectorIndexWriter::Options{ + DMFileLocalIndexWriter iw(DMFileLocalIndexWriter::Options{ .path_pool = path_pool, .index_infos = index_build_info.indexes_to_build, .dm_files = index_build_info.dm_files, @@ -864,7 +864,7 @@ bool DeltaMergeStore::segmentEnsureDeltaLocalIndexAsync(const SegmentPtr & segme if (!column_file_persisted_set) return false; auto build_info - = ColumnFileTinyVectorIndexWriter::getLocalIndexBuildInfo(local_index_infos_snap, column_file_persisted_set); + = ColumnFileTinyLocalIndexWriter::getLocalIndexBuildInfo(local_index_infos_snap, column_file_persisted_set); if (!build_info.indexes_to_build || build_info.indexes_to_build->empty()) return false; // Use weak_ptr to avoid blocking gc. @@ -940,7 +940,7 @@ bool DeltaMergeStore::segmentWaitDeltaLocalIndexReady(const SegmentPtr & segment if (!column_file_persisted_set) return false; auto build_info - = ColumnFileTinyVectorIndexWriter::getLocalIndexBuildInfo(local_index_infos, column_file_persisted_set); + = ColumnFileTinyLocalIndexWriter::getLocalIndexBuildInfo(local_index_infos, column_file_persisted_set); // Use weak_ptr to avoid blocking gc. auto delta_weak_ptr = std::weak_ptr(segment->getDelta()); lock.unlock(); @@ -1029,7 +1029,7 @@ void DeltaMergeStore::segmentEnsureDeltaLocalIndex( // 2. Build the index. WriteBatches wbs(*storage_pool, dm_context.getWriteLimiter()); - ColumnFileTinyVectorIndexWriter iw(ColumnFileTinyVectorIndexWriter::Options{ + ColumnFileTinyLocalIndexWriter iw(ColumnFileTinyLocalIndexWriter::Options{ .storage_pool = storage_pool, .write_limiter = dm_context.getWriteLimiter(), .files = persisted_files, diff --git a/dbms/src/Storages/DeltaMerge/DeltaMergeStore_Statistics.cpp b/dbms/src/Storages/DeltaMerge/DeltaMergeStore_Statistics.cpp index 49ba1e2a43e..9dc610e1ffa 100644 --- a/dbms/src/Storages/DeltaMerge/DeltaMergeStore_Statistics.cpp +++ b/dbms/src/Storages/DeltaMerge/DeltaMergeStore_Statistics.cpp @@ -225,7 +225,7 @@ LocalIndexesStats DeltaMergeStore::getLocalIndexStats() LocalIndexStats index_stats; index_stats.column_id = index_info.column_id; index_stats.index_id = index_info.index_id; - index_stats.index_kind = tipb::VectorIndexKind_Name(index_info.index_definition->kind); + index_stats.index_kind = magic_enum::enum_name(index_info.kind); // like Vector for (const auto & [handle, segment] : segments) { diff --git a/dbms/src/Storages/DeltaMerge/File/ColumnStat.h b/dbms/src/Storages/DeltaMerge/File/ColumnStat.h index a6a6cd8fadf..4f078ecebbf 100644 --- a/dbms/src/Storages/DeltaMerge/File/ColumnStat.h +++ b/dbms/src/Storages/DeltaMerge/File/ColumnStat.h @@ -42,7 +42,7 @@ struct ColumnStat size_t sizes_bytes = 0; // Array sizes or String sizes, depends on the data type of this column size_t sizes_mark_bytes = 0; - std::vector vector_index; + std::vector indexes{}; #ifndef NDEBUG // This field is only used for testing @@ -64,10 +64,11 @@ struct ColumnStat stat.set_sizes_bytes(sizes_bytes); stat.set_sizes_mark_bytes(sizes_mark_bytes); - for (const auto & vec_idx : vector_index) + for (const auto & idx : indexes) { - auto * pb_idx = stat.add_vector_indexes(); - pb_idx->CopyFrom(vec_idx); + integrityCheckIndexInfoV2(idx); + auto * pb_idx = stat.add_indexes(); + pb_idx->CopyFrom(idx); } #ifndef NDEBUG @@ -91,19 +92,43 @@ struct ColumnStat sizes_bytes = proto.sizes_bytes(); sizes_mark_bytes = proto.sizes_mark_bytes(); - if (proto.has_vector_index()) + // Backward compatibility: There is a `vector_index` field. + if unlikely (proto.has_deprecated_vector_index()) { - // For backward compatibility, loaded `vector_index` into `vector_indexes` - // with index_id == EmptyIndexID - vector_index.emplace_back(proto.vector_index()); - auto & idx = vector_index.back(); - idx.set_index_id(EmptyIndexID); - idx.set_index_bytes(index_bytes); + auto idx = dtpb::DMFileIndexInfo{}; + auto * idx_props = idx.mutable_index_props(); + idx_props->set_kind(dtpb::IndexFileKind::VECTOR_INDEX); + idx_props->set_index_id(EmptyIndexID); + idx_props->set_file_size(index_bytes); + auto * vector_idx_props = idx_props->mutable_vector_index(); + vector_idx_props->set_format_version(0); + vector_idx_props->set_dimensions(proto.deprecated_vector_index().dimensions()); + vector_idx_props->set_distance_metric(proto.deprecated_vector_index().distance_metric()); } - vector_index.reserve(vector_index.size() + proto.vector_indexes_size()); - for (const auto & pb_idx : proto.vector_indexes()) + + // Backward compatibility: There is a `vector_indexes` field. + if unlikely (proto.deprecated_vector_indexes_size() > 0) + { + for (const auto & old_pb_idx : proto.deprecated_vector_indexes()) + { + auto idx = dtpb::DMFileIndexInfo{}; + auto * idx_props = idx.mutable_index_props(); + idx_props->set_kind(dtpb::IndexFileKind::VECTOR_INDEX); + idx_props->set_index_id(old_pb_idx.index_id()); + idx_props->set_file_size(old_pb_idx.index_bytes()); + auto * vector_idx_props = idx_props->mutable_vector_index(); + vector_idx_props->set_format_version(0); + vector_idx_props->set_dimensions(old_pb_idx.dimensions()); + vector_idx_props->set_distance_metric(old_pb_idx.distance_metric()); + + indexes.emplace_back(std::move(idx)); + } + } + + for (const auto & pb_idx : proto.indexes()) { - vector_index.emplace_back(pb_idx); + integrityCheckIndexInfoV2(pb_idx); + indexes.emplace_back(pb_idx); } #ifndef NDEBUG @@ -144,6 +169,25 @@ struct ColumnStat readIntBinary(nullmap_mark_bytes, buf); readIntBinary(index_bytes, buf); } + +private: + static void integrityCheckIndexInfoV2(const dtpb::DMFileIndexInfo & index_info) + { + RUNTIME_CHECK(index_info.index_props().has_file_size()); + RUNTIME_CHECK(index_info.index_props().has_index_id()); + RUNTIME_CHECK(index_info.index_props().has_kind()); + switch (index_info.index_props().kind()) + { + case dtpb::IndexFileKind::VECTOR_INDEX: + RUNTIME_CHECK(index_info.index_props().has_vector_index()); + break; + default: + RUNTIME_CHECK_MSG( + false, + "Unsupported index kind: {}", + magic_enum::enum_name(index_info.index_props().kind())); + } + } }; using ColumnStats = std::unordered_map; @@ -186,7 +230,7 @@ readText(ColumnStats & column_sats, DMFileFormat::Version ver, ReadBuffer & buf) .avg_size = avg_size, .serialized_bytes = serialized_bytes, // ... here ignore some fields with default initializers - .vector_index = {}, + .indexes = {}, #ifndef NDEBUG .additional_data_for_test = {}, #endif diff --git a/dbms/src/Storages/DeltaMerge/File/DMFile.h b/dbms/src/Storages/DeltaMerge/File/DMFile.h index d6751f2f0d9..fb9691277b6 100644 --- a/dbms/src/Storages/DeltaMerge/File/DMFile.h +++ b/dbms/src/Storages/DeltaMerge/File/DMFile.h @@ -25,6 +25,7 @@ #include #include #include +#include #include namespace DTTool::Migrate @@ -165,7 +166,7 @@ class DMFile : private boost::noncopyable // Return std::nullopt if // - the col_id is not exist in the dmfile // - the index has not been built - std::optional getLocalIndex(ColId col_id, IndexID index_id) const + std::optional getLocalIndex(ColId col_id, IndexID index_id) const { return meta->getLocalIndex(col_id, index_id); } @@ -305,8 +306,22 @@ class DMFile : private boost::noncopyable return IDataType::getFileNameForStream(DB::toString(col_id), substream); } - static String vectorIndexFileName(IndexID index_id) { return fmt::format("idx_{}.vector", index_id); } - String vectorIndexPath(IndexID index_id) const { return subFilePath(vectorIndexFileName(index_id)); } + static String localIndexFileName(IndexID index_id, TiDB::ColumnarIndexKind kind) + { + // Note: Keep sync with FileCache::getFileType() + switch (kind) + { + case TiDB::ColumnarIndexKind::Vector: + return fmt::format("idx_{}.vector", index_id); + default: + throw Exception(fmt::format("Unsupported index kind: {}", magic_enum::enum_name(kind))); + } + } + + String localIndexPath(IndexID index_id, TiDB::ColumnarIndexKind kind) const + { + return subFilePath(localIndexFileName(index_id, kind)); + } void addPack(const DMFileMeta::PackStat & pack_stat) const { meta->pack_stats.push_back(pack_stat); } @@ -332,7 +347,7 @@ class DMFile : private boost::noncopyable friend class DMFileVectorIndexReader; friend class DMFileV3IncrementWriter; friend class DMFileWriter; - friend class DMFileVectorIndexWriter; + friend class DMFileLocalIndexWriter; friend class DMFileReader; friend class MarkLoader; friend class ColumnReadStream; diff --git a/dbms/src/Storages/DeltaMerge/File/DMFileBlockInputStream.h b/dbms/src/Storages/DeltaMerge/File/DMFileBlockInputStream.h index e47803c25fd..26b35569da1 100644 --- a/dbms/src/Storages/DeltaMerge/File/DMFileBlockInputStream.h +++ b/dbms/src/Storages/DeltaMerge/File/DMFileBlockInputStream.h @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/dbms/src/Storages/DeltaMerge/File/DMFileVectorIndexWriter.cpp b/dbms/src/Storages/DeltaMerge/File/DMFileLocalIndexWriter.cpp similarity index 68% rename from dbms/src/Storages/DeltaMerge/File/DMFileVectorIndexWriter.cpp rename to dbms/src/Storages/DeltaMerge/File/DMFileLocalIndexWriter.cpp index 3b69242cd8e..b726a47a998 100644 --- a/dbms/src/Storages/DeltaMerge/File/DMFileVectorIndexWriter.cpp +++ b/dbms/src/Storages/DeltaMerge/File/DMFileLocalIndexWriter.cpp @@ -17,8 +17,8 @@ #include #include #include +#include #include -#include #include #include #include @@ -40,7 +40,7 @@ extern const char exception_build_local_index_for_file[]; namespace DB::DM { -LocalIndexBuildInfo DMFileVectorIndexWriter::getLocalIndexBuildInfo( +LocalIndexBuildInfo DMFileLocalIndexWriter::getLocalIndexBuildInfo( const LocalIndexInfosSnapshot & index_infos, const DMFiles & dm_files) { @@ -85,9 +85,17 @@ LocalIndexBuildInfo DMFileVectorIndexWriter::getLocalIndexBuildInfo( return build; } -size_t DMFileVectorIndexWriter::buildIndexForFile(const DMFilePtr & dm_file_mutable, ProceedCheckFn should_proceed) - const +size_t DMFileLocalIndexWriter::buildIndexForFile(const DMFilePtr & dm_file_mutable, ProceedCheckFn should_proceed) const { + DMFileV3IncrementWriter::Options iw_options{ + .dm_file = dm_file_mutable, + .file_provider = options.dm_context.global_context.getFileProvider(), + .write_limiter = options.dm_context.global_context.getWriteLimiter(), + .path_pool = options.path_pool, + .disagg_ctx = options.dm_context.global_context.getSharedContextDisagg(), + }; + auto iw = DMFileV3IncrementWriter::create(iw_options); + const auto column_defines = dm_file_mutable->getColumnDefines(); const auto del_cd_iter = std::find_if(column_defines.cbegin(), column_defines.cend(), [](const ColumnDefine & cd) { return cd.id == MutSup::delmark_col_id; @@ -103,17 +111,27 @@ size_t DMFileVectorIndexWriter::buildIndexForFile(const DMFilePtr & dm_file_muta ColumnDefines read_columns{*del_cd_iter}; read_columns.reserve(options.index_infos->size() + 1); - std::unordered_map> index_builders; + struct IndexToBuild + { + LocalIndexInfo info; + String index_file_path; // For write out + String index_file_name; // For meta include + VectorIndexBuilderPtr builder_vector; + }; + + std::unordered_map> index_builders; - std::unordered_map> col_indexes; for (const auto & index_info : *options.index_infos) { - if (index_info.type != IndexType::Vector) - continue; - col_indexes[index_info.column_id].emplace_back(index_info); + index_builders[index_info.column_id].emplace_back(IndexToBuild{ + .info = index_info, + .index_file_path = "", + .index_file_name = "", + .builder_vector = {}, + }); } - for (const auto & [col_id, index_infos] : col_indexes) + for (auto & [col_id, indexes] : index_builders) { const auto cd_iter = std::find_if(column_defines.cbegin(), column_defines.cend(), [col_id = col_id](const auto & cd) { @@ -125,15 +143,28 @@ size_t DMFileVectorIndexWriter::buildIndexForFile(const DMFilePtr & dm_file_muta col_id, dm_file_mutable->path()); - for (const auto & idx_info : index_infos) + for (auto & index : indexes) { + const IndexID index_id = index.info.index_id; + index.index_file_name = index_id > 0 ? dm_file_mutable->localIndexFileName(index_id, index.info.kind) + : colIndexFileName(DMFile::getFileNameBase(col_id)); + index.index_file_path = iw->localPath() + "/" + index.index_file_name; + // Index already built. We don't allow. The caller should filter away, RUNTIME_CHECK( - !dm_file_mutable->isLocalIndexExist(idx_info.column_id, idx_info.index_id), - idx_info.column_id, - idx_info.index_id); - index_builders[col_id].emplace_back( - VectorIndexBuilder::create(idx_info.index_id, idx_info.index_definition)); + !dm_file_mutable->isLocalIndexExist(index.info.column_id, index.info.index_id), + index.info.column_id, + index.info.index_id); + + switch (index.info.kind) + { + case TiDB::ColumnarIndexKind::Vector: + index.builder_vector = VectorIndexBuilder::create(index.info.def_vector_index); + break; + default: + RUNTIME_CHECK_MSG(false, "Unsupported index kind: {}", magic_enum::enum_name(index.info.kind)); + break; + } } read_columns.push_back(*cd_iter); } @@ -144,15 +175,6 @@ size_t DMFileVectorIndexWriter::buildIndexForFile(const DMFilePtr & dm_file_muta return 0; } - DMFileV3IncrementWriter::Options iw_options{ - .dm_file = dm_file_mutable, - .file_provider = options.dm_context.global_context.getFileProvider(), - .write_limiter = options.dm_context.global_context.getWriteLimiter(), - .path_pool = options.path_pool, - .disagg_ctx = options.dm_context.global_context.getSharedContextDisagg(), - }; - auto iw = DMFileV3IncrementWriter::create(iw_options); - DMFileBlockInputStreamBuilder read_stream_builder(options.dm_context.global_context); auto scan_context = std::make_shared(); @@ -187,9 +209,18 @@ size_t DMFileVectorIndexWriter::buildIndexForFile(const DMFilePtr & dm_file_muta const auto & col_with_type_and_name = block.safeGetByPosition(col_idx); RUNTIME_CHECK(col_with_type_and_name.column_id == read_columns[col_idx].id); const auto & col = col_with_type_and_name.column; - for (const auto & index_builder : index_builders[read_columns[col_idx].id]) + for (const auto & index : index_builders[read_columns[col_idx].id]) { - index_builder->addBlock(*col, del_mark, should_proceed); + switch (index.info.kind) + { + case TiDB::ColumnarIndexKind::Vector: + RUNTIME_CHECK(index.builder_vector); + index.builder_vector->addBlock(*col, del_mark, should_proceed); + break; + default: + RUNTIME_CHECK_MSG(false, "Unsupported index kind: {}", magic_enum::enum_name(index.info.kind)); + break; + } } } } @@ -198,45 +229,48 @@ size_t DMFileVectorIndexWriter::buildIndexForFile(const DMFilePtr & dm_file_muta // Write down the index size_t total_built_index_bytes = 0; - std::unordered_map> new_indexes_on_cols; + std::unordered_map> new_indexes_on_cols; for (size_t col_idx = 1; col_idx < num_cols; ++col_idx) { const auto & cd = read_columns[col_idx]; - // Save index and update column stats - auto callback = [&](const IDataType::SubstreamPath & substream_path) -> void { - if (IDataType::isNullMap(substream_path) || IDataType::isArraySizes(substream_path) - || IDataType::isStringSizes(substream_path)) - return; - - std::vector new_indexes; - for (const auto & index_builder : index_builders[cd.id]) + + std::vector new_indexes; + + for (const auto & index : index_builders[cd.id]) + { + dtpb::DMFileIndexInfo pb_dmfile_idx{}; + auto * pb_idx = pb_dmfile_idx.mutable_index_props(); + + switch (index.info.kind) + { + case TiDB::ColumnarIndexKind::Vector: { - const IndexID index_id = index_builder->index_id; - const auto index_file_name = index_id > 0 - ? dm_file_mutable->vectorIndexFileName(index_id) - : colIndexFileName(DMFile::getFileNameBase(cd.id, substream_path)); - const auto index_path = iw->localPath() + "/" + index_file_name; - index_builder->saveToFile(index_path); - - // Memorize what kind of vector index it is, so that we can correctly restore it when reading. - dtpb::VectorIndexFileProps pb_idx; - pb_idx.set_index_kind(tipb::VectorIndexKind_Name(index_builder->definition->kind)); - pb_idx.set_distance_metric(tipb::VectorDistanceMetric_Name(index_builder->definition->distance_metric)); - pb_idx.set_dimensions(index_builder->definition->dimension); - pb_idx.set_index_id(index_id); - auto index_bytes = Poco::File(index_path).getSize(); - pb_idx.set_index_bytes(index_bytes); - new_indexes.emplace_back(std::move(pb_idx)); - - total_built_index_bytes += index_bytes; - iw->include(index_file_name); + index.builder_vector->saveToFile(index.index_file_path); + auto * pb_vec_idx = pb_idx->mutable_vector_index(); + pb_vec_idx->set_format_version(0); + pb_vec_idx->set_dimensions(index.info.def_vector_index->dimension); + pb_vec_idx->set_distance_metric( + tipb::VectorDistanceMetric_Name(index.info.def_vector_index->distance_metric)); + break; } - // Inorder to avoid concurrency reading on ColumnStat, the new added indexes - // will be insert into DMFile instance in `bumpMetaVersion`. - new_indexes_on_cols.emplace(cd.id, std::move(new_indexes)); - }; + default: + RUNTIME_CHECK_MSG(false, "Unsupported index kind: {}", magic_enum::enum_name(index.info.kind)); + break; + } + + auto index_file = Poco::File(index.index_file_path); + RUNTIME_CHECK(index_file.exists()); + pb_idx->set_kind(index.info.getKindAsDtpb()); + pb_idx->set_index_id(index.info.index_id); + pb_idx->set_file_size(index_file.getSize()); - cd.type->enumerateStreams(callback); + total_built_index_bytes += pb_idx->file_size(); + new_indexes.emplace_back(std::move(pb_dmfile_idx)); + iw->include(index.index_file_name); + } + + if (!new_indexes.empty()) + new_indexes_on_cols.emplace(cd.id, std::move(new_indexes)); } dm_file_mutable->meta->bumpMetaVersion(DMFileMetaChangeset{new_indexes_on_cols}); @@ -244,7 +278,7 @@ size_t DMFileVectorIndexWriter::buildIndexForFile(const DMFilePtr & dm_file_muta return total_built_index_bytes; } -DMFiles DMFileVectorIndexWriter::build(ProceedCheckFn should_proceed) const +DMFiles DMFileLocalIndexWriter::build(ProceedCheckFn should_proceed) const { RUNTIME_CHECK(!built); // Create a clone of existing DMFile instances by using DMFile::restore, diff --git a/dbms/src/Storages/DeltaMerge/File/DMFileVectorIndexWriter.h b/dbms/src/Storages/DeltaMerge/File/DMFileLocalIndexWriter.h similarity index 96% rename from dbms/src/Storages/DeltaMerge/File/DMFileVectorIndexWriter.h rename to dbms/src/Storages/DeltaMerge/File/DMFileLocalIndexWriter.h index 0d7a9d84db0..5768f74f409 100644 --- a/dbms/src/Storages/DeltaMerge/File/DMFileVectorIndexWriter.h +++ b/dbms/src/Storages/DeltaMerge/File/DMFileLocalIndexWriter.h @@ -64,7 +64,7 @@ struct LocalIndexBuildInfo } }; -class DMFileVectorIndexWriter +class DMFileLocalIndexWriter { public: static LocalIndexBuildInfo getLocalIndexBuildInfo( @@ -81,7 +81,7 @@ class DMFileVectorIndexWriter using ProceedCheckFn = std::function; - explicit DMFileVectorIndexWriter(const Options & options) + explicit DMFileLocalIndexWriter(const Options & options) : logger(Logger::get()) , options(options) {} diff --git a/dbms/src/Storages/DeltaMerge/File/DMFileMeta.h b/dbms/src/Storages/DeltaMerge/File/DMFileMeta.h index fc783492444..ff592a1ede8 100644 --- a/dbms/src/Storages/DeltaMerge/File/DMFileMeta.h +++ b/dbms/src/Storages/DeltaMerge/File/DMFileMeta.h @@ -42,7 +42,7 @@ class DMFileV3IncrementWriter; struct DMFileMetaChangeset { - std::unordered_map> new_indexes_on_cols; + std::unordered_map> new_indexes_on_cols; }; class DMFileMeta @@ -216,9 +216,9 @@ class DMFileMeta // Return std::nullopt if // - the col_id is not exist in the dmfile // - the index has not been built - virtual std::optional getLocalIndex(ColId, IndexID) const + virtual std::optional getLocalIndex(ColId, IndexID) const { - RUNTIME_CHECK_MSG(false, "MetaV1 does not support getLocalIndexState"); + RUNTIME_CHECK_MSG(false, "MetaV1 does not support getLocalIndex"); } protected: diff --git a/dbms/src/Storages/DeltaMerge/File/DMFileMetaV2.cpp b/dbms/src/Storages/DeltaMerge/File/DMFileMetaV2.cpp index 231809c6662..700c6bb214a 100644 --- a/dbms/src/Storages/DeltaMerge/File/DMFileMetaV2.cpp +++ b/dbms/src/Storages/DeltaMerge/File/DMFileMetaV2.cpp @@ -422,12 +422,13 @@ UInt32 DMFileMetaV2::bumpMetaVersion(DMFileMetaChangeset && changeset) for (auto & [col_id, col_stat] : column_stats) { auto changed_col_iter = changeset.new_indexes_on_cols.find(col_id); - if (changed_col_iter == changeset.new_indexes_on_cols.end()) - continue; - col_stat.vector_index.insert( - col_stat.vector_index.end(), - changed_col_iter->second.begin(), - changed_col_iter->second.end()); + if (changed_col_iter != changeset.new_indexes_on_cols.end()) + { + col_stat.indexes.insert( + col_stat.indexes.end(), + changed_col_iter->second.begin(), + changed_col_iter->second.end()); + } } // bump the version @@ -445,16 +446,17 @@ std::tuple DMFileMetaV2::getLocalIndexState const auto & col_stat = it->second; bool built = std::any_of( // - col_stat.vector_index.cbegin(), - col_stat.vector_index.cend(), - [index_id](const auto & idx) { return idx.index_id() == index_id; }); + col_stat.indexes.cbegin(), + col_stat.indexes.cend(), + [index_id](const auto & idx) { return idx.index_props().index_id() == index_id; }); + if (built) return {LocalIndexState::IndexBuilt, 0}; // index is pending for build, return the column data bytes return {LocalIndexState::IndexPending, col_stat.data_bytes}; } -std::optional DMFileMetaV2::getLocalIndex(ColId col_id, IndexID index_id) const +std::optional DMFileMetaV2::getLocalIndex(ColId col_id, IndexID index_id) const { // acquire a lock on meta to ensure the atomically on col_stat.vector_index std::scoped_lock lock(mtx_bump); @@ -463,11 +465,12 @@ std::optional DMFileMetaV2::getLocalIndex(ColId col_ return std::nullopt; const auto & col_stat = it->second; - for (const auto & vec_idx : col_stat.vector_index) + for (const auto & idx : col_stat.indexes) { - if (vec_idx.index_id() == index_id) - return vec_idx; + if (idx.index_props().index_id() == index_id) + return idx; } return std::nullopt; } + } // namespace DB::DM diff --git a/dbms/src/Storages/DeltaMerge/File/DMFileMetaV2.h b/dbms/src/Storages/DeltaMerge/File/DMFileMetaV2.h index 2fadf6d9f72..d3612c945d0 100644 --- a/dbms/src/Storages/DeltaMerge/File/DMFileMetaV2.h +++ b/dbms/src/Storages/DeltaMerge/File/DMFileMetaV2.h @@ -105,7 +105,7 @@ class DMFileMetaV2 : public DMFileMeta public: std::tuple getLocalIndexState(ColId col_id, IndexID index_id) const override; - std::optional getLocalIndex(ColId col_id, IndexID index_id) const override; + std::optional getLocalIndex(ColId col_id, IndexID index_id) const override; public: UInt32 metaVersion() const override { return meta_version; } diff --git a/dbms/src/Storages/DeltaMerge/File/DMFileVectorIndexReader.cpp b/dbms/src/Storages/DeltaMerge/File/DMFileVectorIndexReader.cpp index 6e1f987ddbc..a687ff3ec73 100644 --- a/dbms/src/Storages/DeltaMerge/File/DMFileVectorIndexReader.cpp +++ b/dbms/src/Storages/DeltaMerge/File/DMFileVectorIndexReader.cpp @@ -65,12 +65,13 @@ void DMFileVectorIndexReader::loadVectorIndex() // Check vector index exists on the column auto vector_index = dmfile->getLocalIndex(col_id, index_id); RUNTIME_CHECK(vector_index.has_value(), col_id, index_id); - perf_stat.index_size = vector_index->index_bytes(); + RUNTIME_CHECK(vector_index->index_props().kind() == dtpb::IndexFileKind::VECTOR_INDEX); + RUNTIME_CHECK(vector_index->index_props().has_vector_index()); // If local file is invalidated, cache is not valid anymore. So we // need to ensure file exists on local fs first. const auto index_file_path = index_id > 0 // - ? dmfile->vectorIndexPath(index_id) // + ? dmfile->localIndexPath(index_id, TiDB::ColumnarIndexKind::Vector) // : dmfile->colIndexPath(DMFile::getFileNameBase(col_id)); String local_index_file_path; if (auto s3_file_name = S3::S3FilenameView::fromKeyWithPrefix(index_file_path); s3_file_name.isValid()) @@ -88,7 +89,9 @@ void DMFileVectorIndexReader::loadVectorIndex() { try { - if (auto file_guard = file_cache->downloadFileForLocalRead(s3_file_name, vector_index->index_bytes()); + if (auto file_guard = file_cache->downloadFileForLocalRead( // + s3_file_name, + vector_index->index_props().file_size()); file_guard) { local_index_file_path = file_guard->getLocalFileName(); @@ -122,7 +125,7 @@ void DMFileVectorIndexReader::loadVectorIndex() auto load_from_file = [&]() { perf_stat.has_load_from_file = true; - return VectorIndexViewer::view(*vector_index, local_index_file_path); + return VectorIndexViewer::view(vector_index->index_props().vector_index(), local_index_file_path); }; Stopwatch watch; diff --git a/dbms/src/Storages/DeltaMerge/File/DMFileVectorIndexReader.h b/dbms/src/Storages/DeltaMerge/File/DMFileVectorIndexReader.h index 3c8a3630e0b..1cdfb20ae3e 100644 --- a/dbms/src/Storages/DeltaMerge/File/DMFileVectorIndexReader.h +++ b/dbms/src/Storages/DeltaMerge/File/DMFileVectorIndexReader.h @@ -17,6 +17,7 @@ #include #include #include +#include #include namespace DB::DM diff --git a/dbms/src/Storages/DeltaMerge/File/DMFileWriter.cpp b/dbms/src/Storages/DeltaMerge/File/DMFileWriter.cpp index 43300ce3cda..2cdc14bbace 100644 --- a/dbms/src/Storages/DeltaMerge/File/DMFileWriter.cpp +++ b/dbms/src/Storages/DeltaMerge/File/DMFileWriter.cpp @@ -72,7 +72,7 @@ DMFileWriter::DMFileWriter( .type = cd.type, .avg_size = 0, // ... here ignore some fields with default initializers - .vector_index = {}, + .indexes = {}, #ifndef NDEBUG .additional_data_for_test = {}, #endif diff --git a/dbms/src/Storages/DeltaMerge/Index/LocalIndexInfo.cpp b/dbms/src/Storages/DeltaMerge/Index/LocalIndexInfo.cpp index d73291e8202..019463637a6 100644 --- a/dbms/src/Storages/DeltaMerge/Index/LocalIndexInfo.cpp +++ b/dbms/src/Storages/DeltaMerge/Index/LocalIndexInfo.cpp @@ -21,21 +21,20 @@ namespace DB::FailPoints { -extern const char force_not_support_vector_index[]; +extern const char force_not_support_local_index[]; } // namespace DB::FailPoints namespace DB::DM { -bool isVectorIndexSupported(const LoggerPtr & logger) +bool isLocalIndexSupported(const LoggerPtr & logger) { - // Vector Index requires a specific storage format to work. if ((STORAGE_FORMAT_CURRENT.identifier > 0 && STORAGE_FORMAT_CURRENT.identifier < 6) || STORAGE_FORMAT_CURRENT.identifier == 100) { LOG_ERROR( logger, - "The current storage format is {}, which does not support building vector index. TiFlash will " - "write data without vector index.", + "The current storage format is {}, which does not support building columnar index " + "like vector index or full text index. TiFlash will write data without index.", STORAGE_FORMAT_CURRENT.identifier); return false; } @@ -52,7 +51,7 @@ ColumnID getVectorIndxColumnID( return EmptyColumnID; // Vector Index requires a specific storage format to work. - if (unlikely(!isVectorIndexSupported(logger))) + if (unlikely(!isLocalIndexSupported(logger))) return EmptyColumnID; if (idx_info.idx_cols.size() != 1) @@ -98,8 +97,8 @@ LocalIndexInfosChangeset generateLocalIndexInfos( { // If the storage format does not support vector index, always return an empty // index_info. Meaning we should drop all indexes - bool is_storage_format_support = isVectorIndexSupported(logger); - fiu_do_on(FailPoints::force_not_support_vector_index, { is_storage_format_support = false; }); + bool is_storage_format_support = isLocalIndexSupported(logger); + fiu_do_on(FailPoints::force_not_support_local_index, { is_storage_format_support = false; }); if (!is_storage_format_support) return LocalIndexInfosChangeset{ .new_local_index_infos = new_index_infos, @@ -125,7 +124,7 @@ LocalIndexInfosChangeset generateLocalIndexInfos( for (const auto & idx : new_table_info.index_infos) { - if (!idx.vector_index) + if (!idx.hasColumnarIndex()) continue; const auto column_id = getVectorIndxColumnID(new_table_info, idx, logger); @@ -138,10 +137,11 @@ LocalIndexInfosChangeset generateLocalIndexInfos( { // create a new index new_index_infos->emplace_back(LocalIndexInfo{ - .type = IndexType::Vector, + .kind = idx.columnarIndexKind(), .index_id = idx.id, .column_id = column_id, - .index_definition = idx.vector_index, + // Only one of the below will be set + .def_vector_index = idx.vector_index, }); newly_added.emplace_back(idx.id); index_ids_in_new_table.emplace(idx.id); diff --git a/dbms/src/Storages/DeltaMerge/Index/LocalIndexInfo.h b/dbms/src/Storages/DeltaMerge/Index/LocalIndexInfo.h index 68f3fd82111..d69d183835f 100644 --- a/dbms/src/Storages/DeltaMerge/Index/LocalIndexInfo.h +++ b/dbms/src/Storages/DeltaMerge/Index/LocalIndexInfo.h @@ -14,14 +14,16 @@ #pragma once +#include +#include +#include #include +#include #include namespace TiDB { struct TableInfo; -struct ColumnInfo; -struct IndexInfo; } // namespace TiDB namespace DB @@ -31,26 +33,28 @@ using LoggerPtr = std::shared_ptr; } // namespace DB namespace DB::DM { -enum class IndexType -{ - Vector = 1, -}; struct LocalIndexInfo { - IndexType type; + TiDB::ColumnarIndexKind kind; // If the index is defined on TiDB::ColumnInfo, use EmptyIndexID as index_id IndexID index_id = DB::EmptyIndexID; // Which column_id the index is built on ColumnID column_id = DB::EmptyColumnID; - // Now we only support vector index. - // In the future, we may support more types of indexes, using std::variant. - TiDB::VectorIndexDefinitionPtr index_definition; -}; -using LocalIndexInfos = std::vector; -using LocalIndexInfosPtr = std::shared_ptr; -using LocalIndexInfosSnapshot = std::shared_ptr; + TiDB::VectorIndexDefinitionPtr def_vector_index = nullptr; + + dtpb::IndexFileKind getKindAsDtpb() const + { + switch (kind) + { + case TiDB::ColumnarIndexKind::Vector: + return dtpb::IndexFileKind::VECTOR_INDEX; + default: + RUNTIME_CHECK_MSG(false, "Unsupported index kind: {}", magic_enum::enum_name(kind)); + } + } +}; LocalIndexInfosPtr initLocalIndexInfos(const TiDB::TableInfo & table_info, const LoggerPtr & logger); diff --git a/dbms/src/Storages/DeltaMerge/Index/LocalIndexInfo_fwd.h b/dbms/src/Storages/DeltaMerge/Index/LocalIndexInfo_fwd.h new file mode 100644 index 00000000000..f230f0be2e9 --- /dev/null +++ b/dbms/src/Storages/DeltaMerge/Index/LocalIndexInfo_fwd.h @@ -0,0 +1,30 @@ +// Copyright 2025 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace DB::DM +{ + +struct LocalIndexInfo; + +using LocalIndexInfos = std::vector; +using LocalIndexInfosPtr = std::shared_ptr; +using LocalIndexInfosSnapshot = std::shared_ptr; + +struct LocalIndexInfosChangeset; + +} // namespace DB::DM diff --git a/dbms/src/Storages/DeltaMerge/Index/VectorIndex.cpp b/dbms/src/Storages/DeltaMerge/Index/VectorIndex.cpp index cfdce81a80e..4896856aa46 100644 --- a/dbms/src/Storages/DeltaMerge/Index/VectorIndex.cpp +++ b/dbms/src/Storages/DeltaMerge/Index/VectorIndex.cpp @@ -39,7 +39,7 @@ bool VectorIndexBuilder::isSupportedType(const IDataType & type) return checkDataTypeArray(&type); } -VectorIndexBuilderPtr VectorIndexBuilder::create(IndexID index_id, const TiDB::VectorIndexDefinitionPtr & definition) +VectorIndexBuilderPtr VectorIndexBuilder::create(const TiDB::VectorIndexDefinitionPtr & definition) { RUNTIME_CHECK(definition->dimension > 0); RUNTIME_CHECK(definition->dimension <= TiDB::MAX_VECTOR_DIMENSION); @@ -47,52 +47,28 @@ VectorIndexBuilderPtr VectorIndexBuilder::create(IndexID index_id, const TiDB::V switch (definition->kind) { case tipb::VectorIndexKind::HNSW: - return std::make_shared(index_id, definition); + return std::make_shared(definition); default: throw Exception( // ErrorCodes::INCORRECT_QUERY, - "Unsupported vector index, index_id={} def={}", - index_id, + "Unsupported vector index, def={}", tipb::VectorIndexKind_Name(definition->kind)); } } -VectorIndexViewerPtr VectorIndexViewer::view(const dtpb::VectorIndexFileProps & file_props, std::string_view path) +VectorIndexViewerPtr VectorIndexViewer::view(const dtpb::IndexFilePropsV2Vector & file_props, std::string_view path) { RUNTIME_CHECK(file_props.dimensions() > 0); RUNTIME_CHECK(file_props.dimensions() <= TiDB::MAX_VECTOR_DIMENSION); - - tipb::VectorIndexKind kind; - RUNTIME_CHECK(tipb::VectorIndexKind_Parse(file_props.index_kind(), &kind)); - - switch (kind) - { - case tipb::VectorIndexKind::HNSW: - return VectorIndexHNSWViewer::view(file_props, path); - default: - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unsupported vector index {}", file_props.index_kind()); - } + return VectorIndexHNSWViewer::view(file_props, path); } -VectorIndexViewerPtr VectorIndexViewer::load(const dtpb::VectorIndexFileProps & file_props, ReadBuffer & buf) +VectorIndexViewerPtr VectorIndexViewer::load(const dtpb::IndexFilePropsV2Vector & file_props, ReadBuffer & buf) { RUNTIME_CHECK(file_props.dimensions() > 0); RUNTIME_CHECK(file_props.dimensions() <= TiDB::MAX_VECTOR_DIMENSION); - - tipb::VectorIndexKind kind; - RUNTIME_CHECK(tipb::VectorIndexKind_Parse(file_props.index_kind(), &kind)); - - switch (kind) - { - case tipb::VectorIndexKind::HNSW: - return VectorIndexHNSWViewer::load(file_props, buf); - default: - throw Exception( // - ErrorCodes::INCORRECT_QUERY, - "Unsupported vector index {}", - file_props.index_kind()); - } + return VectorIndexHNSWViewer::load(file_props, buf); } } // namespace DB::DM diff --git a/dbms/src/Storages/DeltaMerge/Index/VectorIndex.h b/dbms/src/Storages/DeltaMerge/Index/VectorIndex.h index c6affd5ad85..adfa593be1d 100644 --- a/dbms/src/Storages/DeltaMerge/Index/VectorIndex.h +++ b/dbms/src/Storages/DeltaMerge/Index/VectorIndex.h @@ -39,14 +39,13 @@ class VectorIndexBuilder using ProceedCheckFn = std::function; public: - static VectorIndexBuilderPtr create(IndexID index_id, const TiDB::VectorIndexDefinitionPtr & definition); + static VectorIndexBuilderPtr create(const TiDB::VectorIndexDefinitionPtr & definition); static bool isSupportedType(const IDataType & type); public: - explicit VectorIndexBuilder(IndexID index_id_, const TiDB::VectorIndexDefinitionPtr & definition_) - : index_id(index_id_) - , definition(definition_) + explicit VectorIndexBuilder(const TiDB::VectorIndexDefinitionPtr & definition_) + : definition(definition_) {} virtual ~VectorIndexBuilder() = default; @@ -61,7 +60,6 @@ class VectorIndexBuilder virtual void saveToBuffer(WriteBuffer & write_buf) const = 0; public: - const IndexID index_id; const TiDB::VectorIndexDefinitionPtr definition; }; @@ -85,11 +83,11 @@ class VectorIndexViewer using RowFilter = BitmapFilterView; public: - static VectorIndexViewerPtr view(const dtpb::VectorIndexFileProps & file_props, std::string_view path); - static VectorIndexViewerPtr load(const dtpb::VectorIndexFileProps & file_props, ReadBuffer & buf); + static VectorIndexViewerPtr view(const dtpb::IndexFilePropsV2Vector & file_props, std::string_view path); + static VectorIndexViewerPtr load(const dtpb::IndexFilePropsV2Vector & file_props, ReadBuffer & buf); public: - explicit VectorIndexViewer(const dtpb::VectorIndexFileProps & file_props_) + explicit VectorIndexViewer(const dtpb::IndexFilePropsV2Vector & file_props_) : file_props(file_props_) {} @@ -104,7 +102,7 @@ class VectorIndexViewer virtual void get(Key key, std::vector & out) const = 0; public: - const dtpb::VectorIndexFileProps file_props; + const dtpb::IndexFilePropsV2Vector file_props; }; } // namespace DB::DM diff --git a/dbms/src/Storages/DeltaMerge/Index/VectorIndexCache.h b/dbms/src/Storages/DeltaMerge/Index/VectorIndexCache.h index c76ab4f6ed4..cd24a94a563 100644 --- a/dbms/src/Storages/DeltaMerge/Index/VectorIndexCache.h +++ b/dbms/src/Storages/DeltaMerge/Index/VectorIndexCache.h @@ -15,6 +15,7 @@ #pragma once #include +#include #include #include diff --git a/dbms/src/Storages/DeltaMerge/Index/VectorIndexCache_fwd.h b/dbms/src/Storages/DeltaMerge/Index/VectorIndexCache_fwd.h new file mode 100644 index 00000000000..80c951d6992 --- /dev/null +++ b/dbms/src/Storages/DeltaMerge/Index/VectorIndexCache_fwd.h @@ -0,0 +1,25 @@ +// Copyright 2025 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace DB::DM +{ + +class VectorIndexCache; +using VectorIndexCachePtr = std::shared_ptr; + +} // namespace DB::DM diff --git a/dbms/src/Storages/DeltaMerge/Index/VectorIndexHNSW/Index.cpp b/dbms/src/Storages/DeltaMerge/Index/VectorIndexHNSW/Index.cpp index 77a4a440baa..33877ba370b 100644 --- a/dbms/src/Storages/DeltaMerge/Index/VectorIndexHNSW/Index.cpp +++ b/dbms/src/Storages/DeltaMerge/Index/VectorIndexHNSW/Index.cpp @@ -55,8 +55,8 @@ unum::usearch::metric_kind_t getUSearchMetricKind(tipb::VectorDistanceMetric d) } } -VectorIndexHNSWBuilder::VectorIndexHNSWBuilder(IndexID index_id_, const TiDB::VectorIndexDefinitionPtr & definition_) - : VectorIndexBuilder(index_id_, definition_) +VectorIndexHNSWBuilder::VectorIndexHNSWBuilder(const TiDB::VectorIndexDefinitionPtr & definition_) + : VectorIndexBuilder(definition_) , index(USearchImplType::make(unum::usearch::metric_punned_t( // definition_->dimension, getUSearchMetricKind(definition->distance_metric)))) @@ -165,10 +165,8 @@ tipb::VectorIndexKind VectorIndexHNSWBuilder::kind() return tipb::VectorIndexKind::HNSW; } -VectorIndexViewerPtr VectorIndexHNSWViewer::view(const dtpb::VectorIndexFileProps & file_props, std::string_view path) +VectorIndexViewerPtr VectorIndexHNSWViewer::view(const dtpb::IndexFilePropsV2Vector & file_props, std::string_view path) { - RUNTIME_CHECK(file_props.index_kind() == tipb::VectorIndexKind_Name(kind())); - tipb::VectorDistanceMetric metric; RUNTIME_CHECK(tipb::VectorDistanceMetric_Parse(file_props.distance_metric(), &metric)); RUNTIME_CHECK(metric != tipb::VectorDistanceMetric::INVALID_DISTANCE_METRIC); @@ -206,10 +204,8 @@ VectorIndexViewerPtr VectorIndexHNSWViewer::view(const dtpb::VectorIndexFileProp return vi; } -VectorIndexViewerPtr VectorIndexHNSWViewer::load(const dtpb::VectorIndexFileProps & file_props, ReadBuffer & buf) +VectorIndexViewerPtr VectorIndexHNSWViewer::load(const dtpb::IndexFilePropsV2Vector & file_props, ReadBuffer & buf) { - RUNTIME_CHECK(file_props.index_kind() == tipb::VectorIndexKind_Name(kind())); - tipb::VectorDistanceMetric metric; RUNTIME_CHECK(tipb::VectorDistanceMetric_Parse(file_props.distance_metric(), &metric)); RUNTIME_CHECK(metric != tipb::VectorDistanceMetric::INVALID_DISTANCE_METRIC); @@ -339,7 +335,7 @@ void VectorIndexHNSWViewer::get(Key key, std::vector & out) const index.get(key, out.data()); } -VectorIndexHNSWViewer::VectorIndexHNSWViewer(const dtpb::VectorIndexFileProps & props) +VectorIndexHNSWViewer::VectorIndexHNSWViewer(const dtpb::IndexFilePropsV2Vector & props) : VectorIndexViewer(props) { GET_METRIC(tiflash_vector_index_active_instances, type_view).Increment(); diff --git a/dbms/src/Storages/DeltaMerge/Index/VectorIndexHNSW/Index.h b/dbms/src/Storages/DeltaMerge/Index/VectorIndexHNSW/Index.h index 5683334ba35..f1c136749b4 100644 --- a/dbms/src/Storages/DeltaMerge/Index/VectorIndexHNSW/Index.h +++ b/dbms/src/Storages/DeltaMerge/Index/VectorIndexHNSW/Index.h @@ -29,7 +29,7 @@ class VectorIndexHNSWBuilder : public VectorIndexBuilder public: static tipb::VectorIndexKind kind(); - explicit VectorIndexHNSWBuilder(IndexID index_id_, const TiDB::VectorIndexDefinitionPtr & definition_); + explicit VectorIndexHNSWBuilder(const TiDB::VectorIndexDefinitionPtr & definition_); ~VectorIndexHNSWBuilder() override; @@ -49,12 +49,12 @@ class VectorIndexHNSWBuilder : public VectorIndexBuilder class VectorIndexHNSWViewer : public VectorIndexViewer { public: - static VectorIndexViewerPtr view(const dtpb::VectorIndexFileProps & props, std::string_view path); - static VectorIndexViewerPtr load(const dtpb::VectorIndexFileProps & file_props, ReadBuffer & buf); + static VectorIndexViewerPtr view(const dtpb::IndexFilePropsV2Vector & props, std::string_view path); + static VectorIndexViewerPtr load(const dtpb::IndexFilePropsV2Vector & file_props, ReadBuffer & buf); static tipb::VectorIndexKind kind(); - explicit VectorIndexHNSWViewer(const dtpb::VectorIndexFileProps & props); + explicit VectorIndexHNSWViewer(const dtpb::IndexFilePropsV2Vector & props); ~VectorIndexHNSWViewer() override; diff --git a/dbms/src/Storages/DeltaMerge/Index/VectorIndex_fwd.h b/dbms/src/Storages/DeltaMerge/Index/VectorIndex_fwd.h index 131715302e5..b785b4db85e 100644 --- a/dbms/src/Storages/DeltaMerge/Index/VectorIndex_fwd.h +++ b/dbms/src/Storages/DeltaMerge/Index/VectorIndex_fwd.h @@ -27,7 +27,4 @@ using VectorIndexBuilderPtr = std::shared_ptr; class VectorIndexViewer; using VectorIndexViewerPtr = std::shared_ptr; -class VectorIndexCache; -using VectorIndexCachePtr = std::shared_ptr; - } // namespace DB::DM diff --git a/dbms/src/Storages/DeltaMerge/Remote/Proto/remote.proto b/dbms/src/Storages/DeltaMerge/Remote/Proto/remote.proto index 64d904404e4..be2747ce2ec 100644 --- a/dbms/src/Storages/DeltaMerge/Remote/Proto/remote.proto +++ b/dbms/src/Storages/DeltaMerge/Remote/Proto/remote.proto @@ -74,25 +74,6 @@ message ColumnFileInMemory { uint64 rows = 3; } -// Note: This message is something different to VectorIndexDefinition. -// VectorIndexDefinition defines an index, comes from table DDL. -// It includes information about how index should be constructed, -// for example, it contains HNSW's 'efConstruct' parameter. -// However, VectorIndexFileProps provides information for read out the index, -// for example, very basic information about what the index is, and how it is stored. -message VectorIndexFileProps { - string index_kind = 1; // The value is tipb.VectorIndexKind - string distance_metric = 2; // The value is tipb.VectorDistanceMetric - uint64 dimensions = 3; - int64 index_id = 4; - uint64 index_bytes = 5; -} - -message IndexInfo { - uint64 index_page_id = 1; // which page it stores - VectorIndexFileProps vector_index = 2; // vector index file properties, needed when read -} - message ColumnFileTiny { uint64 page_id = 1; uint64 page_size = 2; @@ -104,7 +85,8 @@ message ColumnFileTiny { uint64 rows = 6; uint64 bytes = 7; - repeated IndexInfo indexes = 8; + // Serialized in column_file.ColumnFileIndexInfo + repeated bytes indexes = 8; } message ColumnFileBig { diff --git a/dbms/src/Storages/DeltaMerge/Remote/Serializer.cpp b/dbms/src/Storages/DeltaMerge/Remote/Serializer.cpp index 26cde111db5..6ed136b9b74 100644 --- a/dbms/src/Storages/DeltaMerge/Remote/Serializer.cpp +++ b/dbms/src/Storages/DeltaMerge/Remote/Serializer.cpp @@ -358,18 +358,8 @@ RemotePb::ColumnFileRemote Serializer::serializeCFTiny( for (const auto & index_info : *cf_tiny.index_infos) { - auto * index_pb = remote_tiny->add_indexes(); - index_pb->set_index_page_id(index_info.index_page_id); - if (index_info.vector_index.has_value()) - { - RemotePb::VectorIndexFileProps index_props; - index_props.set_index_kind(index_info.vector_index->index_kind()); - index_props.set_distance_metric(index_info.vector_index->distance_metric()); - index_props.set_dimensions(index_info.vector_index->dimensions()); - index_props.set_index_id(index_info.vector_index->index_id()); - index_props.set_index_bytes(index_info.vector_index->index_bytes()); - index_pb->mutable_vector_index()->Swap(&index_props); - } + auto serialized = index_info.SerializeAsString(); + remote_tiny->add_indexes(std::move(serialized)); } // TODO: read the checkpoint info from data_provider and send it to the compute node @@ -391,18 +381,10 @@ ColumnFileTinyPtr Serializer::deserializeCFTiny(const DMContext & dm_context, co index_infos->reserve(proto.indexes().size()); for (const auto & index_pb : proto.indexes()) { - if (index_pb.has_vector_index()) - { - dtpb::VectorIndexFileProps index_props; - index_props.set_index_kind(index_pb.vector_index().index_kind()); - index_props.set_distance_metric(index_pb.vector_index().distance_metric()); - index_props.set_dimensions(index_pb.vector_index().dimensions()); - index_props.set_index_id(index_pb.vector_index().index_id()); - index_props.set_index_bytes(index_pb.vector_index().index_bytes()); - index_infos->emplace_back(index_pb.index_page_id(), index_props); - } - else - index_infos->emplace_back(index_pb.index_page_id(), std::nullopt); + auto index_info = dtpb::ColumnFileIndexInfo{}; + auto ok = index_info.ParseFromString(index_pb); + RUNTIME_CHECK_MSG(ok, "Failed to parse ColumnFileIndexInfo from proto"); + index_infos->emplace_back(std::move(index_info)); } auto cf = std::make_shared( diff --git a/dbms/src/Storages/DeltaMerge/SegmentReadTask.cpp b/dbms/src/Storages/DeltaMerge/SegmentReadTask.cpp index afe4a2b076e..10ed1b6326f 100644 --- a/dbms/src/Storages/DeltaMerge/SegmentReadTask.cpp +++ b/dbms/src/Storages/DeltaMerge/SegmentReadTask.cpp @@ -137,12 +137,11 @@ SegmentReadTask::SegmentReadTask( { for (const auto & index_info : *index_infos) { - if (index_info.vector_index) - { - remote_page_ids.emplace_back(index_info.index_page_id); - remote_page_sizes.emplace_back(index_info.vector_index->index_bytes()); - ++count; - } + RUNTIME_CHECK(index_info.has_index_page_id()); + RUNTIME_CHECK(index_info.index_props().has_kind()); + remote_page_ids.emplace_back(index_info.index_page_id()); + remote_page_sizes.emplace_back(index_info.index_props().file_size()); + ++count; } } } diff --git a/dbms/src/Storages/DeltaMerge/dtpb/column_file.proto b/dbms/src/Storages/DeltaMerge/dtpb/column_file.proto index 34b63372a8f..95dd2e32058 100644 --- a/dbms/src/Storages/DeltaMerge/dtpb/column_file.proto +++ b/dbms/src/Storages/DeltaMerge/dtpb/column_file.proto @@ -12,11 +12,19 @@ // See the License for the specific language governing permissions and // limitations under the License. +// Note: The `required` and `optional` keyword in this file does not make any sense. +// -- For example, a field marked `optional` could be possibly required at the application level. +// The reason why we keep them is to keep the compatibility with the old code. +// Don't introduce required fields any more!! +// New fields should use optional instead. +// See https://protobuf.dev/programming-guides/proto2/ + syntax = "proto2"; package dtpb; -import "vector_index.proto"; // for VectorIndexFileProps +import "index_file_deprecated.proto"; +import "index_file.proto"; // for IndexFilePropsV2 message ColumnFileBig { required uint64 id = 1; @@ -42,9 +50,10 @@ message ColumnSchema { required string column_type = 3; } -message IndexInfo { +message ColumnFileIndexInfo { required uint64 index_page_id = 1; // which page it stores - optional VectorIndexFileProps vector_index = 2; // vector index file properties, needed when read + optional VectorIndexFilePropsV1Deprecated deprecated_vector_index = 2; // deprecated. See field 3. + optional IndexFilePropsV2 index_props = 3; } message ColumnFileTiny { @@ -52,7 +61,7 @@ message ColumnFileTiny { required uint64 id = 2; required uint64 rows = 3; required uint64 bytes = 4; - repeated IndexInfo indexes = 5; + repeated ColumnFileIndexInfo indexes = 5; } enum ColumnFileType { diff --git a/dbms/src/Storages/DeltaMerge/dtpb/dmfile.proto b/dbms/src/Storages/DeltaMerge/dtpb/dmfile.proto index ca2cf1dbcc3..8191bc7a370 100644 --- a/dbms/src/Storages/DeltaMerge/dtpb/dmfile.proto +++ b/dbms/src/Storages/DeltaMerge/dtpb/dmfile.proto @@ -16,7 +16,8 @@ syntax = "proto2"; package dtpb; -import "vector_index.proto"; // for VectorIndexFileProps +import "index_file_deprecated.proto"; +import "index_file.proto"; // for IndexFilePropsV2 message PackProperty { // when gc_safe_point exceed this version, there must be some data obsolete in this pack @@ -66,8 +67,18 @@ message ColumnStat { // Only used in tests. Modifying other fields of ColumnStat is hard. optional string additional_data_for_test = 101; - optional VectorIndexFileProps vector_index = 102; - repeated VectorIndexFileProps vector_indexes = 103; + optional VectorIndexFilePropsV1Deprecated deprecated_vector_index = 102; // deprecated. See field 104. + repeated VectorIndexFilePropsV1Deprecated deprecated_vector_indexes = 103; // deprecated. See field 104. + repeated DMFileIndexInfo indexes = 104; +} + +message DMFileIndexInfo { + // Currently it only contains nothing more than IndexFilePropsV2. + // However let's keep using this standalone message, because IndexFilePropsV2 + // is also used by ColumnFile and there could be possibly something only + // needed by DMFile but not needed by ColumnFile. + + optional IndexFilePropsV2 index_props = 1; } message ColumnStats { diff --git a/dbms/src/Storages/DeltaMerge/dtpb/index_file.proto b/dbms/src/Storages/DeltaMerge/dtpb/index_file.proto new file mode 100644 index 00000000000..4ec2b1539ae --- /dev/null +++ b/dbms/src/Storages/DeltaMerge/dtpb/index_file.proto @@ -0,0 +1,44 @@ +// Copyright 2025 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto2"; + +package dtpb; + +enum IndexFileKind { + INVALID = 0; + VECTOR_INDEX = 1; +} + +message IndexFilePropsV2 { + optional IndexFileKind kind = 1; + optional int64 index_id = 2; // For which index ID + optional uint64 file_size = 3; + + oneof prop { + IndexFilePropsV2Vector vector_index = 31; + } +} + +// Note: This message is something different to VectorIndexDefinition. +// VectorIndexDefinition defines an index, comes from table DDL. +// It includes information about how index should be constructed, +// for example, it contains HNSW's 'efConstruct' parameter. +// However, IndexFilePropsV2Vector provides information for read out the index, +// for example, very basic information about what the index is, and how it is stored. +message IndexFilePropsV2Vector { + optional uint32 format_version = 1; // Currently it must be 0. + optional string distance_metric = 2; // The value is tipb.VectorDistanceMetric + optional uint64 dimensions = 3; +} diff --git a/dbms/src/Storages/DeltaMerge/dtpb/index_file_deprecated.proto b/dbms/src/Storages/DeltaMerge/dtpb/index_file_deprecated.proto new file mode 100644 index 00000000000..8d4bf3eafe9 --- /dev/null +++ b/dbms/src/Storages/DeltaMerge/dtpb/index_file_deprecated.proto @@ -0,0 +1,34 @@ +// Copyright 2025 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto2"; + +package dtpb; + +// Note: This message is something different to VectorIndexDefinition. +// VectorIndexDefinition defines an index, comes from table DDL. +// It includes information about how index should be constructed, +// for example, it contains HNSW's 'efConstruct' parameter. +// However, VectorIndexFilePropsV1Deprecated provides information for read out the index, +// for example, very basic information about what the index is, and how it is stored. +message VectorIndexFilePropsV1Deprecated { + // This message is deprecated. + // Dont generate new message with it. + + optional string index_kind = 1; // The value is tipb.VectorIndexKind + optional string distance_metric = 2; // The value is tipb.VectorDistanceMetric + optional uint64 dimensions = 3; + optional int64 index_id = 4; // deprecated. Don't use it. + optional uint64 index_bytes = 5; // deprecated. Don't use it. +} diff --git a/dbms/src/Storages/DeltaMerge/tests/gtest_column_file_clone.cpp b/dbms/src/Storages/DeltaMerge/tests/gtest_column_file_clone.cpp index b29abae5878..b43e4d83e67 100644 --- a/dbms/src/Storages/DeltaMerge/tests/gtest_column_file_clone.cpp +++ b/dbms/src/Storages/DeltaMerge/tests/gtest_column_file_clone.cpp @@ -36,16 +36,21 @@ TEST_F(ColumnFileCloneTest, CloneColumnFileTinyWithVectorIndex) String mock_index_page_content = "mock_index_page_content"; wbs.log.putPage(index_page_id, 0, std::string_view{mock_index_page_content.data(), mock_index_page_content.size()}); - dtpb::VectorIndexFileProps index_props; - index_props.set_index_kind(tipb::VectorIndexKind_Name(tipb::VectorIndexKind::HNSW)); - index_props.set_distance_metric(tipb::VectorDistanceMetric_Name(tipb::VectorDistanceMetric::L2)); - index_props.set_dimensions(1); - index_props.set_index_id(1); - index_props.set_index_bytes(10); std::vector persisted_files; { auto index_infos = std::make_shared(); - index_infos->emplace_back(data_page_id, index_props); + auto index_info = dtpb::ColumnFileIndexInfo{}; + index_info.set_index_page_id(index_page_id); + auto * index_props = index_info.mutable_index_props(); + index_props->set_kind(dtpb::IndexFileKind::VECTOR_INDEX); + index_props->set_index_id(1); + index_props->set_file_size(10); + auto * vec_props = index_props->mutable_vector_index(); + vec_props->set_format_version(0); + vec_props->set_dimensions(1); + vec_props->set_distance_metric(tipb::VectorDistanceMetric_Name(tipb::VectorDistanceMetric::L2)); + index_infos->emplace_back(std::move(index_info)); + auto file = std::make_shared(nullptr, 1, 10, data_page_id, *dm_context, index_infos); persisted_files.emplace_back(std::move(file)); } @@ -62,15 +67,16 @@ TEST_F(ColumnFileCloneTest, CloneColumnFileTinyWithVectorIndex) ASSERT_EQ(new_tiny_file->getIndexInfos()->size(), 1); auto new_index_info = new_tiny_file->getIndexInfos()->at(0); // Different index page id - ASSERT_NE(new_index_info.index_page_id, index_page_id); + ASSERT_NE(new_index_info.index_page_id(), index_page_id); // Same index properties - ASSERT_EQ(new_index_info.vector_index->index_bytes(), 10); - ASSERT_EQ(new_index_info.vector_index->index_id(), 1); - ASSERT_EQ(new_index_info.vector_index->index_kind(), tipb::VectorIndexKind_Name(tipb::VectorIndexKind::HNSW)); + ASSERT_EQ(new_index_info.index_props().kind(), dtpb::IndexFileKind::VECTOR_INDEX); + ASSERT_EQ(new_index_info.index_props().index_id(), 1); + ASSERT_EQ(new_index_info.index_props().file_size(), 10); + ASSERT_EQ(new_index_info.index_props().vector_index().format_version(), 0); ASSERT_EQ( - new_index_info.vector_index->distance_metric(), + new_index_info.index_props().vector_index().distance_metric(), tipb::VectorDistanceMetric_Name(tipb::VectorDistanceMetric::L2)); - ASSERT_EQ(new_index_info.vector_index->dimensions(), 1); + ASSERT_EQ(new_index_info.index_props().vector_index().dimensions(), 1); // Check the data page and index page content auto storage_snap = std::make_shared( diff --git a/dbms/src/Storages/DeltaMerge/tests/gtest_dm_delta_merge_store_vector_index.cpp b/dbms/src/Storages/DeltaMerge/tests/gtest_dm_delta_merge_store_vector_index.cpp index 0072eb0bd89..e3aaf3ced89 100644 --- a/dbms/src/Storages/DeltaMerge/tests/gtest_dm_delta_merge_store_vector_index.cpp +++ b/dbms/src/Storages/DeltaMerge/tests/gtest_dm_delta_merge_store_vector_index.cpp @@ -780,12 +780,12 @@ try ASSERT_NE(local_index_snap, nullptr); ASSERT_EQ(local_index_snap->size(), 1); const auto & index = (*local_index_snap)[0]; - ASSERT_EQ(index.type, IndexType::Vector); + ASSERT_EQ(index.kind, TiDB::ColumnarIndexKind::Vector); ASSERT_EQ(index.index_id, EmptyIndexID); ASSERT_EQ(index.column_id, vec_column_id); - ASSERT_EQ(index.index_definition->kind, tipb::VectorIndexKind::HNSW); - ASSERT_EQ(index.index_definition->dimension, 1); - ASSERT_EQ(index.index_definition->distance_metric, tipb::VectorDistanceMetric::L2); + ASSERT_EQ(index.def_vector_index->kind, tipb::VectorIndexKind::HNSW); + ASSERT_EQ(index.def_vector_index->dimension, 1); + ASSERT_EQ(index.def_vector_index->distance_metric, tipb::VectorDistanceMetric::L2); } const size_t num_rows_write = 128; @@ -817,12 +817,12 @@ try ASSERT_NE(local_index_snap, nullptr); ASSERT_EQ(local_index_snap->size(), 1); const auto & index = (*local_index_snap)[0]; - ASSERT_EQ(index.type, IndexType::Vector); + ASSERT_EQ(index.kind, TiDB::ColumnarIndexKind::Vector); ASSERT_EQ(index.index_id, EmptyIndexID); ASSERT_EQ(index.column_id, vec_column_id); - ASSERT_EQ(index.index_definition->kind, tipb::VectorIndexKind::HNSW); - ASSERT_EQ(index.index_definition->dimension, 1); - ASSERT_EQ(index.index_definition->distance_metric, tipb::VectorDistanceMetric::L2); + ASSERT_EQ(index.def_vector_index->kind, tipb::VectorIndexKind::HNSW); + ASSERT_EQ(index.def_vector_index->dimension, 1); + ASSERT_EQ(index.def_vector_index->distance_metric, tipb::VectorDistanceMetric::L2); } const auto range = RowKeyRange::newAll(store->is_common_handle, store->rowkey_column_size); diff --git a/dbms/src/Storages/DeltaMerge/tests/gtest_dm_vector_index.cpp b/dbms/src/Storages/DeltaMerge/tests/gtest_dm_vector_index.cpp index 79334f6593c..9ee3cc00458 100644 --- a/dbms/src/Storages/DeltaMerge/tests/gtest_dm_vector_index.cpp +++ b/dbms/src/Storages/DeltaMerge/tests/gtest_dm_vector_index.cpp @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #include #include @@ -127,8 +127,8 @@ class VectorIndexDMFileTest DMFilePtr buildIndex(TiDB::VectorIndexDefinition definition) { - auto build_info = DMFileVectorIndexWriter::getLocalIndexBuildInfo(indexInfo(definition), {dm_file}); - DMFileVectorIndexWriter iw(DMFileVectorIndexWriter::Options{ + auto build_info = DMFileLocalIndexWriter::getLocalIndexBuildInfo(indexInfo(definition), {dm_file}); + DMFileLocalIndexWriter iw(DMFileLocalIndexWriter::Options{ .path_pool = path_pool, .index_infos = build_info.indexes_to_build, .dm_files = {dm_file}, @@ -142,8 +142,8 @@ class VectorIndexDMFileTest DMFilePtr buildMultiIndex(const LocalIndexInfosPtr & index_infos) { assert(index_infos != nullptr); - auto build_info = DMFileVectorIndexWriter::getLocalIndexBuildInfo(index_infos, {dm_file}); - DMFileVectorIndexWriter iw(DMFileVectorIndexWriter::Options{ + auto build_info = DMFileLocalIndexWriter::getLocalIndexBuildInfo(index_infos, {dm_file}); + DMFileLocalIndexWriter iw(DMFileLocalIndexWriter::Options{ .path_pool = path_pool, .index_infos = build_info.indexes_to_build, .dm_files = {dm_file}, @@ -536,10 +536,10 @@ try auto index_infos = std::make_shared(LocalIndexInfos{ // index with index_id == 3 LocalIndexInfo{ - .type = IndexType::Vector, + .kind = TiDB::ColumnarIndexKind::Vector, .index_id = 3, .column_id = vec_column_id, - .index_definition = std::make_shared(TiDB::VectorIndexDefinition{ + .def_vector_index = std::make_shared(TiDB::VectorIndexDefinition{ .kind = tipb::VectorIndexKind::HNSW, .dimension = 3, .distance_metric = tipb::VectorDistanceMetric::L2, @@ -547,10 +547,10 @@ try }, // index with index_id == 4 LocalIndexInfo{ - .type = IndexType::Vector, + .kind = TiDB::ColumnarIndexKind::Vector, .index_id = 4, .column_id = vec_column_id, - .index_definition = std::make_shared(TiDB::VectorIndexDefinition{ + .def_vector_index = std::make_shared(TiDB::VectorIndexDefinition{ .kind = tipb::VectorIndexKind::HNSW, .dimension = 3, .distance_metric = tipb::VectorDistanceMetric::COSINE, @@ -558,10 +558,10 @@ try }, // index with index_id == EmptyIndexID, column_id = vec_column_id LocalIndexInfo{ - .type = IndexType::Vector, + .kind = TiDB::ColumnarIndexKind::Vector, .index_id = EmptyIndexID, .column_id = vec_column_id, - .index_definition = std::make_shared(TiDB::VectorIndexDefinition{ + .def_vector_index = std::make_shared(TiDB::VectorIndexDefinition{ .kind = tipb::VectorIndexKind::HNSW, .dimension = 3, .distance_metric = tipb::VectorDistanceMetric::L2, @@ -1876,10 +1876,10 @@ class VectorIndexSegmentOnS3Test auto index_infos = std::make_shared(LocalIndexInfos{ // index with index_id == 3 LocalIndexInfo{ - .type = IndexType::Vector, + .kind = TiDB::ColumnarIndexKind::Vector, .index_id = 3, .column_id = vec_column_id, - .index_definition = std::make_shared(TiDB::VectorIndexDefinition{ + .def_vector_index = std::make_shared(TiDB::VectorIndexDefinition{ .kind = tipb::VectorIndexKind::HNSW, .dimension = 1, .distance_metric = tipb::VectorDistanceMetric::L2, @@ -1887,10 +1887,10 @@ class VectorIndexSegmentOnS3Test }, // index with index_id == 4 LocalIndexInfo{ - .type = IndexType::Vector, + .kind = TiDB::ColumnarIndexKind::Vector, .index_id = 4, .column_id = vec_column_id, - .index_definition = std::make_shared(TiDB::VectorIndexDefinition{ + .def_vector_index = std::make_shared(TiDB::VectorIndexDefinition{ .kind = tipb::VectorIndexKind::HNSW, .dimension = 1, .distance_metric = tipb::VectorDistanceMetric::COSINE, @@ -1898,20 +1898,20 @@ class VectorIndexSegmentOnS3Test }, // index with index_id == EmptyIndexID, column_id = vec_column_id LocalIndexInfo{ - .type = IndexType::Vector, + .kind = TiDB::ColumnarIndexKind::Vector, .index_id = EmptyIndexID, .column_id = vec_column_id, - .index_definition = std::make_shared(TiDB::VectorIndexDefinition{ + .def_vector_index = std::make_shared(TiDB::VectorIndexDefinition{ .kind = tipb::VectorIndexKind::HNSW, .dimension = 1, .distance_metric = tipb::VectorDistanceMetric::L2, }), }, }); - auto build_info = DMFileVectorIndexWriter::getLocalIndexBuildInfo(index_infos, dm_files); + auto build_info = DMFileLocalIndexWriter::getLocalIndexBuildInfo(index_infos, dm_files); // Build multiple index - DMFileVectorIndexWriter iw(DMFileVectorIndexWriter::Options{ + DMFileLocalIndexWriter iw(DMFileLocalIndexWriter::Options{ .path_pool = storage_path_pool, .index_infos = build_info.indexes_to_build, .dm_files = dm_files, diff --git a/dbms/src/Storages/DeltaMerge/tests/gtest_dm_vector_index_utils.h b/dbms/src/Storages/DeltaMerge/tests/gtest_dm_vector_index_utils.h index 9fb34e43031..12e54e6b9cd 100644 --- a/dbms/src/Storages/DeltaMerge/tests/gtest_dm_vector_index_utils.h +++ b/dbms/src/Storages/DeltaMerge/tests/gtest_dm_vector_index_utils.h @@ -23,6 +23,7 @@ #include #include #include +#include namespace DB::DM::tests @@ -85,10 +86,10 @@ class VectorIndexTestUtils { const LocalIndexInfos index_infos = LocalIndexInfos{ LocalIndexInfo{ - .type = IndexType::Vector, + .kind = TiDB::ColumnarIndexKind::Vector, .index_id = EmptyIndexID, .column_id = vec_column_id, - .index_definition = std::make_shared(definition), + .def_vector_index = std::make_shared(definition), }, }; return std::make_shared(index_infos); diff --git a/dbms/src/Storages/DeltaMerge/tests/gtest_local_index_info.cpp b/dbms/src/Storages/DeltaMerge/tests/gtest_local_index_info.cpp index 536fd5c7586..9f82e0b824c 100644 --- a/dbms/src/Storages/DeltaMerge/tests/gtest_local_index_info.cpp +++ b/dbms/src/Storages/DeltaMerge/tests/gtest_local_index_info.cpp @@ -22,7 +22,7 @@ namespace DB::FailPoints { -extern const char force_not_support_vector_index[]; +extern const char force_not_support_local_index[]; } // namespace DB::FailPoints namespace DB::DM::tests { @@ -68,7 +68,7 @@ try table_info.index_infos.emplace_back(expect_idx); } - FailPointHelper::enableFailPoint(FailPoints::force_not_support_vector_index); + FailPointHelper::enableFailPoint(FailPoints::force_not_support_local_index); // check the result when storage format not support auto new_index_info = generateLocalIndexInfos(index_info, table_info, logger).new_local_index_infos; @@ -125,13 +125,13 @@ try ASSERT_NE(new_index_info, nullptr); ASSERT_EQ(new_index_info->size(), 1); const auto & idx = (*new_index_info)[0]; - ASSERT_EQ(IndexType::Vector, idx.type); + ASSERT_EQ(TiDB::ColumnarIndexKind::Vector, idx.kind); ASSERT_EQ(expect_idx.id, idx.index_id); ASSERT_EQ(100, idx.column_id); - ASSERT_NE(nullptr, idx.index_definition); - ASSERT_EQ(expect_idx.vector_index->kind, idx.index_definition->kind); - ASSERT_EQ(expect_idx.vector_index->dimension, idx.index_definition->dimension); - ASSERT_EQ(expect_idx.vector_index->distance_metric, idx.index_definition->distance_metric); + ASSERT_NE(nullptr, idx.def_vector_index); + ASSERT_EQ(expect_idx.vector_index->kind, idx.def_vector_index->kind); + ASSERT_EQ(expect_idx.vector_index->dimension, idx.def_vector_index->dimension); + ASSERT_EQ(expect_idx.vector_index->distance_metric, idx.def_vector_index->distance_metric); // check again, nothing changed, return nullptr ASSERT_EQ(nullptr, generateLocalIndexInfos(new_index_info, table_info, logger).new_local_index_infos); @@ -158,21 +158,21 @@ try ASSERT_NE(new_index_info, nullptr); ASSERT_EQ(new_index_info->size(), 2); const auto & idx0 = (*new_index_info)[0]; - ASSERT_EQ(IndexType::Vector, idx0.type); + ASSERT_EQ(TiDB::ColumnarIndexKind::Vector, idx0.kind); ASSERT_EQ(expect_idx.id, idx0.index_id); ASSERT_EQ(100, idx0.column_id); - ASSERT_NE(nullptr, idx0.index_definition); - ASSERT_EQ(expect_idx.vector_index->kind, idx0.index_definition->kind); - ASSERT_EQ(expect_idx.vector_index->dimension, idx0.index_definition->dimension); - ASSERT_EQ(expect_idx.vector_index->distance_metric, idx0.index_definition->distance_metric); + ASSERT_NE(nullptr, idx0.def_vector_index); + ASSERT_EQ(expect_idx.vector_index->kind, idx0.def_vector_index->kind); + ASSERT_EQ(expect_idx.vector_index->dimension, idx0.def_vector_index->dimension); + ASSERT_EQ(expect_idx.vector_index->distance_metric, idx0.def_vector_index->distance_metric); const auto & idx1 = (*new_index_info)[1]; - ASSERT_EQ(IndexType::Vector, idx1.type); + ASSERT_EQ(TiDB::ColumnarIndexKind::Vector, idx1.kind); ASSERT_EQ(expect_idx2.id, idx1.index_id); ASSERT_EQ(100, idx1.column_id); - ASSERT_NE(nullptr, idx1.index_definition); - ASSERT_EQ(expect_idx2.vector_index->kind, idx1.index_definition->kind); - ASSERT_EQ(expect_idx2.vector_index->dimension, idx1.index_definition->dimension); - ASSERT_EQ(expect_idx2.vector_index->distance_metric, idx1.index_definition->distance_metric); + ASSERT_NE(nullptr, idx1.def_vector_index); + ASSERT_EQ(expect_idx2.vector_index->kind, idx1.def_vector_index->kind); + ASSERT_EQ(expect_idx2.vector_index->dimension, idx1.def_vector_index->dimension); + ASSERT_EQ(expect_idx2.vector_index->distance_metric, idx1.def_vector_index->distance_metric); // check again, nothing changed, return nullptr ASSERT_EQ(nullptr, generateLocalIndexInfos(new_index_info, table_info, logger).new_local_index_infos); @@ -202,21 +202,21 @@ try ASSERT_NE(new_index_info, nullptr); ASSERT_EQ(new_index_info->size(), 2); const auto & idx0 = (*new_index_info)[0]; - ASSERT_EQ(IndexType::Vector, idx0.type); + ASSERT_EQ(TiDB::ColumnarIndexKind::Vector, idx0.kind); ASSERT_EQ(expect_idx.id, idx0.index_id); ASSERT_EQ(100, idx0.column_id); - ASSERT_NE(nullptr, idx0.index_definition); - ASSERT_EQ(expect_idx.vector_index->kind, idx0.index_definition->kind); - ASSERT_EQ(expect_idx.vector_index->dimension, idx0.index_definition->dimension); - ASSERT_EQ(expect_idx.vector_index->distance_metric, idx0.index_definition->distance_metric); + ASSERT_NE(nullptr, idx0.def_vector_index); + ASSERT_EQ(expect_idx.vector_index->kind, idx0.def_vector_index->kind); + ASSERT_EQ(expect_idx.vector_index->dimension, idx0.def_vector_index->dimension); + ASSERT_EQ(expect_idx.vector_index->distance_metric, idx0.def_vector_index->distance_metric); const auto & idx1 = (*new_index_info)[1]; - ASSERT_EQ(IndexType::Vector, idx1.type); + ASSERT_EQ(TiDB::ColumnarIndexKind::Vector, idx1.kind); ASSERT_EQ(expect_idx3.id, idx1.index_id); ASSERT_EQ(100, idx1.column_id); - ASSERT_NE(nullptr, idx1.index_definition); - ASSERT_EQ(expect_idx3.vector_index->kind, idx1.index_definition->kind); - ASSERT_EQ(expect_idx3.vector_index->dimension, idx1.index_definition->dimension); - ASSERT_EQ(expect_idx3.vector_index->distance_metric, idx1.index_definition->distance_metric); + ASSERT_NE(nullptr, idx1.def_vector_index); + ASSERT_EQ(expect_idx3.vector_index->kind, idx1.def_vector_index->kind); + ASSERT_EQ(expect_idx3.vector_index->dimension, idx1.def_vector_index->dimension); + ASSERT_EQ(expect_idx3.vector_index->distance_metric, idx1.def_vector_index->distance_metric); // check again, nothing changed, return nullptr ASSERT_EQ(nullptr, generateLocalIndexInfos(new_index_info, table_info, logger).new_local_index_infos); diff --git a/dbms/src/Storages/DeltaMerge/tests/gtest_segment_test_basic.cpp b/dbms/src/Storages/DeltaMerge/tests/gtest_segment_test_basic.cpp index 788f3a7e993..f6639bcc3f5 100644 --- a/dbms/src/Storages/DeltaMerge/tests/gtest_segment_test_basic.cpp +++ b/dbms/src/Storages/DeltaMerge/tests/gtest_segment_test_basic.cpp @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include #include @@ -792,10 +792,10 @@ bool SegmentTestBasic::ensureSegmentStableLocalIndex(PageIdU64 segment_id, const bool success = false; auto segment = segments[segment_id]; auto dm_files = segment->getStable()->getDMFiles(); - auto build_info = DMFileVectorIndexWriter::getLocalIndexBuildInfo(local_index_infos, dm_files); + auto build_info = DMFileLocalIndexWriter::getLocalIndexBuildInfo(local_index_infos, dm_files); // Build index - DMFileVectorIndexWriter iw(DMFileVectorIndexWriter::Options{ + DMFileLocalIndexWriter iw(DMFileLocalIndexWriter::Options{ .path_pool = storage_path_pool, .index_infos = build_info.indexes_to_build, .dm_files = dm_files, @@ -937,9 +937,10 @@ SegmentPtr SegmentTestBasic::reload( storage_path_pool = std::make_shared(db_context->getPathPool().withTable("test", "t1", false)); storage_pool = std::make_shared(*db_context, NullspaceID, NAMESPACE_ID, *storage_path_pool, "test.t1"); storage_pool->restore(); - ColumnDefinesPtr cols = (!pre_define_columns) ? DMTestEnv::getDefaultColumns( - is_common_handle ? DMTestEnv::PkType::CommonHandle : DMTestEnv::PkType::HiddenTiDBRowID) - : pre_define_columns; + ColumnDefinesPtr cols = (!pre_define_columns) + ? DMTestEnv::getDefaultColumns( + is_common_handle ? DMTestEnv::PkType::CommonHandle : DMTestEnv::PkType::HiddenTiDBRowID) + : pre_define_columns; prepareColumns(cols); setColumns(cols); diff --git a/dbms/src/TiDB/Schema/TiDB.cpp b/dbms/src/TiDB/Schema/TiDB.cpp index 5f846e1cd4b..18fea00e386 100644 --- a/dbms/src/TiDB/Schema/TiDB.cpp +++ b/dbms/src/TiDB/Schema/TiDB.cpp @@ -1061,7 +1061,7 @@ try // always put the primary_index at the front of all index_info index_infos.insert(index_infos.begin(), std::move(index_info)); } - else if (index_info.vector_index != nullptr) + else if (index_info.hasColumnarIndex()) { index_infos.emplace_back(std::move(index_info)); } diff --git a/dbms/src/TiDB/Schema/TiDB.h b/dbms/src/TiDB/Schema/TiDB.h index 09dda11ac69..89c5ca7938a 100644 --- a/dbms/src/TiDB/Schema/TiDB.h +++ b/dbms/src/TiDB/Schema/TiDB.h @@ -240,6 +240,17 @@ struct IndexColumnInfo Int32 length = 0; Int32 offset = 0; }; + +// Note: Columnar index and local index are usually referring the same thing +// in this code base: +// - From TiDB's perspective, it is a columnar index. +// - From TiFlash's perspective, it is a local index. +enum class ColumnarIndexKind +{ + // Leave 0 intentionally for InvalidValues + Vector = 1, +}; + struct IndexInfo { IndexInfo() = default; @@ -261,6 +272,16 @@ struct IndexInfo bool is_global = false; VectorIndexDefinitionPtr vector_index = nullptr; + + ColumnarIndexKind columnarIndexKind() const + { + RUNTIME_CHECK(hasColumnarIndex()); + if (vector_index) + return ColumnarIndexKind::Vector; + RUNTIME_CHECK(false); + } + + bool hasColumnarIndex() const { return (vector_index != nullptr); } }; struct TableInfo From f0acfaa2e981f5a1f8c44135ea94ea4c4ed32470 Mon Sep 17 00:00:00 2001 From: Wish Date: Wed, 8 Jan 2025 07:17:02 +0800 Subject: [PATCH 2/4] Fix Signed-off-by: Wish --- dbms/src/Common/FailPoint.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbms/src/Common/FailPoint.cpp b/dbms/src/Common/FailPoint.cpp index be684cf3751..ce3a8fcc4e0 100644 --- a/dbms/src/Common/FailPoint.cpp +++ b/dbms/src/Common/FailPoint.cpp @@ -73,7 +73,7 @@ namespace DB M(force_wait_index_timeout) \ M(force_local_index_task_memory_limit_exceeded) \ M(exception_build_local_index_for_file) \ - M(force_not_support_vector_index) \ + M(force_not_support_local_index) \ M(sync_schema_request_failure) #define APPLY_FOR_FAILPOINTS(M) \ From f2df29cf97d32bed9619bf4e9b458520cda7262f Mon Sep 17 00:00:00 2001 From: Wish Date: Wed, 8 Jan 2025 22:04:27 +0900 Subject: [PATCH 3/4] Format using clangd@17 Signed-off-by: Wish --- .../Storages/DeltaMerge/tests/gtest_segment_test_basic.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/dbms/src/Storages/DeltaMerge/tests/gtest_segment_test_basic.cpp b/dbms/src/Storages/DeltaMerge/tests/gtest_segment_test_basic.cpp index f6639bcc3f5..3060759fc97 100644 --- a/dbms/src/Storages/DeltaMerge/tests/gtest_segment_test_basic.cpp +++ b/dbms/src/Storages/DeltaMerge/tests/gtest_segment_test_basic.cpp @@ -937,10 +937,9 @@ SegmentPtr SegmentTestBasic::reload( storage_path_pool = std::make_shared(db_context->getPathPool().withTable("test", "t1", false)); storage_pool = std::make_shared(*db_context, NullspaceID, NAMESPACE_ID, *storage_path_pool, "test.t1"); storage_pool->restore(); - ColumnDefinesPtr cols = (!pre_define_columns) - ? DMTestEnv::getDefaultColumns( - is_common_handle ? DMTestEnv::PkType::CommonHandle : DMTestEnv::PkType::HiddenTiDBRowID) - : pre_define_columns; + ColumnDefinesPtr cols = (!pre_define_columns) ? DMTestEnv::getDefaultColumns( + is_common_handle ? DMTestEnv::PkType::CommonHandle : DMTestEnv::PkType::HiddenTiDBRowID) + : pre_define_columns; prepareColumns(cols); setColumns(cols); From 8a5b158539ac365a5b2357ae3eef755cab63680d Mon Sep 17 00:00:00 2001 From: Wish Date: Wed, 8 Jan 2025 22:16:07 +0900 Subject: [PATCH 4/4] Fix perf_stat Signed-off-by: Wish --- dbms/src/Storages/DeltaMerge/File/DMFileVectorIndexReader.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/dbms/src/Storages/DeltaMerge/File/DMFileVectorIndexReader.cpp b/dbms/src/Storages/DeltaMerge/File/DMFileVectorIndexReader.cpp index a687ff3ec73..6ab7ce7e386 100644 --- a/dbms/src/Storages/DeltaMerge/File/DMFileVectorIndexReader.cpp +++ b/dbms/src/Storages/DeltaMerge/File/DMFileVectorIndexReader.cpp @@ -67,6 +67,7 @@ void DMFileVectorIndexReader::loadVectorIndex() RUNTIME_CHECK(vector_index.has_value(), col_id, index_id); RUNTIME_CHECK(vector_index->index_props().kind() == dtpb::IndexFileKind::VECTOR_INDEX); RUNTIME_CHECK(vector_index->index_props().has_vector_index()); + perf_stat.index_size = vector_index->index_props().file_size(); // If local file is invalidated, cache is not valid anymore. So we // need to ensure file exists on local fs first.