Skip to content

Commit

Permalink
[#23376] DocDB: Utilities needed for HNSW
Browse files Browse the repository at this point in the history
Summary:
Some utilities needed for the HNSW vector index implementation and benchmarking.

Adding a new directory, "vector", and the new library yb_vector. The namespace is called vectorindex.

benchmark_data.{h,cc} -- implements readers for the .fvecs file format (see http://corpus-texmex.irisa.fr/).

distance.{h,cc} -- functions for distance calculation, currently only for L2 squared and cosine.

vector_index_if.h -- intended to contain high-level interfaces exposed by a vector index such as HNSW. Currently only the reader API is included, which will be needed by the recall computation utility.

hnsw_util.{h,cc} -- various types and functions needed in the HNSW implementation: level selection, and min/max priority queues for (vector, distance) pairs.

The vector_types.h header in the common directory is needed by the dockv code, so it can't be in the vector directory. The yb_dockv library is not allowed to depend on the yb_vector library.
Jira: DB-12298

Test Plan: Jenkins

Reviewers: sergei, aleksandr.ponomarenko

Reviewed By: sergei

Subscribers: ybase

Differential Revision: https://phorge.dev.yugabyte.com/D37340
  • Loading branch information
mbautin committed Aug 17, 2024
1 parent 16941de commit 404075d
Show file tree
Hide file tree
Showing 19 changed files with 710 additions and 28 deletions.
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -900,6 +900,7 @@ set(YB_SUBDIR_NAMES
tools
tserver
util
vector
yql
)

Expand Down
27 changes: 27 additions & 0 deletions src/yb/common/vector_types.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
// Copyright (c) YugabyteDB, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations
// under the License.
//

// Typedefs of vector types used by the vector indexing library. These are simple typedefs for
// std::vector, and they are used in dockv, which is not allowed to depend on the yb_vector
// library.
#pragma once

#include <cstdint>
#include <vector>

namespace yb {

// A sequence of single-precision floating point values.
using FloatVector = std::vector<float>;
// A sequence of unsigned 64-bit integer values.
using UInt64Vector = std::vector<uint64_t>;

} // namespace yb
35 changes: 18 additions & 17 deletions src/yb/docdb/usearch_vector_index-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,21 @@
// under the License.
//

#include <cassert>

#include <iostream>
#include <fstream>
#include <string>
#include <vector>
#include <random>

#include "yb/util/logging.h"
#include "yb/util/monotime.h"
#include "yb/util/random_util.h"
#include "yb/util/test_thread_holder.h"
#include "yb/util/test_util.h"
#include "yb/util/tsan_util.h"

#pragma GCC diagnostic push

#ifdef __clang__
Expand All @@ -28,20 +43,6 @@

#pragma GCC diagnostic pop

#include <iostream>
#include <fstream>
#include <string>
#include <vector>
#include <random>
#include <cassert>

#include "yb/util/logging.h"
#include "yb/util/monotime.h"
#include "yb/util/random_util.h"
#include "yb/util/test_thread_holder.h"
#include "yb/util/test_util.h"
#include "yb/util/tsan_util.h"

// Helper function to generate random vectors
template<typename Distribution>
std::vector<float> GenerateRandomVector(size_t dimensions, Distribution& dis) {
Expand All @@ -68,7 +69,7 @@ void ReportPerf(
}

TEST_F(UsearchVectorIndexTest, CreateAndQuery) {
using namespace unum::usearch;
using namespace unum::usearch; // NOLINT

// Create a metric and index
const size_t kDimensions = 96;
Expand All @@ -94,7 +95,7 @@ TEST_F(UsearchVectorIndexTest, CreateAndQuery) {
std::random_device rd;
size_t vector_id;
while ((vector_id = num_vectors_inserted.fetch_add(1)) < kNumVectors) {
auto vec = GenerateRandomVector(kDimensions, uniform_distrib);
auto vec = RandomFloatVector(kDimensions, uniform_distrib);
ASSERT_TRUE(index.add(vector_id, vec.data()));
}
latch.CountDown();
Expand Down Expand Up @@ -147,6 +148,6 @@ TEST_F(UsearchVectorIndexTest, CreateAndQuery) {
auto query_elapsed_usec = (MonoTime::Now() - query_start_time).ToMicroseconds();
ReportPerf("Performed", kNumQueries, "queries", kDimensions, query_elapsed_usec,
kNumIndexingThreads);
};
}

} // namespace yb
9 changes: 5 additions & 4 deletions src/yb/docdb/vector_index.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,13 @@
#include "yb/dockv/key_bytes.h"
#include "yb/dockv/primitive_value.h"

#include "yb/vector/graph_repr_defs.h"

namespace yb::docdb {

using VertexId = uint64_t;
using VectorIndexLevel = uint8_t;
using VectorNodeNeighbors = std::set<VertexId>;
constexpr VertexId kInvalidVertexId = 0;
using VertexId = vectorindex::VertexId;
using VectorIndexLevel = vectorindex::VectorIndexLevel;
using VectorNodeNeighbors = vectorindex::VectorNodeNeighbors;

template <class CoordinateType>
struct VectorIndexTypes {
Expand Down
6 changes: 4 additions & 2 deletions src/yb/docdb/vector_index_read.cc
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@

#include "yb/docdb/vector_index_read.h"

#include "yb/vector/graph_repr_defs.h"

#include "yb/docdb/docdb_rocksdb_util.h"
#include "yb/docdb/intent_aware_iterator.h"

Expand Down Expand Up @@ -55,7 +57,7 @@ class VectorIndexRead {
return dockv::PrimitiveValue::DecodeFloatVector(kv.value);
}

Result<VectorNodeNeighbors> GetNeighbors(VertexId id, VectorIndexLevel level) {
Result<vectorindex::VectorNodeNeighbors> GetNeighbors(VertexId id, VectorIndexLevel level) {
auto vertex_level_key_bytes = MakeVectorIndexKey(id, level);
auto vertex_level_key = vertex_level_key_bytes.AsSlice();
iter_->Seek(vertex_level_key);
Expand All @@ -81,7 +83,7 @@ class VectorIndexRead {
}
}

auto prev_vertex_id = kInvalidVertexId;
auto prev_vertex_id = vectorindex::kInvalidVertexId;
auto vertex_level_key_size = vertex_level_key.size();
while (kv && kv.key.starts_with(vertex_level_key)) {
if (kv.write_time < full_write_time) {
Expand Down
4 changes: 3 additions & 1 deletion src/yb/docdb/vector_index_read.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@

#pragma once

#include "yb/vector/graph_repr_defs.h"

#include "yb/docdb/docdb_fwd.h"
#include "yb/docdb/key_bounds.h"
#include "yb/docdb/vector_index.h"
Expand All @@ -29,7 +31,7 @@ class VectorIndexStorage : public VectorIndexFetcher<CoordinateType> {

Result<IndexedVector> GetVector(
const ReadOperationData& read_operation_data, VertexId id) override;
Result<VectorNodeNeighbors> GetNeighbors(
Result<vectorindex::VectorNodeNeighbors> GetNeighbors(
const ReadOperationData& read_operation_data, VertexId id, VectorIndexLevel level) override;
private:
const DocDB doc_db_;
Expand Down
2 changes: 1 addition & 1 deletion src/yb/docdb/vector_index_update.cc
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ void VectorIndexUpdate<CoordinateType>::SetNeighbors(
write_batch_.Put(
MakeKey(id, level),
dockv::PrimitiveValue::Encoded(
dockv::UInt64Vector{new_neighbors.begin(), new_neighbors.end()}).AsSlice());
UInt64Vector{new_neighbors.begin(), new_neighbors.end()}).AsSlice());

GetLevel(id, level).neighbors = std::move(new_neighbors);
}
Expand Down
5 changes: 2 additions & 3 deletions src/yb/dockv/primitive_value.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@
#include "yb/common/hybrid_time.h"
#include "yb/common/ql_datatype.h"

#include "yb/common/vector_types.h"

#include "yb/dockv/dockv_fwd.h"

#include "yb/util/algorithm_util.h"
Expand All @@ -47,9 +49,6 @@ YB_DEFINE_ENUM(ListExtendOrder, (APPEND)(PREPEND_BLOCK)(PREPEND))
// A necessary use of a forward declaration to avoid circular inclusion.
class SubDocument;

using FloatVector = std::vector<float>;
using UInt64Vector = std::vector<uint64_t>;

class PrimitiveValue {
public:
static const PrimitiveValue kInvalid;
Expand Down
9 changes: 9 additions & 0 deletions src/yb/util/random_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -148,4 +148,13 @@ typename Collection::const_reference RandomElement(const Collection& collection,

std::string RandomHumanReadableString(size_t len, std::mt19937_64* rng = nullptr);

// Returns a vector holding `dimensions` floats. Every element is produced by invoking `dis` on
// the generator returned by ThreadLocalRandom().
template<typename Distribution>
std::vector<float> RandomFloatVector(size_t dimensions, Distribution& dis) {
  std::vector<float> coordinates(dimensions);
  for (size_t i = 0; i != dimensions; ++i) {
    coordinates[i] = dis(ThreadLocalRandom());
  }
  return coordinates;
}

} // namespace yb
35 changes: 35 additions & 0 deletions src/yb/vector/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# Copyright (c) YugaByteDB, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under the License
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
# or implied. See the License for the specific language governing permissions and limitations
# under the License.
#

# Presumably selects the precompiled-header prefix for this directory -- TODO confirm against the
# top-level build scripts.
set(YB_PCH_PREFIX vector)

# Vector indexing, approximate nearest neighbor search, HNSW.

# Source files compiled into the yb_vector library.
set(VECTOR_SRCS
hnsw_util.cc
distance.cc
benchmark_data.cc)

# Libraries passed as DEPS of yb_vector.
# NOTE(review): yb_docdb is listed here while docdb headers include yb/vector/graph_repr_defs.h;
# confirm this does not introduce a circular library dependency.
set(VECTOR_LIBS
yb_common
yb_util
yb_docdb
)

ADD_YB_LIBRARY(yb_vector
SRCS ${VECTOR_SRCS}
DEPS ${VECTOR_LIBS})

# Libraries linked into the tests registered below.
set(YB_TEST_LINK_LIBS
yb_vector yb_docdb_test_common yb_common_test_util ${YB_MIN_TEST_LIBS})
ADD_YB_TEST(hnsw_util-test)
113 changes: 113 additions & 0 deletions src/yb/vector/benchmark_data.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
// Copyright (c) YugabyteDB, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations
// under the License.
//

#include "yb/vector/benchmark_data.h"

#include "yb/util/env.h"
#include "yb/util/errno.h"
#include "yb/util/random_util.h"

namespace yb::vectorindex {

std::unique_ptr<UniformRandomFloatVectorGenerator> CreateUniformRandomVectorSource(
size_t num_vectors, size_t dimensions, float min_value, float max_value) {
return std::make_unique<UniformRandomFloatVectorGenerator>(
num_vectors, dimensions, UniformRandomFloatDistribution(min_value, max_value));
}

// Only remembers the path; the file itself is opened later by Open().
FvecsFileReader::FvecsFileReader(const std::string& file_path)
    : file_path_(file_path) {}

// Closes the underlying file handle if one is currently held.
FvecsFileReader::~FvecsFileReader() {
  if (!input_file_) {
    return;
  }
  fclose(input_file_);
  input_file_ = nullptr;
}

// Reads and returns the next vector from the file. Once all num_points_ vectors have been
// returned, an empty vector is returned instead.
Result<FloatVector> FvecsFileReader::Next() {
  if (current_index_ < num_points_) {
    // Open() has already consumed the dimension field of the first vector, so only the vectors
    // after it still have a dimension field to read and validate.
    const bool validate_dimension_field = current_index_ > 0;
    RETURN_NOT_OK(ReadVectorInternal(validate_dimension_field));
    ++current_index_;
    return current_vector_;
  }
  return FloatVector();
}

// Opens the file at file_path_, reads the dimension field of the first vector, and derives the
// number of vectors from the file size. May be called again on the same reader; any previously
// opened handle is closed first.
//
// Returns an error status if the file size cannot be determined, the file is too small, it cannot
// be opened, the dimension count is out of range, or the file size is not a multiple of the
// per-vector record size.
Status FvecsFileReader::Open() {
  // Release the handle left by an earlier Open() call so that re-opening does not leak it, and
  // reset the counters so that Next() yields nothing if this call fails part-way through.
  if (input_file_) {
    fclose(input_file_);
    input_file_ = nullptr;
  }
  num_points_ = 0;
  current_index_ = 0;

  auto file_size = VERIFY_RESULT(Env::Default()->GetFileSize(file_path_));
  SCHECK_GE(file_size, sizeof(uint32_t), IOError,
            Format("File size too small for $0: $1", file_path_, file_size));

  input_file_ = fopen(file_path_.c_str(), "rb");
  if (!input_file_) {
    return STATUS_FROM_ERRNO(Format("Error opening file: $0", file_path_), errno);
  }

  // The dimension field of the first vector is consumed here; Next() accounts for that by not
  // re-reading it for the first vector.
  dimensions_ = VERIFY_RESULT(ReadDimensionsField());
  if (dimensions_ < 1 || dimensions_ >= 1000000) {
    return STATUS_FORMAT(
        InvalidArgument, "Invalid number of dimensions from file $0: $1",
        file_path_, dimensions_);
  }

  // From http://corpus-texmex.irisa.fr/:
  // .bvecs, .fvecs and .ivecs vector file formats:

  // The vectors are stored in raw little endian.
  // Each vector takes 4+d*4 bytes for .fvecs and .ivecs formats, and 4+d bytes for .bvecs
  // formats, where d is the dimensionality of the vector, as shown below.
  auto bytes_stored_per_vector = sizeof(uint32_t) + dimensions_ * kCoordinateSize;
  if (file_size % bytes_stored_per_vector != 0) {
    return STATUS_FORMAT(
        IOError,
        "fvecs file format error: file size of $0 ($1) is not divisible by $2",
        file_path_, file_size, bytes_stored_per_vector);
  }
  num_points_ = file_size / bytes_stored_per_vector;
  current_vector_.resize(dimensions_);
  current_index_ = 0;
  return Status::OK();
}

// Human-readable summary: the file path, the number of points and the number of dimensions.
std::string FvecsFileReader::ToString() const {
  return Format("fvecs file $0, $1 points, $2 dimensions", file_path_, num_points_, dimensions_);
}

// Reads one 32-bit dimension-count field from the current position of input_file_ and returns it.
// The value is read as raw bytes with no byte-order conversion.
// NOTE(review): the format stores values in little endian, so this presumably relies on a
// little-endian host -- confirm.
//
// Returns an IOError status if the file ends before the field is complete, or an errno-based
// status for any other read failure.
Result<size_t> FvecsFileReader::ReadDimensionsField() {
  uint32_t ndims_u32 = 0;
  if (fread(&ndims_u32, sizeof(ndims_u32), 1, input_file_) != 1) {
    // A short read caused by end-of-file does not set errno, so reporting errno in that case
    // would surface a stale or zero error code.
    if (feof(input_file_)) {
      return STATUS_FORMAT(
          IOError, "Unexpected end of file reading the number of dimensions from file $0",
          file_path_);
    }
    return STATUS_FROM_ERRNO(
        Format("Error reading the number of dimensions from file $0", file_path_), errno);
  }
  return ndims_u32;
}

// Reads the coordinates of the next vector from input_file_ into current_vector_.
//
// validate_dimension_field: when true, the vector's leading dimension field is read first and
// must equal dimensions_; when false, the file position is expected to already be past it.
//
// Returns IllegalState on a dimension mismatch, IOError if the file ends before the vector is
// complete, or an errno-based status for any other read failure.
Status FvecsFileReader::ReadVectorInternal(bool validate_dimension_field) {
  if (validate_dimension_field) {
    auto dims = VERIFY_RESULT(ReadDimensionsField());
    if (dims != dimensions_) {
      return STATUS_FORMAT(
          IllegalState, "Invalid number of dimensions in vector #$0 of file $1: $2 (expected $3)",
          current_index_, file_path_, dims, dimensions_);
    }
  }
  if (fread(current_vector_.data(), sizeof(float), dimensions_, input_file_) != dimensions_) {
    // A short read caused by end-of-file does not set errno, so reporting errno in that case
    // would surface a stale or zero error code.
    if (feof(input_file_)) {
      return STATUS_FORMAT(
          IOError, "Unexpected end of file reading vector #$0 from file $1",
          current_index_, file_path_);
    }
    return STATUS_FROM_ERRNO(
        Format("Error reading vector #$0 from file $1", current_index_, file_path_), errno);
  }
  return Status::OK();
}

} // namespace yb::vectorindex
Loading

0 comments on commit 404075d

Please sign in to comment.