Skip to content

Commit

Permalink
VectorIndexDefinition: Adds Support for Partitioned DiskANN (#4792)
Browse files Browse the repository at this point in the history
# Pull Request Template

## Description

This PR adds optional attributes to the `VectorIndexDefinition` class to
support partitioned DiskANN. A typical index definition looks like the
following:

```
{
    "indexingPolicy": {
        "automatic": true,
        "indexingMode": "Consistent",
        "includedPaths": [
            {
                "path": "/*",
                "indexes": []
            }
        ],
        "excludedPaths": [],
        "compositeIndexes": [],
        "spatialIndexes": [],
        "vectorIndexes": [
            {
                "path": "/vector1",
                "type": "flat"
            },
            {
                "path": "/vector2",
                "type": "quantizedFlat",
                "quantizationByteSize": 3,
                "vectorIndexShardKey": [
                    "/Country"
                ]
            },
            {
                "path": "/vector3",
                "type": "diskANN",
                "quantizationByteSize": 2,
                "indexingSearchListSize": 100,
                "vectorIndexShardKey": [
                    "/ZipCode"
                ]
            }
        ]
    },
    "vectorEmbeddingPolicy": {
        "vectorEmbeddings": [
            {
                "path": "/vector1",
                "dataType": "int8",
                "dimensions": 1200,
                "distanceFunction": "dotproduct"
            },
            {
                "path": "/vector2",
                "dataType": "uint8",
                "dimensions": 3,
                "distanceFunction": "cosine"
            },
            {
                "path": "/vector3",
                "dataType": "float32",
                "dimensions": 400,
                "distanceFunction": "euclidean"
            }
        ]
    },
    "id": "test_binary_vector_container_6",
    "partitionKey": {
        "paths": [
            "/pk"
        ],
        "kind": "Hash"
    }
}
```

## Type of change

Please delete options that are not relevant.

- [x] New feature (non-breaking change which adds functionality)

## Closing issues

To automatically close an issue: closes #4628

---------

Co-authored-by: Kiran Kumar Kolli <[email protected]>
  • Loading branch information
2 people authored and sourabh1007 committed Oct 22, 2024
1 parent a3ee34d commit de96acc
Show file tree
Hide file tree
Showing 8 changed files with 204 additions and 43 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,51 @@ public VectorIndexDefinition<T> Path(
return this;
}

/// <summary>
/// Sets the quantization byte size on the <see cref="VectorIndexPath"/> that is currently being defined.
/// </summary>
/// <param name="quantizationByteSize">
/// Number of bytes used in product quantization of the vectors. This setting is optional and applies to the
/// DiskANN and quantizedFlat index types. The allowed range is 1 to 3; the value is stored as given, without
/// a range check in this method.
/// </param>
/// <returns>The current <see cref="VectorIndexDefinition{T}"/> instance, so that calls can be chained.</returns>
public VectorIndexDefinition<T> WithQuantizationByteSize(int quantizationByteSize)
{
    this.vectorIndexPath.QuantizationByteSize = quantizationByteSize;

    return this;
}

/// <summary>
/// Sets the indexing search list size on the <see cref="VectorIndexPath"/> that is currently being defined.
/// </summary>
/// <param name="indexingSearchListSize">
/// Size of the candidate list of approximate neighbors kept while the DiskANN index is built, as part of the
/// optimization processes. This setting is optional and applies to the DiskANN index type only. The allowed
/// range is 25 to 500; the value is stored as given, without a range check in this method.
/// </param>
/// <returns>The current <see cref="VectorIndexDefinition{T}"/> instance, so that calls can be chained.</returns>
public VectorIndexDefinition<T> WithIndexingSearchListSize(int indexingSearchListSize)
{
    this.vectorIndexPath.IndexingSearchListSize = indexingSearchListSize;

    return this;
}

/// <summary>
/// Sets the vector index shard key on the <see cref="VectorIndexPath"/> that is currently being defined.
/// </summary>
/// <param name="vectorIndexShardKey">
/// Array of shard key paths used to partition the vector index. This setting is optional and applies to the
/// DiskANN and quantizedFlat index types. The array reference is stored as given (no copy is made).
/// </param>
/// <returns>The current <see cref="VectorIndexDefinition{T}"/> instance, so that calls can be chained.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="vectorIndexShardKey"/> is null.</exception>
public VectorIndexDefinition<T> WithVectorIndexShardKey(string[] vectorIndexShardKey)
{
    // Reject null up front so the definition is never left holding a null shard key.
    if (vectorIndexShardKey == null)
    {
        throw new ArgumentNullException(nameof(vectorIndexShardKey));
    }

    this.vectorIndexPath.VectorIndexShardKey = vectorIndexShardKey;
    return this;
}

/// <summary>
/// Applies the current definition to the parent.
/// </summary>
Expand Down
4 changes: 2 additions & 2 deletions Microsoft.Azure.Cosmos/src/Resource/Settings/Embedding.cs
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,10 @@ class Embedding : IEquatable<Embedding>
public VectorDataType DataType { get; set; }

/// <summary>
/// Gets or sets a long integer representing the dimensions of a vector.
/// Gets or sets an integer representing the dimensions of a vector.
/// </summary>
[JsonProperty(PropertyName = "dimensions")]
public ulong Dimensions { get; set; }
public int Dimensions { get; set; }

/// <summary>
/// Gets or sets the <see cref="Cosmos.DistanceFunction"/> which is used to calculate the respective distance between the vectors.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,6 @@ namespace Microsoft.Azure.Cosmos
#endif
enum VectorDataType
{
/// <summary>
/// Represent a float16 data type.
/// </summary>
[EnumMember(Value = "float16")]
Float16,

/// <summary>
/// Represent a float32 data type.
/// </summary>
Expand Down
43 changes: 41 additions & 2 deletions Microsoft.Azure.Cosmos/src/Resource/Settings/VectorIndexPath.cs
Original file line number Diff line number Diff line change
Expand Up @@ -32,11 +32,16 @@ namespace Microsoft.Azure.Cosmos
/// },
/// {
/// "path": "/vector2",
/// "type": "flat"
/// "type": "quantizedFlat",
/// "quantizationByteSize": 3,
/// "vectorIndexShardKey": ["/ZipCode"]
/// },
/// {
/// "path": "/embeddings/vector",
/// "type": "flat"
/// "type": "diskANN",
/// "quantizationByteSize": 2,
/// "indexingSearchListSize": 100,
/// "vectorIndexShardKey": ["/Country"]
/// }
/// ]
/// }
Expand All @@ -49,6 +54,12 @@ namespace Microsoft.Azure.Cosmos
#endif
sealed class VectorIndexPath
{
// Nullable backing field for the IndexingSearchListSize property. It is the member bound to the
// "indexingSearchListSize" JSON property; with NullValueHandling.Ignore a null (never assigned) value
// is left out of the serialized payload instead of being written as a number.
[JsonProperty(PropertyName = "indexingSearchListSize", NullValueHandling = NullValueHandling.Ignore)]
private int? indexingSearchListSizeInternal;

// Nullable backing field for the QuantizationByteSize property. It is the member bound to the
// "quantizationByteSize" JSON property; with NullValueHandling.Ignore a null (never assigned) value
// is left out of the serialized payload instead of being written as a number.
[JsonProperty(PropertyName = "quantizationByteSize", NullValueHandling = NullValueHandling.Ignore)]
private int? quantizationByteSizeInternal;

/// <summary>
/// Gets or sets the full path in a document used for vector indexing.
/// </summary>
Expand All @@ -62,6 +73,34 @@ sealed class VectorIndexPath
[JsonConverter(typeof(StringEnumConverter))]
public VectorIndexType Type { get; set; }

/// <summary>
/// Gets or sets the quantization byte size for the vector index path. Only applicable to the quantizedFlat and
/// diskANN vector index types. The allowed range for this parameter is between 1 and 3.
/// </summary>
/// <remarks>
/// The value lives in a nullable backing field; reading the property before it has been assigned yields 0.
/// The property itself is excluded from JSON serialization.
/// </remarks>
[JsonIgnore]
public int QuantizationByteSize
{
    get => this.quantizationByteSizeInternal ?? 0;
    set => this.quantizationByteSizeInternal = value;
}

/// <summary>
/// Gets or sets the indexing search list size for the vector index path. Only applicable to the diskANN vector
/// index type. The allowed range for this parameter is between 25 and 500.
/// </summary>
/// <remarks>
/// The value lives in a nullable backing field; reading the property before it has been assigned yields 0.
/// The property itself is excluded from JSON serialization.
/// </remarks>
[JsonIgnore]
public int IndexingSearchListSize
{
    get => this.indexingSearchListSizeInternal ?? 0;
    set => this.indexingSearchListSizeInternal = value;
}

/// <summary>
/// Gets or sets the vector index shard key for the vector index path, as an array of shard key paths.
/// This is only applicable for the quantizedFlat and diskANN vector index types.
/// </summary>
/// <remarks>
/// Serialized as "vectorIndexShardKey"; when the value is null the property is left out of the JSON payload.
/// </remarks>
[JsonProperty(PropertyName = "vectorIndexShardKey", NullValueHandling = NullValueHandling.Ignore)]
public string[] VectorIndexShardKey { get; set; }

/// <summary>
/// This contains additional values for scenarios where the SDK is not aware of new fields.
/// This ensures that if resource is read and updated none of the fields will be lost in the process.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -585,10 +585,15 @@ await databaseForVectorEmbedding.DefineContainer(containerName, partitionKeyPath
.Path(vector1Path, VectorIndexType.Flat)
.Attach()
.WithVectorIndex()
.Path(vector2Path, VectorIndexType.Flat)
.Path(vector2Path, VectorIndexType.QuantizedFlat)
.WithQuantizationByteSize(3)
.WithVectorIndexShardKey(new string[] { "/Country" })
.Attach()
.WithVectorIndex()
.Path(vector3Path, VectorIndexType.Flat)
.Path(vector3Path, VectorIndexType.DiskANN)
.WithQuantizationByteSize(2)
.WithIndexingSearchListSize(5)
.WithVectorIndexShardKey(new string[] { "/ZipCode" })
.Attach()
.Attach()
.CreateAsync();
Expand All @@ -610,9 +615,14 @@ await databaseForVectorEmbedding.DefineContainer(containerName, partitionKeyPath
Assert.AreEqual(vector1Path, containerSettings.IndexingPolicy.VectorIndexes[0].Path);
Assert.AreEqual(VectorIndexType.Flat, containerSettings.IndexingPolicy.VectorIndexes[0].Type);
Assert.AreEqual(vector2Path, containerSettings.IndexingPolicy.VectorIndexes[1].Path);
Assert.AreEqual(VectorIndexType.Flat, containerSettings.IndexingPolicy.VectorIndexes[1].Type);
Assert.AreEqual(VectorIndexType.QuantizedFlat, containerSettings.IndexingPolicy.VectorIndexes[1].Type);
Assert.AreEqual(3, containerSettings.IndexingPolicy.VectorIndexes[1].QuantizationByteSize);
CollectionAssert.AreEqual(new string[] { "/Country" }, containerSettings.IndexingPolicy.VectorIndexes[1].VectorIndexShardKey);
Assert.AreEqual(vector3Path, containerSettings.IndexingPolicy.VectorIndexes[2].Path);
Assert.AreEqual(VectorIndexType.Flat, containerSettings.IndexingPolicy.VectorIndexes[2].Type);
Assert.AreEqual(VectorIndexType.DiskANN, containerSettings.IndexingPolicy.VectorIndexes[2].Type);
Assert.AreEqual(2, containerSettings.IndexingPolicy.VectorIndexes[2].QuantizationByteSize);
Assert.AreEqual(5, containerSettings.IndexingPolicy.VectorIndexes[2].IndexingSearchListSize);
CollectionAssert.AreEqual(new string[] { "/ZipCode" }, containerSettings.IndexingPolicy.VectorIndexes[2].VectorIndexShardKey);
}
finally
{
Expand Down
Loading

0 comments on commit de96acc

Please sign in to comment.