Skip to content

Commit

Permalink
Streaming PQ (#689)
Browse files Browse the repository at this point in the history

---------

Co-authored-by: Chang She <[email protected]>
  • Loading branch information
eddyxu and changhiskhan authored Mar 17, 2023
1 parent d4101ff commit 299af4a
Show file tree
Hide file tree
Showing 10 changed files with 737 additions and 493 deletions.
96 changes: 48 additions & 48 deletions benchmarks/sift/lance_sift1m_stats.csv
Original file line number Diff line number Diff line change
@@ -1,49 +1,49 @@
ivf,pq,nprobes,nsamples,topk,refine_factor,recall@k,mean_time_sec
512,16,1,100,10,,0.561,0.0011869692802429
512,16,1,100,10,1.0,0.561,0.0011151361465454
512,16,1,100,10,5.0,0.86,0.0013169264793395
512,16,1,100,10,10.0,0.7559999999999999,0.0016109180450439
512,16,10,100,10,,0.6409999999999999,0.0017412400245666
512,16,10,100,10,1.0,0.653,0.0018005084991455
512,16,10,100,10,5.0,0.954,0.0020255255699157
512,16,10,100,10,10.0,0.984,0.0023340821266174
512,16,50,100,10,,0.655,0.0047774505615234
512,16,50,100,10,1.0,0.639,0.0047786116600036
512,16,50,100,10,5.0,0.9700000000000002,0.005054075717926
512,16,50,100,10,10.0,0.996,0.0054328942298889
512,16,100,100,10,,0.6829999999999998,0.0095507001876831
512,16,100,100,10,1.0,0.6609999999999999,0.0084998655319213
512,16,100,100,10,5.0,0.9690000000000002,0.0087792038917541
512,16,100,100,10,10.0,0.985,0.009198293685913
1024,16,1,100,10,,0.5770000000000001,0.001229350566864
1024,16,1,100,10,1.0,0.57,0.0013754272460937
1024,16,1,100,10,5.0,0.714,0.0015950870513916
1024,16,1,100,10,10.0,0.7800000000000001,0.0019168639183044
1024,16,10,100,10,,0.665,0.0017744970321655
1024,16,10,100,10,1.0,0.661,0.0018366694450378
1024,16,10,100,10,5.0,0.95,0.0021376228332519
1024,16,10,100,10,10.0,0.973,0.0024430847167968
1024,16,50,100,10,,0.649,0.0042504262924194
1024,16,50,100,10,1.0,0.632,0.0042378640174865
1024,16,50,100,10,5.0,0.9640000000000002,0.0045270037651062
1024,16,50,100,10,10.0,0.996,0.0049609613418579
1024,16,100,100,10,,0.6819999999999999,0.0072804021835327
1024,16,100,100,10,1.0,0.686,0.007310128211975
1024,16,100,100,10,5.0,0.981,0.0076206445693969
1024,16,100,100,10,10.0,0.996,0.0079639410972595
2048,16,1,100,10,,0.59,0.0016197514533996
2048,16,1,100,10,1.0,0.546,0.0016238856315612
2048,16,1,100,10,5.0,0.674,0.0018658971786499
2048,16,1,100,10,10.0,0.6980000000000001,0.0021486544609069
2048,16,10,100,10,,0.6679999999999998,0.0020835614204406
2048,16,10,100,10,1.0,0.69,0.0021107578277587
2048,16,10,100,10,5.0,0.9460000000000002,0.0024298906326293
2048,16,10,100,10,10.0,0.966,0.0026938366889953
2048,16,50,100,10,,0.6350000000000001,0.0042195105552673
2048,16,50,100,10,1.0,0.6809999999999999,0.0042567706108093
2048,16,50,100,10,5.0,0.963,0.004510052204132
2048,16,50,100,10,10.0,0.9920000000000002,0.0048563075065612
2048,16,100,100,10,,0.6779999999999998,0.0068572688102722
2048,16,100,100,10,1.0,0.6729999999999998,0.0068558740615844
2048,16,100,100,10,5.0,0.975,0.0071770000457763
2048,16,100,100,10,10.0,0.995,0.0075724363327026
512,16,1,100,10,,0.5459999999999999,0.0012434267997741
512,16,1,100,10,1.0,0.573,0.0012985253334045
512,16,1,100,10,5.0,0.7799999999999998,0.0016234064102172
512,16,1,100,10,10.0,0.81,0.0021506214141845
512,16,10,100,10,,0.643,0.0019894814491271
512,16,10,100,10,1.0,0.6429999999999998,0.0020099234580993
512,16,10,100,10,5.0,0.968,0.0023579096794128
512,16,10,100,10,10.0,0.975,0.0028011083602905
512,16,50,100,10,,0.627,0.005469377040863
512,16,50,100,10,1.0,0.647,0.0049861502647399
512,16,50,100,10,5.0,0.972,0.0052753090858459
512,16,50,100,10,10.0,0.995,0.0065333843231201
512,16,100,100,10,,0.6499999999999999,0.0086667680740356
512,16,100,100,10,1.0,0.6829999999999998,0.0090697479248046
512,16,100,100,10,5.0,0.973,0.0090816569328308
512,16,100,100,10,10.0,0.992,0.0094730043411254
1024,16,1,100,10,,0.5720000000000001,0.0014924383163452
1024,16,1,100,10,1.0,0.5980000000000001,0.0016877269744873
1024,16,1,100,10,5.0,0.7610000000000001,0.0017683720588684
1024,16,1,100,10,10.0,0.7209999999999998,0.002867727279663
1024,16,10,100,10,,0.6729999999999998,0.0020465373992919
1024,16,10,100,10,1.0,0.664,0.0019461870193481
1024,16,10,100,10,5.0,0.975,0.0023498964309692
1024,16,10,100,10,10.0,0.9870000000000002,0.0030326318740844
1024,16,50,100,10,,0.669,0.0047199487686157
1024,16,50,100,10,1.0,0.6560000000000001,0.0045324063301086
1024,16,50,100,10,5.0,0.9740000000000002,0.0048948669433593
1024,16,50,100,10,10.0,0.988,0.0055017662048339
1024,16,100,100,10,,0.6579999999999999,0.0077169895172119
1024,16,100,100,10,1.0,0.6580000000000001,0.0077496647834777
1024,16,100,100,10,5.0,0.969,0.0079165744781494
1024,16,100,100,10,10.0,0.991,0.008975670337677
2048,16,1,100,10,,0.5010000000000001,0.0020228648185729
2048,16,1,100,10,1.0,0.494,0.0022545051574707
2048,16,1,100,10,5.0,0.627,0.0024846982955932
2048,16,1,100,10,10.0,0.7119999999999999,0.0031368565559387
2048,16,10,100,10,,0.6539999999999999,0.0028012990951538
2048,16,10,100,10,1.0,0.6549999999999998,0.002736701965332
2048,16,10,100,10,5.0,0.948,0.0028632616996765
2048,16,10,100,10,10.0,0.97,0.0035367107391357
2048,16,50,100,10,,0.662,0.0047123503684997
2048,16,50,100,10,1.0,0.7040000000000001,0.0048845314979553
2048,16,50,100,10,5.0,0.974,0.005355429649353
2048,16,50,100,10,10.0,0.997,0.0055468249320983
2048,16,100,100,10,,0.6699999999999998,0.0074122834205627
2048,16,100,100,10,1.0,0.6629999999999999,0.0074411630630493
2048,16,100,100,10,5.0,0.9770000000000002,0.0080564498901367
2048,16,100,100,10,10.0,0.996,0.0083680939674377
19 changes: 19 additions & 0 deletions protos/index.proto
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,23 @@ message PQ {
repeated float codebook = 4;
}

// Transform type
enum TransformType {
OPQ = 0;
}

// A transform matrix to apply to a vector or vectors.
message Transform {
// The file offset the matrix is stored
uint64 position = 1;

// Data shape of the matrix, [rows, cols].
repeated uint32 shape = 2;

// Transform type.
TransformType type = 3;
}

// Flat Index
message Flat {}

Expand All @@ -84,6 +101,8 @@ message VectorIndexStage {
IVF ivf = 2;
// Product Quantization
PQ pq = 3;
// Transformer
Transform transform = 4;
}
}

Expand Down
2 changes: 1 addition & 1 deletion rust/src/arrow.rs
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ pub trait GenericListArrayExt<Offset: ArrowNumericType>
where
Offset::Native: OffsetSizeTrait,
{
/// Create an [`ListArray`] from values and offsets.
/// Create an [`GenericListArray`] from values and offsets.
///
/// ```
/// use arrow_array::{Int32Array, Int64Array, ListArray};
Expand Down
31 changes: 30 additions & 1 deletion rust/src/arrow/linalg.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@ use arrow::{
array::{as_primitive_array, Float32Builder},
datatypes::Float32Type,
};
use arrow_array::{Array, Float32Array};
use arrow_array::{Array, FixedSizeListArray, Float32Array};
use arrow_schema::DataType;
use rand::{distributions::Standard, rngs::SmallRng, seq::IteratorRandom, Rng, SeedableRng};

#[allow(unused_imports)]
Expand Down Expand Up @@ -130,6 +131,15 @@ impl MatrixView {
}
}

pub fn row(&self, i: usize) -> Option<Float32Array> {
if i >= self.num_rows() {
None
} else {
let slice_arr = self.data.slice(i * self.num_columns(), self.num_columns());
Some(as_primitive_array(slice_arr.as_ref()).clone())
}
}

/// (Lazy) transpose of the matrix.
///
pub fn transpose(&self) -> Self {
Expand Down Expand Up @@ -225,6 +235,25 @@ impl MatrixView {
}
}

impl TryFrom<&FixedSizeListArray> for MatrixView {
type Error = Error;

fn try_from(fsl: &FixedSizeListArray) -> Result<Self> {
if !matches!(fsl.value_type(), DataType::Float32) {
return Err(Error::Arrow(format!(
"Only support convert f32 FixedSizeListArray to MatrixView, got {}",
fsl.data_type()
)));
}
let values = fsl.values();
Ok(Self {
data: Arc::new(as_primitive_array(values.as_ref()).clone()),
num_columns: fsl.value_length() as usize,
transpose: false,
})
}
}

/// Single Value Decomposition.
///
/// <https://en.wikipedia.org/wiki/Singular_value_decomposition>
Expand Down
2 changes: 1 addition & 1 deletion rust/src/bin/lq.rs
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ async fn create_index(
&[&col],
lance::index::IndexType::Vector,
name.clone(),
&VectorIndexParams::ivf_pq(*num_partitions, 8, *num_sub_vectors, mt),
&VectorIndexParams::ivf_pq(*num_partitions, 8, *num_sub_vectors, true, mt, 50),
false,
)
.await
Expand Down
43 changes: 30 additions & 13 deletions rust/src/dataset.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,10 +36,8 @@ use self::scanner::Scanner;
use crate::arrow::*;
use crate::datatypes::Schema;
use crate::format::{pb, Fragment, Index, Manifest};
use crate::index::{
vector::{ivf::IvfPqIndexBuilder, VectorIndexParams},
IndexBuilder, IndexParams, IndexType,
};
use crate::index::vector::ivf::{build_ivf_pq_index, IvfBuildParams, PQBuildParams};
use crate::index::{vector::VectorIndexParams, IndexParams, IndexType};
use crate::io::{
object_reader::{read_message, read_struct},
read_manifest, read_metadata_offset, write_manifest, FileReader, FileWriter, ObjectStore,
Expand Down Expand Up @@ -290,7 +288,7 @@ impl Dataset {
let base = object_store.base_path().clone();
Ok(Self {
object_store,
base,
base: base.into(),
manifest: Arc::new(manifest.clone()),
})
}
Expand Down Expand Up @@ -394,16 +392,27 @@ impl Dataset {
}
}

let builder = IvfPqIndexBuilder::try_new(
let ivf_params = IvfBuildParams {
num_partitions: vec_params.num_partitions as usize,
metric_type: vec_params.metric_type,
max_iters: vec_params.max_iterations,
};
let pq_params = PQBuildParams {
num_sub_vectors: vec_params.num_sub_vectors as usize,
num_bits: 8,
metric_type: vec_params.metric_type,
use_opq: vec_params.use_opq,
max_iters: vec_params.max_iterations,
};
build_ivf_pq_index(
self,
index_id,
&index_name,
column,
vec_params.num_partitions,
vec_params.num_sub_vectors,
vec_params.metric_type,
)?;
builder.build().await?
&index_name,
&index_id,
&ivf_params,
&pq_params,
)
.await?
}
}

Expand Down Expand Up @@ -534,6 +543,14 @@ impl Dataset {
Ok(as_struct_array(&reordered).into())
}

/// Sample `n` rows from the dataset.
pub(crate) async fn sample(&self, n: usize, projection: &Schema) -> Result<RecordBatch> {
use rand::seq::IteratorRandom;
let num_rows = self.count_rows().await?;
let ids = (0..num_rows).choose_multiple(&mut rand::thread_rng(), n);
Ok(self.take(&ids[..], &projection).await?)
}

pub(crate) fn object_store(&self) -> &ObjectStore {
&self.object_store
}
Expand Down
38 changes: 37 additions & 1 deletion rust/src/index/vector.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,15 @@ mod kmeans;
mod opq;
mod pq;

use super::IndexParams;
use super::{pb, IndexParams};
use crate::{
arrow::linalg::MatrixView,
utils::distance::{cosine::cosine_distance, l2::l2_distance},
Error, Result,
};

const MAX_ITERATIONS: usize = 50;

/// Query parameters for the vector indices
#[derive(Debug, Clone)]
pub struct Query {
Expand Down Expand Up @@ -79,6 +82,26 @@ pub trait VectorIndex {
async fn search(&self, query: &Query) -> Result<RecordBatch>;
}

/// Transformer on vectors.
#[async_trait]
pub trait Transformer: std::fmt::Debug + Sync + Send {
/// Train the transformer.
///
/// Parameters:
/// - *data*: training vectors.
async fn train(&mut self, data: &MatrixView) -> Result<()>;

/// Apply transform on the matrix `data`.
///
/// Returns a new Matrix instead.
async fn transform(&self, data: &MatrixView) -> Result<MatrixView>;

/// Try to convert into protobuf.
///
/// TODO: can we use TryFrom/TryInto as trait constrats?
fn try_into_pb(&self) -> Result<pb::Transform>;
}

/// Distance metrics type.
#[derive(Debug, Copy, Clone, PartialEq)]
pub enum MetricType {
Expand Down Expand Up @@ -150,11 +173,17 @@ pub struct VectorIndexParams {
/// the number of bits to present the centroids used in PQ.
pub nbits: u8,

/// Use Optimized Product Quantizer.
pub use_opq: bool,

/// the number of sub vectors used in PQ.
pub num_sub_vectors: u32,

/// Vector distance metrics type.
pub metric_type: MetricType,

/// Max number of iterations to train a KMean model
pub max_iterations: usize,
}

impl VectorIndexParams {
Expand All @@ -165,17 +194,22 @@ impl VectorIndexParams {
/// - `num_partitions`: the number of IVF partitions.
/// - `nbits`: the number of bits to present the centroids used in PQ. Can only be `8` for now.
/// - `num_sub_vectors`: the number of sub vectors used in PQ.
/// - `metric_type`: how to compute distance, i.e., `L2` or `Cosine`.
pub fn ivf_pq(
num_partitions: u32,
nbits: u8,
num_sub_vectors: u32,
use_opq: bool,
metric_type: MetricType,
max_iterations: usize,
) -> Self {
Self {
num_partitions,
nbits,
num_sub_vectors,
use_opq,
metric_type,
max_iterations,
}
}
}
Expand All @@ -186,7 +220,9 @@ impl Default for VectorIndexParams {
num_partitions: 32,
nbits: 8,
num_sub_vectors: 16,
use_opq: true,
metric_type: MetricType::L2,
max_iterations: MAX_ITERATIONS, // Faiss
}
}
}
Expand Down
Loading

0 comments on commit 299af4a

Please sign in to comment.