-
Notifications
You must be signed in to change notification settings - Fork 245
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Streaming PQ #689
Streaming PQ #689
Changes from 20 commits
4c97641
4525466
2029a23
6b3646f
762d8bd
2973fd2
96baecb
03794f6
4cc82eb
307d781
b249b1a
3d9d8db
f231306
c5a47bf
9fe206a
a500810
503265f
9a9225c
f364ee8
3ecd78d
e4a5b12
8c6c7ad
3db88c4
6a08edc
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -36,10 +36,8 @@ use self::scanner::Scanner; | |
use crate::arrow::*; | ||
use crate::datatypes::Schema; | ||
use crate::format::{pb, Fragment, Index, Manifest}; | ||
use crate::index::{ | ||
vector::{ivf::IvfPqIndexBuilder, VectorIndexParams}, | ||
IndexBuilder, IndexParams, IndexType, | ||
}; | ||
use crate::index::vector::ivf::{build_ivf_pq_index, IvfBuildParams, PQBuildParams}; | ||
use crate::index::{vector::VectorIndexParams, IndexParams, IndexType}; | ||
use crate::io::{ | ||
object_reader::{read_message, read_struct}, | ||
read_manifest, read_metadata_offset, write_manifest, FileReader, FileWriter, ObjectStore, | ||
|
@@ -277,7 +275,7 @@ impl Dataset { | |
let base = object_store.base_path().clone(); | ||
Ok(Self { | ||
object_store, | ||
base, | ||
base: base.into(), | ||
manifest: Arc::new(manifest.clone()), | ||
}) | ||
} | ||
|
@@ -381,16 +379,27 @@ impl Dataset { | |
} | ||
} | ||
|
||
let builder = IvfPqIndexBuilder::try_new( | ||
let ivf_params = IvfBuildParams { | ||
num_partitions: vec_params.num_partitions as usize, | ||
metric_type: vec_params.metric_type, | ||
max_iters: 50, | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Are these values from FAISS? |
||
}; | ||
let pq_params = PQBuildParams { | ||
num_sub_vectors: vec_params.num_sub_vectors as usize, | ||
num_bits: 8, | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Do we have to change much to support a configurable num_bits? Is it just a matter of exposing a new API parameter, or is the underlying index creation hard-coded to 8 bits? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
|
||
metric_type: vec_params.metric_type, | ||
use_opq: true, | ||
max_iters: 100, | ||
}; | ||
build_ivf_pq_index( | ||
self, | ||
index_id, | ||
&index_name, | ||
column, | ||
vec_params.num_partitions, | ||
vec_params.num_sub_vectors, | ||
vec_params.metric_type, | ||
)?; | ||
builder.build().await? | ||
&index_name, | ||
&index_id, | ||
&ivf_params, | ||
&pq_params, | ||
) | ||
.await? | ||
} | ||
} | ||
|
||
|
@@ -521,6 +530,14 @@ impl Dataset { | |
Ok(as_struct_array(&reordered).into()) | ||
} | ||
|
||
/// Sample `n` rows from the dataset. | ||
pub(crate) async fn sample(&self, n: usize, projection: &Schema) -> Result<RecordBatch> { | ||
use rand::seq::IteratorRandom; | ||
let num_rows = self.count_rows().await?; | ||
let ids = (0..num_rows).choose_multiple(&mut rand::thread_rng(), n); | ||
Ok(self.take(&ids[..], &projection).await?) | ||
} | ||
|
||
/// Borrow the [`ObjectStore`] held by this dataset.
pub(crate) fn object_store(&self) -> &ObjectStore {
&self.object_store
}
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should this be an error?