feat: support remapping for IVF_FLAT, IVF_PQ and IVF_SQ #2708

Merged: 19 commits, Dec 20, 2024
Changes from 13 commits
1,420 changes: 787 additions & 633 deletions Cargo.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion python/Cargo.lock

Some generated files are not rendered by default.

7 changes: 6 additions & 1 deletion rust/lance-index/src/lib.rs
@@ -140,7 +140,12 @@ impl IndexType {
pub fn is_vector(&self) -> bool {
matches!(
self,
Self::Vector | Self::IvfPq | Self::IvfHnswSq | Self::IvfHnswPq
Self::Vector
| Self::IvfPq
| Self::IvfHnswSq
| Self::IvfHnswPq
| Self::IvfFlat
| Self::IvfSq
)
}
}
2 changes: 1 addition & 1 deletion rust/lance-index/src/vector.rs
@@ -182,7 +182,7 @@ pub trait VectorIndex: Send + Sync + std::fmt::Debug + Index {
///
/// If an old row id is not in the mapping then it should be
/// left alone.
fn remap(&mut self, mapping: &HashMap<u64, Option<u64>>) -> Result<()>;
async fn remap(&mut self, mapping: &HashMap<u64, Option<u64>>) -> Result<()>;

/// The metric type of this vector index.
fn metric_type(&self) -> DistanceType;
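The doc comment above defines the remap contract: a row id absent from the mapping is left alone, `Some(None)` means the row was deleted, and `Some(Some(new_id))` rewrites the id. A self-contained sketch of that contract (`apply_mapping` is a hypothetical helper, not part of the lance API):

```rust
use std::collections::HashMap;

// Hypothetical helper illustrating the remap contract documented on
// `VectorIndex::remap`: ids absent from the mapping are kept unchanged,
// `Some(None)` entries are dropped, `Some(Some(new_id))` are rewritten.
fn apply_mapping(row_ids: &[u64], mapping: &HashMap<u64, Option<u64>>) -> Vec<u64> {
    row_ids
        .iter()
        .filter_map(|id| match mapping.get(id) {
            Some(Some(new_id)) => Some(*new_id), // row moved during compaction
            Some(None) => None,                  // row was deleted
            None => Some(*id),                   // untouched row: keep as-is
        })
        .collect()
}

fn main() {
    let mapping = HashMap::from([(1u64, Some(10u64)), (2, None)]);
    assert_eq!(apply_mapping(&[1, 2, 3], &mapping), vec![10, 3]);
}
```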
5 changes: 5 additions & 0 deletions rust/lance-index/src/vector/flat/index.rs
@@ -4,6 +4,7 @@
//! Flat Vector Index.
//!

use std::collections::HashMap;
use std::sync::Arc;

use arrow::array::AsArray;
@@ -134,6 +135,10 @@ impl IvfSubIndex for FlatIndex {
Ok(Self {})
}

fn remap(&self, _: &HashMap<u64, Option<u64>>) -> Result<Self> {
Ok(self.clone())
Contributor: nit: let's add a warning log here?

Contributor: oh wait, shouldn't we remap the sub index here?

Author: for v3 we need to remap the sub index & the vector storage. The flat index doesn't contain anything, so it simply returns itself here.

Contributor: the flat map is still { origin_vector: row_id }? If row ids change during compaction, don't we need to remap them?

Author: remap on a v3 vector index means:

  • remap the sub index
  • remap the storage

For IVF_FLAT, the sub index is FLAT and the storage is FlatStorage. The FLAT sub index doesn't contain any data, so nothing needs to happen here; the remapping happens on FlatStorage.
}

fn to_batch(&self) -> Result<RecordBatch> {
Ok(RecordBatch::new_empty(Schema::empty().into()))
}
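The author's description above (a v3 remap is "remap the sub index, then remap the storage") can be sketched with simplified stand-ins for the real lance-index types; `Flat`, `Storage`, and `remap_v3` below are illustrative, not the actual API:

```rust
use std::collections::HashMap;

type RowIdMap = HashMap<u64, Option<u64>>;

// Stand-in for the FLAT sub index: it holds no per-row state, so its remap
// is the identity, exactly like `FlatIndex::remap` in this diff.
#[derive(Clone, Debug, PartialEq)]
struct Flat;

impl Flat {
    fn remap(&self, _mapping: &RowIdMap) -> Self {
        self.clone() // nothing to rewrite: all row ids live in storage
    }
}

// Stand-in for FlatStorage: this is where the row ids actually live.
#[derive(Clone, Debug, PartialEq)]
struct Storage {
    row_ids: Vec<u64>,
}

impl Storage {
    fn remap(&self, mapping: &RowIdMap) -> Self {
        let row_ids = self
            .row_ids
            .iter()
            .filter_map(|id| match mapping.get(id) {
                Some(Some(new_id)) => Some(*new_id),
                Some(None) => None,
                None => Some(*id),
            })
            .collect();
        Self { row_ids }
    }
}

// A v3 remap is the composition of the two steps the author describes.
fn remap_v3(index: &Flat, storage: &Storage, mapping: &RowIdMap) -> (Flat, Storage) {
    (index.remap(mapping), storage.remap(mapping))
}
```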
2 changes: 0 additions & 2 deletions rust/lance-index/src/vector/flat/storage.rs
@@ -1,8 +1,6 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors

//! In-memory graph representations.

use std::sync::Arc;

use crate::vector::quantizer::QuantizerStorage;
4 changes: 4 additions & 0 deletions rust/lance-index/src/vector/hnsw/builder.rs
@@ -749,6 +749,10 @@ impl IvfSubIndex for HNSW {
Ok(hnsw)
}

fn remap(&self, _mapping: &HashMap<u64, Option<u64>>) -> Result<Self> {
unimplemented!("HNSW remap is not supported yet");
}

/// Encode the sub index into a record batch
fn to_batch(&self) -> Result<RecordBatch> {
let mut vector_id_builder = UInt32Builder::with_capacity(self.len());
2 changes: 1 addition & 1 deletion rust/lance-index/src/vector/hnsw/index.rs
@@ -267,7 +267,7 @@ impl<Q: Quantization + Send + Sync + 'static> VectorIndex for HNSWIndex<Q> {
Box::new(self.storage.as_ref().unwrap().row_ids())
}

fn remap(&mut self, _mapping: &HashMap<u64, Option<u64>>) -> Result<()> {
async fn remap(&mut self, _mapping: &HashMap<u64, Option<u64>>) -> Result<()> {
Err(Error::Index {
message: "Remapping HNSW in this way not supported".to_string(),
location: location!(),
4 changes: 3 additions & 1 deletion rust/lance-index/src/vector/quantizer.rs
@@ -23,7 +23,9 @@ use super::flat::index::{FlatBinQuantizer, FlatQuantizer};
use super::pq::ProductQuantizer;
use super::{ivf::storage::IvfModel, sq::ScalarQuantizer, storage::VectorStore};

pub trait Quantization: Send + Sync + Debug + DeepSizeOf + Into<Quantizer> {
pub trait Quantization:
Send + Sync + Debug + DeepSizeOf + Into<Quantizer> + TryFrom<Quantizer, Error = lance_core::Error>
{
type BuildParams: QuantizerBuildParams;
type Metadata: QuantizerMetadata + Send + Sync;
type Storage: QuantizerStorage<Metadata = Self::Metadata> + VectorStore + Debug;
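The new `TryFrom<Quantizer, Error = lance_core::Error>` bound pairs with the existing `Into<Quantizer>` bound so that code holding the type-erased `Quantizer` enum can recover the concrete quantizer later. A minimal sketch of that round trip, using simplified stand-ins for the real `Quantizer`/`ScalarQuantizer` types:

```rust
// Simplified stand-in for lance-index's ScalarQuantizer.
#[derive(Clone, Debug, PartialEq)]
struct ScalarQuantizer {
    num_bits: u16,
}

// Simplified stand-in for the type-erased Quantizer enum.
#[derive(Clone, Debug)]
enum Quantizer {
    Scalar(ScalarQuantizer),
    Flat,
}

// `Into<Quantizer>`: erase the concrete type.
impl From<ScalarQuantizer> for Quantizer {
    fn from(sq: ScalarQuantizer) -> Self {
        Quantizer::Scalar(sq)
    }
}

// `TryFrom<Quantizer>`: recover it, failing on a mismatched variant.
impl TryFrom<Quantizer> for ScalarQuantizer {
    type Error = String;

    fn try_from(q: Quantizer) -> Result<Self, Self::Error> {
        match q {
            Quantizer::Scalar(sq) => Ok(sq),
            _ => Err("not a scalar quantizer".to_string()),
        }
    }
}

fn main() {
    let sq = ScalarQuantizer { num_bits: 8 };
    let erased: Quantizer = sq.clone().into();
    assert_eq!(ScalarQuantizer::try_from(erased).unwrap(), sq);
    assert!(ScalarQuantizer::try_from(Quantizer::Flat).is_err());
}
```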
46 changes: 45 additions & 1 deletion rust/lance-index/src/vector/storage.rs
@@ -3,10 +3,13 @@

//! Vector Storage, holding (quantized) vectors and providing distance calculation.

use std::collections::HashMap;
use std::{any::Any, sync::Arc};

use arrow::array::AsArray;
use arrow::compute::concat_batches;
use arrow_array::{ArrayRef, RecordBatch};
use arrow::datatypes::UInt64Type;
use arrow_array::{ArrayRef, RecordBatch, UInt32Array, UInt64Array};
use arrow_schema::{Field, SchemaRef};
use deepsize::DeepSizeOf;
use futures::prelude::stream::TryStreamExt;
@@ -72,6 +75,43 @@ pub trait VectorStore: Send + Sync + Sized + Clone {

fn to_batches(&self) -> Result<impl Iterator<Item = RecordBatch>>;

fn remap(&self, mapping: &HashMap<u64, Option<u64>>) -> Result<Self> {
let batches = self
.to_batches()?
.map(|b| {
let mut indices = Vec::with_capacity(b.num_rows());
let mut new_row_ids = Vec::with_capacity(b.num_rows());

let row_ids = b.column(0).as_primitive::<UInt64Type>().values();
for (i, row_id) in row_ids.iter().enumerate() {
match mapping.get(row_id) {
Some(Some(new_id)) => {
indices.push(i as u32);
new_row_ids.push(*new_id);
}
Some(None) => {}
None => {
indices.push(i as u32);
new_row_ids.push(*row_id);
}
}
}

let indices = UInt32Array::from(indices);
let new_row_ids = Arc::new(UInt64Array::from(new_row_ids));
let new_vectors = arrow::compute::take(b.column(1), &indices, None)?;

Ok(RecordBatch::try_new(
self.schema().clone(),
vec![new_row_ids, new_vectors],
)?)
})
.collect::<Result<Vec<_>>>()?;

let batch = concat_batches(self.schema(), batches.iter())?;
Self::try_from_batch(batch, self.distance_type())
Comment on lines +111 to +112
Contributor: I guess remap is already slow, so it probably doesn't matter, but it seems odd that we would need to concat here.

Author: yeah, it's because try_from_batch is not trivial; e.g. for PQ storage, it would transpose the PQ codes.

}

fn len(&self) -> usize;

/// Returns true if this graph is empty.
Expand Down Expand Up @@ -219,6 +259,10 @@ impl IvfQuantizationStorage {
Q::from_metadata(&metadata, self.distance_type)
}

pub fn schema(&self) -> SchemaRef {
Arc::new(self.reader.schema().as_ref().into())
}

/// Get the number of partitions in the storage.
pub fn num_partitions(&self) -> usize {
self.ivf.num_partitions()
32 changes: 28 additions & 4 deletions rust/lance-index/src/vector/v3/shuffler.rs
@@ -8,15 +8,18 @@ use std::sync::Arc;

use arrow::{array::AsArray, compute::sort_to_indices};
use arrow_array::{RecordBatch, UInt32Array};
use arrow_schema::Schema;
use future::join_all;
use futures::prelude::*;
use lance_arrow::RecordBatchExt;
use itertools::Itertools;
use lance_arrow::{RecordBatchExt, SchemaExt};
use lance_core::{
cache::FileMetadataCache,
utils::tokio::{get_num_compute_intensive_cpus, spawn_cpu},
Error, Result,
};
use lance_encoding::decoder::{DecoderPlugins, FilterExpression};
use lance_file::v2::reader::ReaderProjection;
use lance_file::v2::{
reader::{FileReader, FileReaderOptions},
writer::FileWriter,
@@ -256,14 +259,35 @@ impl ShuffleReader for IvfShufflerReader {
FileReaderOptions::default(),
)
.await?;
let schema = reader.schema().as_ref().into();

let schema: Schema = reader.schema().as_ref().into();
let projection = schema
.fields()
.iter()
.enumerate()
.filter_map(|(index, f)| {
if f.name() != PART_ID_COLUMN {
Some(index)
} else {
None
}
})
.collect::<Vec<_>>();
let schema = schema.project(&projection)?;
let projection = ReaderProjection::from_column_names(
reader.schema().as_ref(),
&schema
.field_names()
.into_iter()
.map(|s| s.as_ref())
.collect_vec(),
)?;
Ok(Some(Box::new(RecordBatchStreamAdapter::new(
Arc::new(schema),
reader.read_stream(
reader.read_stream_projected(
lance_io::ReadBatchParams::RangeFull,
4096,
16,
projection,
Author: we don't need the part_id in the batch; just don't read it, to save resources.

FilterExpression::no_filter(),
)?,
))))
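The projection logic above keeps the indices of every field except the partition-id column, then reads only those columns, as the author notes, to avoid paying for data nobody downstream needs. A std-only sketch of the same filtering step (the column names below, including the part-id name, are illustrative stand-ins; the real code compares against the `PART_ID_COLUMN` constant):

```rust
// Build a projection that keeps every column except the partition id: the
// same enumerate/filter over schema fields the reader above performs before
// constructing its `ReaderProjection`.
fn project_out(fields: &[&str], excluded: &str) -> Vec<usize> {
    fields
        .iter()
        .enumerate()
        .filter_map(|(index, name)| (*name != excluded).then_some(index))
        .collect()
}

fn main() {
    // Hypothetical schema: part id first, then the payload columns.
    let fields = ["part_id", "row_id", "vector"];
    let kept = project_out(&fields, "part_id");
    assert_eq!(kept, vec![1, 2]); // part_id is never read from disk
}
```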
5 changes: 5 additions & 0 deletions rust/lance-index/src/vector/v3/subindex.rs
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors

use std::collections::HashMap;
use std::fmt::Debug;
use std::sync::Arc;

@@ -49,6 +50,10 @@ pub trait IvfSubIndex: Send + Sync + Debug + DeepSizeOf {
where
Self: Sized;

fn remap(&self, mapping: &HashMap<u64, Option<u64>>) -> Result<Self>
where
Self: Sized;

/// Encode the sub index into a record batch
fn to_batch(&self) -> Result<RecordBatch>;
}
134 changes: 61 additions & 73 deletions rust/lance/src/index/vector.rs
@@ -41,6 +41,7 @@ use object_store::path::Path;
use snafu::{location, Location};
use tempfile::tempdir;
use tracing::instrument;
use utils::get_vector_element_type;
use uuid::Uuid;

use self::{ivf::*, pq::PQIndex};
@@ -253,57 +254,39 @@ pub(crate) async fn build_vector_index(
let temp_dir_path = Path::from_filesystem_path(temp_dir.path())?;
let shuffler = IvfShuffler::new(temp_dir_path, ivf_params.num_partitions);
if is_ivf_flat(stages) {
let data_type = dataset
.schema()
.field(column)
.ok_or(Error::Schema {
message: format!("Column {} not found in schema", column),
location: location!(),
})?
.data_type();
match data_type {
DataType::FixedSizeList(f, _) => match f.data_type() {
DataType::Float16 | DataType::Float32 | DataType::Float64 => {
IvfIndexBuilder::<FlatIndex, FlatQuantizer>::new(
dataset.clone(),
column.to_owned(),
dataset.indices_dir().child(uuid),
params.metric_type,
Box::new(shuffler),
Some(ivf_params.clone()),
Some(()),
(),
)?
.build()
.await?;
}
DataType::UInt8 => {
IvfIndexBuilder::<FlatIndex, FlatBinQuantizer>::new(
dataset.clone(),
column.to_owned(),
dataset.indices_dir().child(uuid),
params.metric_type,
Box::new(shuffler),
Some(ivf_params.clone()),
Some(()),
(),
)?
.build()
.await?;
}
_ => {
return Err(Error::Index {
message: format!(
"Build Vector Index: invalid data type: {:?}",
f.data_type()
),
location: location!(),
});
}
},
let element_type = get_vector_element_type(dataset, column)?;
match element_type {
DataType::Float16 | DataType::Float32 | DataType::Float64 => {
IvfIndexBuilder::<FlatIndex, FlatQuantizer>::new(
dataset.clone(),
column.to_owned(),
dataset.indices_dir().child(uuid),
params.metric_type,
Box::new(shuffler),
Some(ivf_params.clone()),
Some(()),
(),
)?
.build()
.await?;
}
DataType::UInt8 => {
IvfIndexBuilder::<FlatIndex, FlatBinQuantizer>::new(
dataset.clone(),
column.to_owned(),
dataset.indices_dir().child(uuid),
params.metric_type,
Box::new(shuffler),
Some(ivf_params.clone()),
Some(()),
(),
)?
.build()
.await?;
}
Comment on lines +257 to +286
Contributor: Why did this change?

Author: I noticed many lines were doing the same thing (get the vector data type / value type and check it), so I factored that into the get_vector_element_type function.

_ => {
return Err(Error::Index {
message: format!("Build Vector Index: invalid data type: {:?}", data_type),
message: format!("Build Vector Index: invalid data type: {:?}", element_type),
location: location!(),
});
}
@@ -416,30 +399,35 @@ pub(crate) async fn remap_vector_index(
.open_vector_index(column, &old_uuid.to_string())
.await?;
old_index.check_can_remap()?;
let ivf_index: &IVFIndex =
old_index
.as_any()
.downcast_ref()
.ok_or_else(|| Error::NotSupported {
source: "Only IVF indexes can be remapped currently".into(),
location: location!(),
})?;

remap_index_file(
dataset.as_ref(),
&old_uuid.to_string(),
&new_uuid.to_string(),
old_metadata.dataset_version,
ivf_index,
mapping,
old_metadata.name.clone(),
column.to_string(),
// We can safely assume there are no transforms today. We assert above that the
// top stage is IVF and IVF does not support transforms between IVF and PQ. This
// will be fixed in the future.
vec![],
)
.await?;

if let Some(ivf_index) = old_index.as_any().downcast_ref::<IVFIndex>() {
remap_index_file(
dataset.as_ref(),
&old_uuid.to_string(),
&new_uuid.to_string(),
old_metadata.dataset_version,
ivf_index,
mapping,
old_metadata.name.clone(),
column.to_string(),
// We can safely assume there are no transforms today. We assert above that the
// top stage is IVF and IVF does not support transforms between IVF and PQ. This
// will be fixed in the future.
vec![],
)
.await?;
} else {
// it's v3 index
remap_index_file_v3(
dataset.as_ref(),
&new_uuid.to_string(),
old_index,
mapping,
column.to_string(),
)
.await?;
}

Ok(())
}
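The branch above dispatches between the two index generations by attempting a concrete downcast: if the opened index is the legacy `IVFIndex`, the v1 `remap_index_file` path runs; otherwise it is assumed to be a v3 index and `remap_index_file_v3` runs. The same pattern with `std::any::Any`, using simplified stand-in types:

```rust
use std::any::Any;

// Simplified stand-ins for the v1 IVFIndex and a v3 index; the real code
// downcasts the opened `VectorIndex` trait object the same way.
trait VectorIndex {
    fn as_any(&self) -> &dyn Any;
}

struct IvfIndexV1;
struct IvfIndexV3;

impl VectorIndex for IvfIndexV1 {
    fn as_any(&self) -> &dyn Any {
        self
    }
}

impl VectorIndex for IvfIndexV3 {
    fn as_any(&self) -> &dyn Any {
        self
    }
}

// Pick the remap path by probing the concrete type behind the trait object.
fn remap_path(index: &dyn VectorIndex) -> &'static str {
    if index.as_any().downcast_ref::<IvfIndexV1>().is_some() {
        "remap_index_file" // legacy v1 remap
    } else {
        "remap_index_file_v3" // anything else is treated as a v3 index
    }
}
```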
