diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py index 9d03323114..2ab55250a1 100644 --- a/python/python/lance/dataset.py +++ b/python/python/lance/dataset.py @@ -1696,17 +1696,25 @@ def create_index( if c not in self.schema.names: raise KeyError(f"{c} not found in schema") field = self.schema.field(c) - if not ( - pa.types.is_fixed_size_list(field.type) - or ( - isinstance(field.type, pa.FixedShapeTensorType) - and len(field.type.shape) == 1 - ) + if pa.types.is_fixed_size_list(field.type): + dimension = field.type.list_size + elif ( + isinstance(field.type, pa.FixedShapeTensorType) + and len(field.type.shape) == 1 ): + dimension = field.type.shape[0] + else: raise TypeError( f"Vector column {c} must be FixedSizeListArray " f"1-dimensional FixedShapeTensorArray, got {field.type}" ) + + if num_sub_vectors is not None and dimension % num_sub_vectors != 0: + raise ValueError( + f"dimension ({dimension}) must be divisible by num_sub_vectors" + f" ({num_sub_vectors})" + ) + if not pa.types.is_floating(field.type.value_type): raise TypeError( f"Vector column {c} must have floating value type, " diff --git a/python/python/lance/indices.py b/python/python/lance/indices.py index d1065bf430..3055a5d3fc 100644 --- a/python/python/lance/indices.py +++ b/python/python/lance/indices.py @@ -545,8 +545,8 @@ def _normalize_pq_params(self, num_subvectors: int, dimension: int): ) if dimension % num_subvectors != 0: raise ValueError( - "dimension ({dimension}) must be divisible by num_subvectors" - " ({num_subvectors}) without remainder" + f"dimension ({dimension}) must be divisible by num_subvectors" + f" ({num_subvectors}) without remainder" ) return num_subvectors diff --git a/python/python/tests/test_indices.py b/python/python/tests/test_indices.py index 72c08540ff..26ab6e9916 100644 --- a/python/python/tests/test_indices.py +++ b/python/python/tests/test_indices.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright The Lance Authors +import math import os import pathlib @@ -93,7 +94,9 @@ def test_ivf_centroids_cuda(rand_dataset): ) assert ivf.distance_type == "l2" - assert len(ivf.centroids) == NUM_PARTITIONS + # Can't use NUM_PARTITIONS here because + # CUDA uses math.ceil and CPU uses round to calc. num_partitions + assert len(ivf.centroids) == math.ceil(np.sqrt(NUM_ROWS)) @pytest.mark.cuda @@ -156,6 +159,16 @@ def test_gen_pq(tmpdir, rand_dataset, rand_ivf): assert pq.codebook == reloaded.codebook +def test_pq_invalid_sub_vectors(tmpdir, rand_dataset, rand_ivf): + with pytest.raises( + ValueError, + match="must be divisible by num_subvectors .* without remainder", + ): + IndicesBuilder(rand_dataset, "vectors").train_pq( + rand_ivf, sample_rate=2, num_subvectors=5 + ) + + def test_gen_pq_mostly_null(mostly_null_dataset): centroids = np.random.rand(DIMENSION * 100).astype(np.float32) centroids = pa.FixedSizeListArray.from_arrays(centroids, DIMENSION) diff --git a/python/python/tests/test_vector_index.py b/python/python/tests/test_vector_index.py index 9164036fd7..247f595198 100644 --- a/python/python/tests/test_vector_index.py +++ b/python/python/tests/test_vector_index.py @@ -135,6 +135,35 @@ def func(rs: pa.Table): run(dataset, q=np.array(q), assert_func=func) +def test_invalid_subvectors(tmp_path): + tbl = create_table() + dataset = lance.write_dataset(tbl, tmp_path) + with pytest.raises( + ValueError, + match="dimension .* must be divisible by num_sub_vectors", + ): + dataset.create_index( + "vector", index_type="IVF_PQ", num_partitions=4, num_sub_vectors=15 + ) + + +@pytest.mark.cuda +def test_invalid_subvectors_cuda(tmp_path): + tbl = create_table() + dataset = lance.write_dataset(tbl, tmp_path) + with pytest.raises( + ValueError, + match="dimension .* must be divisible by num_sub_vectors", + ): + dataset.create_index( + "vector", + index_type="IVF_PQ", + num_partitions=4, + num_sub_vectors=15, + accelerator="cuda", + ) + + def test_index_with_nans(tmp_path): # 1024 rows, the entire table should be sampled tbl = create_table(nvec=1000, nans=24)