lancedb · westonpace · Oct 29, 2024 · Oct 28, 2024 · Oct 28, 2024 · Oct 28, 2024
diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py
@@ -1696,17 +1696,25 @@ def create_index(
             if c not in self.schema.names:
                 raise KeyError(f"{c} not found in schema")
             field = self.schema.field(c)
-            if not (
-                pa.types.is_fixed_size_list(field.type)
-                or (
-                    isinstance(field.type, pa.FixedShapeTensorType)
-                    and len(field.type.shape) == 1
-                )
+            if pa.types.is_fixed_size_list(field.type):
+                dimension = field.type.list_size
+            elif (
+                isinstance(field.type, pa.FixedShapeTensorType)
+                and len(field.type.shape) == 1
             ):
+                dimension = field.type.shape[0]
+            else:
                 raise TypeError(
                     f"Vector column {c} must be FixedSizeListArray "
                     f"1-dimensional FixedShapeTensorArray, got {field.type}"
                 )
+
+            if num_sub_vectors is not None and dimension % num_sub_vectors != 0:
+                raise ValueError(
+                    f"dimension ({dimension}) must be divisible by num_sub_vectors"
+                    f" ({num_sub_vectors})"
+                )
+
             if not pa.types.is_floating(field.type.value_type):
                 raise TypeError(
                     f"Vector column {c} must have floating value type, "

diff --git a/python/python/lance/indices.py b/python/python/lance/indices.py
@@ -545,8 +545,8 @@ def _normalize_pq_params(self, num_subvectors: int, dimension: int):
             )
         if dimension % num_subvectors != 0:
             raise ValueError(
-                "dimension ({dimension}) must be divisible by num_subvectors"
-                " ({num_subvectors}) without remainder"
+                f"dimension ({dimension}) must be divisible by num_subvectors"
+                f" ({num_subvectors}) without remainder"
             )
         return num_subvectors
 

diff --git a/python/python/tests/test_indices.py b/python/python/tests/test_indices.py
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright The Lance Authors
+import math
 import os
 import pathlib
 
@@ -93,7 +94,9 @@ def test_ivf_centroids_cuda(rand_dataset):
     )
 
     assert ivf.distance_type == "l2"
-    assert len(ivf.centroids) == NUM_PARTITIONS
+    # Can't use NUM_PARTITIONS here because
+    # CUDA uses math.ceil and CPU uses round to calc. num_partitions
+    assert len(ivf.centroids) == math.ceil(np.sqrt(NUM_ROWS))
 
 
 @pytest.mark.cuda
@@ -156,6 +159,16 @@ def test_gen_pq(tmpdir, rand_dataset, rand_ivf):
     assert pq.codebook == reloaded.codebook
 
 
+def test_pq_invalid_sub_vectors(tmpdir, rand_dataset, rand_ivf):
+    with pytest.raises(
+        ValueError,
+        match="must be divisible by num_subvectors .* without remainder",
+    ):
+        IndicesBuilder(rand_dataset, "vectors").train_pq(
+            rand_ivf, sample_rate=2, num_subvectors=5
+        )
+
+
 def test_gen_pq_mostly_null(mostly_null_dataset):
     centroids = np.random.rand(DIMENSION * 100).astype(np.float32)
     centroids = pa.FixedSizeListArray.from_arrays(centroids, DIMENSION)

diff --git a/python/python/tests/test_vector_index.py b/python/python/tests/test_vector_index.py
@@ -135,6 +135,35 @@ def func(rs: pa.Table):
     run(dataset, q=np.array(q), assert_func=func)
 
 
+def test_invalid_subvectors(tmp_path):
+    tbl = create_table()
+    dataset = lance.write_dataset(tbl, tmp_path)
+    with pytest.raises(
+        ValueError,
+        match="dimension .* must be divisible by num_sub_vectors",
+    ):
+        dataset.create_index(
+            "vector", index_type="IVF_PQ", num_partitions=4, num_sub_vectors=15
+        )
+
+
+@pytest.mark.cuda
+def test_invalid_subvectors_cuda(tmp_path):
+    tbl = create_table()
+    dataset = lance.write_dataset(tbl, tmp_path)
+    with pytest.raises(
+        ValueError,
+        match="dimension .* must be divisible by num_sub_vectors",
+    ):
+        dataset.create_index(
+            "vector",
+            index_type="IVF_PQ",
+            num_partitions=4,
+            num_sub_vectors=15,
+            accelerator="cuda",
+        )
+
+
 def test_index_with_nans(tmp_path):
     # 1024 rows, the entire table should be sampled
     tbl = create_table(nvec=1000, nans=24)