From 3f2b1ee33c62dc786659607fa8e632a45eff0f4d Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Mon, 28 Oct 2024 13:35:11 -0700 Subject: [PATCH 1/3] Verify num_sub_vectors is valid before creating index --- python/python/lance/dataset.py | 20 +++++++++++----- python/python/lance/indices.py | 4 ++-- python/python/tests/test_indices.py | 10 ++++++++ python/python/tests/test_vector_index.py | 29 ++++++++++++++++++++++++ 4 files changed, 55 insertions(+), 8 deletions(-) diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py index 9d03323114..2ab55250a1 100644 --- a/python/python/lance/dataset.py +++ b/python/python/lance/dataset.py @@ -1696,17 +1696,25 @@ def create_index( if c not in self.schema.names: raise KeyError(f"{c} not found in schema") field = self.schema.field(c) - if not ( - pa.types.is_fixed_size_list(field.type) - or ( - isinstance(field.type, pa.FixedShapeTensorType) - and len(field.type.shape) == 1 - ) + if pa.types.is_fixed_size_list(field.type): + dimension = field.type.list_size + elif ( + isinstance(field.type, pa.FixedShapeTensorType) + and len(field.type.shape) == 1 ): + dimension = field.type.shape[0] + else: raise TypeError( f"Vector column {c} must be FixedSizeListArray " f"1-dimensional FixedShapeTensorArray, got {field.type}" ) + + if num_sub_vectors is not None and dimension % num_sub_vectors != 0: + raise ValueError( + f"dimension ({dimension}) must be divisible by num_sub_vectors" + f" ({num_sub_vectors})" + ) + if not pa.types.is_floating(field.type.value_type): raise TypeError( f"Vector column {c} must have floating value type, " diff --git a/python/python/lance/indices.py b/python/python/lance/indices.py index d1065bf430..3055a5d3fc 100644 --- a/python/python/lance/indices.py +++ b/python/python/lance/indices.py @@ -545,8 +545,8 @@ def _normalize_pq_params(self, num_subvectors: int, dimension: int): ) if dimension % num_subvectors != 0: raise ValueError( - "dimension ({dimension}) must be divisible by num_subvectors" - " ({num_subvectors}) without remainder" + f"dimension ({dimension}) must be divisible by num_subvectors" + f" ({num_subvectors}) without remainder" ) return num_subvectors diff --git a/python/python/tests/test_indices.py b/python/python/tests/test_indices.py index 72c08540ff..7d5315be27 100644 --- a/python/python/tests/test_indices.py +++ b/python/python/tests/test_indices.py @@ -156,6 +156,16 @@ def test_gen_pq(tmpdir, rand_dataset, rand_ivf): assert pq.codebook == reloaded.codebook +def test_pq_invalid_sub_vectors(tmpdir, rand_dataset, rand_ivf): + with pytest.raises( + ValueError, + match="must be divisible by num_subvectors .* without remainder", + ): + IndicesBuilder(rand_dataset, "vectors").train_pq( + rand_ivf, sample_rate=2, num_subvectors=5 + ) + + def test_gen_pq_mostly_null(mostly_null_dataset): centroids = np.random.rand(DIMENSION * 100).astype(np.float32) centroids = pa.FixedSizeListArray.from_arrays(centroids, DIMENSION) diff --git a/python/python/tests/test_vector_index.py b/python/python/tests/test_vector_index.py index 9164036fd7..247f595198 100644 --- a/python/python/tests/test_vector_index.py +++ b/python/python/tests/test_vector_index.py @@ -135,6 +135,35 @@ def func(rs: pa.Table): run(dataset, q=np.array(q), assert_func=func) +def test_invalid_subvectors(tmp_path): + tbl = create_table() + dataset = lance.write_dataset(tbl, tmp_path) + with pytest.raises( + ValueError, + match="dimension .* must be divisible by num_sub_vectors", + ): + dataset.create_index( + "vector", index_type="IVF_PQ", num_partitions=4, num_sub_vectors=15 + ) + + +@pytest.mark.cuda +def test_invalid_subvectors_cuda(tmp_path): + tbl = create_table() + dataset = lance.write_dataset(tbl, tmp_path) + with pytest.raises( + ValueError, + match="dimension .* must be divisible by num_sub_vectors", + ): + dataset.create_index( + "vector", + index_type="IVF_PQ", + num_partitions=4, + num_sub_vectors=15, + accelerator="cuda", + ) + + def test_index_with_nans(tmp_path): # 1024 rows, the entire table should be sampled tbl = create_table(nvec=1000, nans=24) From f5addd4c21df6bb677237ec0e9c10b36c7664d03 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Mon, 28 Oct 2024 14:01:34 -0700 Subject: [PATCH 2/3] Fix cuda test broken when the default num partitions logic changed to ceil instead of round at some point --- python/python/tests/test_indices.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/python/tests/test_indices.py b/python/python/tests/test_indices.py index 7d5315be27..2661e8d193 100644 --- a/python/python/tests/test_indices.py +++ b/python/python/tests/test_indices.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright The Lance Authors +import math import os import pathlib @@ -15,7 +16,7 @@ NUM_SUBVECTORS = 8 NUM_FRAGMENTS = 3 NUM_ROWS = NUM_ROWS_PER_FRAGMENT * NUM_FRAGMENTS -NUM_PARTITIONS = round(np.sqrt(NUM_ROWS)) +NUM_PARTITIONS = math.ceil(np.sqrt(NUM_ROWS)) SMALL_ROWS_PER_FRAGMENT = 100 From 49c70cd78699f69e4036df823d4435f6a729b2a3 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Mon, 28 Oct 2024 15:18:01 -0700 Subject: [PATCH 3/3] Only change num_partitions for CUDA test --- python/python/tests/test_indices.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python/python/tests/test_indices.py b/python/python/tests/test_indices.py index 2661e8d193..26ab6e9916 100644 --- a/python/python/tests/test_indices.py +++ b/python/python/tests/test_indices.py @@ -16,7 +16,7 @@ NUM_SUBVECTORS = 8 NUM_FRAGMENTS = 3 NUM_ROWS = NUM_ROWS_PER_FRAGMENT * NUM_FRAGMENTS -NUM_PARTITIONS = math.ceil(np.sqrt(NUM_ROWS)) +NUM_PARTITIONS = round(np.sqrt(NUM_ROWS)) SMALL_ROWS_PER_FRAGMENT = 100 @@ -94,7 +94,9 @@ def test_ivf_centroids_cuda(rand_dataset): ) assert ivf.distance_type == "l2" - assert len(ivf.centroids) == NUM_PARTITIONS + # Can't use NUM_PARTITIONS here because + # CUDA uses math.ceil and CPU uses round to calc. num_partitions + assert len(ivf.centroids) == math.ceil(np.sqrt(NUM_ROWS)) @pytest.mark.cuda