Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: verify num_sub_vectors is valid before creating index #3056

Merged
merged 3 commits into from
Oct 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 14 additions & 6 deletions python/python/lance/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -1696,17 +1696,25 @@ def create_index(
if c not in self.schema.names:
raise KeyError(f"{c} not found in schema")
field = self.schema.field(c)
if not (
pa.types.is_fixed_size_list(field.type)
or (
isinstance(field.type, pa.FixedShapeTensorType)
and len(field.type.shape) == 1
)
if pa.types.is_fixed_size_list(field.type):
dimension = field.type.list_size
elif (
isinstance(field.type, pa.FixedShapeTensorType)
and len(field.type.shape) == 1
):
dimension = field.type.shape[0]
else:
raise TypeError(
f"Vector column {c} must be FixedSizeListArray "
f"1-dimensional FixedShapeTensorArray, got {field.type}"
)

if num_sub_vectors is not None and dimension % num_sub_vectors != 0:
raise ValueError(
f"dimension ({dimension}) must be divisible by num_sub_vectors"
f" ({num_sub_vectors})"
)

if not pa.types.is_floating(field.type.value_type):
raise TypeError(
f"Vector column {c} must have floating value type, "
Expand Down
4 changes: 2 additions & 2 deletions python/python/lance/indices.py
Original file line number Diff line number Diff line change
Expand Up @@ -545,8 +545,8 @@ def _normalize_pq_params(self, num_subvectors: int, dimension: int):
)
if dimension % num_subvectors != 0:
raise ValueError(
"dimension ({dimension}) must be divisible by num_subvectors"
" ({num_subvectors}) without remainder"
f"dimension ({dimension}) must be divisible by num_subvectors"
f" ({num_subvectors}) without remainder"
)
return num_subvectors

Expand Down
15 changes: 14 additions & 1 deletion python/python/tests/test_indices.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright The Lance Authors
import math
import os
import pathlib

Expand Down Expand Up @@ -93,7 +94,9 @@ def test_ivf_centroids_cuda(rand_dataset):
)

assert ivf.distance_type == "l2"
assert len(ivf.centroids) == NUM_PARTITIONS
# Can't use NUM_PARTITIONS here because
# CUDA uses math.ceil and CPU uses round to calc. num_partitions
assert len(ivf.centroids) == math.ceil(np.sqrt(NUM_ROWS))


@pytest.mark.cuda
Expand Down Expand Up @@ -156,6 +159,16 @@ def test_gen_pq(tmpdir, rand_dataset, rand_ivf):
assert pq.codebook == reloaded.codebook


def test_pq_invalid_sub_vectors(tmpdir, rand_dataset, rand_ivf):
with pytest.raises(
ValueError,
match="must be divisible by num_subvectors .* without remainder",
):
IndicesBuilder(rand_dataset, "vectors").train_pq(
rand_ivf, sample_rate=2, num_subvectors=5
)


def test_gen_pq_mostly_null(mostly_null_dataset):
centroids = np.random.rand(DIMENSION * 100).astype(np.float32)
centroids = pa.FixedSizeListArray.from_arrays(centroids, DIMENSION)
Expand Down
29 changes: 29 additions & 0 deletions python/python/tests/test_vector_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,35 @@ def func(rs: pa.Table):
run(dataset, q=np.array(q), assert_func=func)


def test_invalid_subvectors(tmp_path):
tbl = create_table()
dataset = lance.write_dataset(tbl, tmp_path)
with pytest.raises(
ValueError,
match="dimension .* must be divisible by num_sub_vectors",
):
dataset.create_index(
"vector", index_type="IVF_PQ", num_partitions=4, num_sub_vectors=15
)


@pytest.mark.cuda
def test_invalid_subvectors_cuda(tmp_path):
tbl = create_table()
dataset = lance.write_dataset(tbl, tmp_path)
with pytest.raises(
ValueError,
match="dimension .* must be divisible by num_sub_vectors",
):
dataset.create_index(
"vector",
index_type="IVF_PQ",
num_partitions=4,
num_sub_vectors=15,
accelerator="cuda",
)


def test_index_with_nans(tmp_path):
# 1024 rows, the entire table should be sampled
tbl = create_table(nvec=1000, nans=24)
Expand Down