lancedb · wjones127 · Sep 8, 2023 · Sep 6, 2023 · Sep 6, 2023 · Sep 6, 2023
diff --git a/.gitignore b/.gitignore
@@ -53,6 +53,7 @@ python/lance/_*.cpp
 
 python/thirdparty/arrow/
 python/wheels
+python/benchmark_data
 
 logs
 *.ckpt

diff --git a/python/Cargo.toml b/python/Cargo.toml
@@ -31,3 +31,7 @@ serde_json = "1"
 
 [build-dependencies]
 prost-build = "0.11"
+
+[profile.release-with-debug]
+inherits = "release"
+debug = true
diff --git a/python/DEVELOPMENT.md b/python/DEVELOPMENT.md
@@ -56,3 +56,81 @@ Some lints can be fixed automatically:
 ```shell
 cargo clippy --all-features --fix
 ```
+
+## Benchmarks
+
+The benchmarks in `python/benchmarks` can be used to identify and diagnose 
+performance issues. They are run with [pytest-benchmark](https://pytest-benchmark.readthedocs.io/en/latest/).
+These benchmarks aren't meant to showcase performance on full-scale real-world
+datasets; rather they are meant to be useful for developers to iterate on
+performance improvements and to catch performance regressions. Therefore, any
+benchmarks added there should run in less than 5 seconds.
+
+Before running benchmarks, you should build pylance in release mode:
+
+```shell
+maturin develop --profile release-with-debug --extras benchmarks
+```
+
+(You can also use `--release` or `--profile release`, but `--profile release-with-debug`
+will provide debug symbols for profiling.)
+
+Then you can run the benchmarks with
+
+```shell
+pytest python/benchmarks
+```
+
+Note: the first time you run the benchmarks, they may take a while, since they
+will write out test datasets and build vector indices. Once these are built,
+they are re-used between benchmark runs.
+
+### Run a particular benchmark
+
+To filter benchmarks by name, use the usual pytest `-k` flag (this can be a 
+substring match, so you don't need to type the full name):
+
+```shell
+pytest python/benchmarks -k test_ivf_pq_index_search
+```
+
+### Profile a benchmark
+
+If you have [cargo-flamegraph](https://github.com/flamegraph-rs/flamegraph)
+installed, you can create a flamegraph of a benchmark by running:
+
+```shell
+flamegraph -F 100 --no-inline -- $(which python) \
+    -m pytest python/benchmarks \
+    --benchmark-min-time=2 \
+    -k test_ivf_pq_index_search
+```
+
+Note the parameter `--benchmark-min-time`: this controls how many seconds to run
+the benchmark in each round (default 5 rounds). The default is very low but you
+can increase this so that the profile gets more samples.
+
+You can drop the `--no-inline` to have the program try to identify which functions
+were inlined to get more detail, though this will make the processing take
+considerably longer.
+
+This will only work on Linux.
+
+Note that you'll want to run the benchmarks once prior to profiling, so that
+the setup is complete and not captured as part of profiling.
+
+### Compare benchmarks against previous version
+
+You can easily compare the performance of the current version against a previous
+version of pylance. Install the previous version, run the benchmarks, and save
+the output using `--benchmark-save`. Then install the current version and run
+the benchmarks again with `--benchmark-compare`.
+
+```shell
+pip uninstall -y pylance
+pip install pylance==0.4.18
+pytest --benchmark-save=baseline python/benchmarks
+COMPARE_ID=$(ls .benchmarks/*/ | tail -1 | cut -c1-4)
+maturin develop --profile release-with-debug
+pytest --benchmark-compare=$COMPARE_ID python/benchmarks
+```
diff --git a/python/pyproject.toml b/python/pyproject.toml
@@ -57,6 +57,9 @@ tests = [
     "ml_dtypes",
     "tensorflow",
 ]
+benchmarks = [
+    "pytest-benchmark",
+]
 
 [tool.isort]
 profile = "black"
diff --git a/python/python/benchmarks/conftest.py b/python/python/benchmarks/conftest.py
@@ -0,0 +1,24 @@
+#  Copyright (c) 2023. Lance Developers
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+from pathlib import Path
+
+import pytest
+
+
+@pytest.fixture(scope="session")
+def data_dir():
+    """Locate the directory used to cache benchmark datasets.
+
+    The directory is named ``benchmark_data`` and lives two directories
+    above the one containing this file. Test datasets stored there can be
+    reused between benchmark runs.
+    """
+    benchmarks_dir = Path(__file__).parent
+    cache_root = benchmarks_dir.parent.parent
+    return cache_root / "benchmark_data"
diff --git a/python/python/benchmarks/test_index.py b/python/python/benchmarks/test_index.py
@@ -0,0 +1,49 @@
+#  Copyright (c) 2023. Lance Developers
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+from pathlib import Path
+
+import lance
+import pyarrow as pa
+import pyarrow.compute as pc
+import pytest
+
+N_DIMS = 768
+
+
+@pytest.fixture(scope="module")
+def test_dataset(tmpdir_factory):
+    """Write a fresh 1,000-row dataset of random float32 vectors.
+
+    The dataset has a single ``vector`` column of fixed-size lists holding
+    ``N_DIMS`` values each, and is created once per module.
+    """
+    # The benchmarks write to this dataset, so there is no benefit in
+    # caching it in the data_dir; use a temporary directory instead.
+    dataset_path = Path(tmpdir_factory.mktemp("index_dataset"))
+    row_count = 1_000
+
+    flat_values = pc.random(row_count * N_DIMS).cast(pa.float32())
+    table = pa.table(
+        {"vector": pa.FixedSizeListArray.from_arrays(flat_values, N_DIMS)}
+    )
+
+    return lance.write_dataset(table, dataset_path)
+
+
+@pytest.mark.benchmark(group="create_index")
+def test_create_ivf_pq(test_dataset, benchmark):
+    """Benchmark ``create_index`` building an IVF_PQ index on ``vector``."""
+    index_params = {
+        "column": "vector",
+        "index_type": "IVF_PQ",
+        "metric_type": "L2",
+        "num_partitions": 8,
+        "num_sub_vectors": 2,
+        "num_bits": 8,
+    }
+    benchmark(test_dataset.create_index, **index_params)
diff --git a/python/python/benchmarks/test_scan.py b/python/python/benchmarks/test_scan.py
@@ -0,0 +1,86 @@
+#  Copyright (c) 2023. Lance Developers
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+import random
+from pathlib import Path
+
+import lance
+import pyarrow as pa
+import pyarrow.compute as pc
+import pytest
+
+NUM_ROWS = 1_000_000
+
+
+@pytest.mark.parametrize(
+    "array_factory",
+    [
+        pytest.param(
+            lambda: pa.array(range(NUM_ROWS), type=pa.int32()),
+            id="i32",
+        ),
+        pytest.param(
+            lambda: pc.random(NUM_ROWS),
+            id="f64",
+        ),
+        pytest.param(
+            lambda: pa.array(
+                [random.choice(["hello", "world", "today"]) for _ in range(NUM_ROWS)],
+                type=pa.string(),
+            ),
+            id="string",
+        ),
+        pytest.param(
+            lambda: pa.array(
+                [random.choice(["hello", "world", "today"]) for _ in range(NUM_ROWS)],
+                type=pa.dictionary(pa.int8(), pa.string()),
+            ),
+            id="dictionary",
+        ),
+        pytest.param(
+            lambda: pa.FixedSizeListArray.from_arrays(
+                pc.random(NUM_ROWS * 128).cast(pa.float32()), 128
+            ),
+            id="vector",
+        ),
+    ],
+)
+@pytest.mark.benchmark(group="scan_single_column")
+def test_scan_integer(tmp_path: Path, benchmark, array_factory):
+    """Benchmark a full scan of a single-column dataset.
+
+    Despite the name, this runs once per parametrized column type. The
+    array is built and written to ``tmp_path`` outside the timed call; only
+    ``dataset.to_table`` is benchmarked.
+    """
+    table = pa.table({"values": array_factory()})
+    dataset = lance.write_dataset(table, tmp_path)
+
+    scanned = benchmark(dataset.to_table)
+
+    assert scanned.num_rows == NUM_ROWS
+
+
+@pytest.mark.benchmark(group="scan_table")
+def test_scan_table(tmp_path: Path, benchmark):
+    """Benchmark a full scan of a five-column, ``NUM_ROWS``-row dataset.
+
+    The columns are int32, float32, string, fixed-size list (128 float32
+    values per row) and binary. The table is built and written to
+    ``tmp_path`` outside the timed call; only ``dataset.to_table`` is
+    benchmarked.
+    """
+    words = ["hello", "world", "today"]
+    blobs = [b"hello", b"world", b"today"]
+
+    # Columns are generated in the same order as they appear in the table.
+    int_column = pa.array(range(NUM_ROWS), type=pa.int32())
+    float_column = pc.random(NUM_ROWS).cast(pa.float32())
+    string_column = pa.array(
+        [random.choice(words) for _ in range(NUM_ROWS)], type=pa.string()
+    )
+    vector_column = pa.FixedSizeListArray.from_arrays(
+        pc.random(NUM_ROWS * 128).cast(pa.float32()), 128
+    )
+    blob_column = pa.array(
+        [random.choice(blobs) for _ in range(NUM_ROWS)], type=pa.binary()
+    )
+
+    table = pa.table(
+        {
+            "i": int_column,
+            "f": float_column,
+            "s": string_column,
+            "fsl": vector_column,
+            "blob": blob_column,
+        }
+    )
+    dataset = lance.write_dataset(table, tmp_path)
+
+    scanned = benchmark(dataset.to_table)
+
+    assert scanned.num_rows == NUM_ROWS
diff --git a/python/python/benchmarks/test_search.py b/python/python/benchmarks/test_search.py
@@ -0,0 +1,104 @@
+#  Copyright (c) 2023. Lance Developers
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+import shutil
+from pathlib import Path
+
+import lance
+import pyarrow as pa
+import pyarrow.compute as pc
+import pytest
+
+N_DIMS = 768
+
+
+@pytest.fixture(scope="module")
+def test_dataset(data_dir: Path) -> lance.LanceDataset:
+    """Return the indexed vector dataset used by the search benchmarks.
+
+    The dataset is cached at ``data_dir / "search_dataset"``. If a dataset
+    can be opened at that path it is returned as-is. Otherwise anything at
+    that path is removed, a 100,000-row dataset of random float32 vectors
+    (``N_DIMS`` values per row) is written, and an IVF_PQ index is created
+    on its ``vector`` column.
+    """
+    import warnings
+
+    tmp_path = data_dir / "search_dataset"
+    num_rows = 100_000
+
+    if tmp_path.exists():
+        try:
+            # NOTE(review): a cached dataset is reused without checking that
+            # its index exists. If index creation was interrupted on an
+            # earlier run, delete the directory to force a rebuild.
+            return lance.LanceDataset(tmp_path)
+        except Exception as err:
+            # Rebuilding is the intended fallback, but it is slow, so report
+            # why it is happening instead of swallowing the error silently.
+            warnings.warn(
+                f"Could not open cached benchmark dataset at {tmp_path} "
+                f"({err!r}); rebuilding it."
+            )
+        # clear any old data there
+        shutil.rmtree(tmp_path)
+
+    values = pc.random(num_rows * N_DIMS).cast(pa.float32())
+    vectors = pa.FixedSizeListArray.from_arrays(values, N_DIMS)
+    table = pa.table({"vector": vectors})
+
+    dataset = lance.write_dataset(table, tmp_path)
+
+    dataset.create_index(
+        column="vector",
+        index_type="IVF_PQ",
+        metric_type="L2",
+        num_partitions=32,
+        num_sub_vectors=16,
+        num_bits=8,
+    )
+
+    return dataset
+
+
+@pytest.mark.benchmark(group="query_ann")
+def test_knn_search(test_dataset, benchmark):
+    """Benchmark a k=100 nearest-neighbour query with ``use_index=False``.
+
+    The query is a random float32 vector of ``N_DIMS`` values.
+    """
+    query_vector = pc.random(N_DIMS).cast(pa.float32())
+    nearest = {
+        "column": "vector",
+        "q": query_vector,
+        "k": 100,
+        "nprobes": 10,
+        "use_index": False,
+    }
+
+    found = benchmark(test_dataset.to_table, nearest=nearest)
+
+    assert found.num_rows > 0
+
+
+@pytest.mark.benchmark(group="query_ann")
+def test_flat_index_search(test_dataset, benchmark):
+    """Benchmark a k=100 nearest-neighbour query with ``nprobes=10``.
+
+    The query is a random float32 vector of ``N_DIMS`` values. Neither
+    ``use_index`` nor ``refine_factor`` is passed.
+    """
+    query_vector = pc.random(N_DIMS).cast(pa.float32())
+    nearest = {
+        "column": "vector",
+        "q": query_vector,
+        "k": 100,
+        "nprobes": 10,
+    }
+
+    found = benchmark(test_dataset.to_table, nearest=nearest)
+
+    assert found.num_rows > 0
+
+
+@pytest.mark.benchmark(group="query_ann")
+def test_ivf_pq_index_search(test_dataset, benchmark):
+    """Benchmark a k=100 nearest-neighbour query with ``refine_factor=2``.
+
+    The query is a random float32 vector of ``N_DIMS`` values, searched
+    with ``nprobes=10``.
+    """
+    query_vector = pc.random(N_DIMS).cast(pa.float32())
+    nearest = {
+        "column": "vector",
+        "q": query_vector,
+        "k": 100,
+        "nprobes": 10,
+        "refine_factor": 2,
+    }
+
+    found = benchmark(test_dataset.to_table, nearest=nearest)
+
+    assert found.num_rows > 0
diff --git a/python/python/tests/conftest.py b/python/python/tests/conftest.py
@@ -1,3 +1,16 @@
+#  Copyright (c) 2023. Lance Developers
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
 import os
 import sys
-Original file line number
+Diff line change
@@ Expand Up / @@ -53,6 +53,7 @@ python/lance/_*.cpp @@
     python/thirdparty/arrow/
     python/wheels
+    python/benchmark_data
     logs
     *.ckpt
@@ Expand Down @@