Add performance regression tests (facebookresearch#3793)

Summary: Add a `CMakeLists.txt` to compile the `faiss/perf_tests` benchmarks. We will run the Google benchmarks as part of CI so people can see benchmarking results (there is no diff-to-diff regression detection in the open-source CI). ==== Test Plan ===== See logs in CI that look like ``` Run on (4 X 3184.9 MHz CPU s) CPU Caches: L1 Data 32 KiB (x2) L1 Instruction 32 KiB (x2) L2 Unified 512 KiB (x2) L3 Unified 32768 KiB (x1) Load Average: 2.69, 2.84, 1.56 ---------------------------------------------------------------------------------------------- Benchmark Time CPU Iterations UserCounters... ---------------------------------------------------------------------------------------------- QT_4bit/iterations:20 53646755 ns 53643729 ns 20 code_size=1k QT_4bit_uniform/iterations:20 52248603 ns 52246874 ns 20 code_size=1k QT_6bit/iterations:20 63697930 ns 63693459 ns 20 code_size=1.5k QT_8bit/iterations:20 43305175 ns 43303946 ns 20 code_size=2k QT_8bit_direct/iterations:20 30771920 ns 30770261 ns 20 code_size=2k QT_8bit_direct_signed/iterations:20 30744625 ns 30742891 ns 20 code_size=2k QT_8bit_uniform/iterations:20 44227773 ns 44224242 ns 20 code_size=2k QT_bf16/iterations:20 32758794 ns 32758717 ns 20 code_size=4k QT_fp16/iterations:20 41068848 ns 41066492 ns 20 code_size=4k 2024-09-20T23:15:01+00:00 Running ./build/perf_tests/bench_scalar_quantizer_decode Run on (4 X 3244.56 MHz CPU s) CPU Caches: L1 Data 32 KiB (x2) L1 Instruction 32 KiB (x2) L2 Unified 512 KiB (x2) L3 Unified 32768 KiB (x1) Load Average: 2.43, 2.78, 1.56 ---------------------------------------------------------------------------------------------- Benchmark Time CPU Iterations UserCounters... 
---------------------------------------------------------------------------------------------- QT_4bit/iterations:20 338300 ns 338284 ns 20 code_size=64 QT_4bit_uniform/iterations:20 332928 ns 332914 ns 20 code_size=64 QT_6bit/iterations:20 415683 ns 415674 ns 20 code_size=96 QT_8bit/iterations:20 266034 ns 266026 ns 20 code_size=128 QT_8bit_direct/iterations:20 37552 ns 37553 ns 20 code_size=128 QT_8bit_direct_signed/iterations:20 39701 ns 39696 ns 20 code_size=128 QT_8bit_uniform/iterations:20 261535 ns 261529 ns 20 code_size=128 QT_bf16/iterations:20 45518 ns 45506 ns 20 code_size=256 QT_fp16/iterations:20 334602 ns 334584 ns 20 code_size=256 2024-09-20T23:15:02+00:00 Running ./build/perf_tests/bench_no_multithreading_rcq_search Run on (4 X 3243.03 MHz CPU s) CPU Caches: L1 Data 32 KiB (x2) L1 Instruction 32 KiB (x2) L2 Unified 512 KiB (x2) L3 Unified 32768 KiB (x1) Load Average: 2.43, 2.78, 1.56 WARNING clustering 65536 points to 65536 centroids: please provide at least 2555904 training points WARNING clustering 65536 points to 65536 centroids: please provide at least 2555904 training points WARNING clustering 65536 points to 65536 centroids: please provide at least 2555904 training points WARNING clustering 65536 points to 65536 centroids: please provide at least 2555904 training points WARNING clustering 65536 points to 65536 centroids: please provide at least 2555904 training points WARNING clustering 65536 points to 65536 centroids: please provide at least 2555904 training points WARNING clustering 65536 points to 65536 centroids: please provide at least 2555904 training points WARNING clustering 65536 points to 65536 centroids: please provide at least 2555904 training points WARNING clustering 65536 points to 65536 centroids: please provide at least 2555904 training points WARNING clustering 65536 points to 65536 centroids: please provide at least 
2555904 training points --------------------------------------------------------------- Benchmark Time CPU Iterations --------------------------------------------------------------- search/iterations:20 12763792 ns 10367188 ns 20 2024-09-20T23:15:51+00:00 Running ./build/perf_tests/bench_scalar_quantizer_accuracy Run on (4 X 3231.04 MHz CPU s) CPU Caches: L1 Data 32 KiB (x2) L1 Instruction 32 KiB (x2) L2 Unified 512 KiB (x2) L3 Unified 32768 KiB (x1) Load Average: 2.85, 2.84, 1.65 ---------------------------------------------------------------------------------------------- Benchmark Time CPU Iterations UserCounters... ---------------------------------------------------------------------------------------------- QT_4bit/iterations:20 0.000 ns 0.000 ns 0 code_size=64 code_size_two=128k ndiff_for_idempotence=0 sql2_recons_error=0.047396 QT_4bit_uniform/iterations:20 0.000 ns 0.000 ns 0 code_size=64 code_size_two=128k ndiff_for_idempotence=0 sql2_recons_error=0.0473931 QT_6bit/iterations:20 0.000 ns 0.000 ns 0 code_size=96 code_size_two=192k ndiff_for_idempotence=0 sql2_recons_error=2.6899m QT_8bit/iterations:20 0.000 ns 0.000 ns 0 code_size=128 code_size_two=256k ndiff_for_idempotence=0 sql2_recons_error=164.317u QT_8bit_direct/iterations:20 0.000 ns 0.000 ns 0 code_size=128 code_size_two=256k ndiff_for_idempotence=0 sql2_recons_error=42.5514 QT_8bit_direct_signed/iterations:20 0.000 ns 0.000 ns 0 code_size=128 code_size_two=256k ndiff_for_idempotence=0 sql2_recons_error=42.5494 QT_8bit_uniform/iterations:20 0.000 ns 0.000 ns 0 code_size=128 code_size_two=256k ndiff_for_idempotence=0 sql2_recons_error=164.152u QT_bf16/iterations:20 0.000 ns 0.000 ns 0 code_size=256 code_size_two=512k ndiff_for_idempotence=0 sql2_recons_error=92.8328u QT_fp16/iterations:20 0.000 ns 0.000 ns 0 code_size=256 code_size_two=512k ndiff_for_idempotence=0 sql2_recons_error=1.44838u 2024-09-20T23:15:51+00:00 Running ./build/perf_tests/bench_scalar_quantizer_encode Run on (4 X 3243.72 MHz CPU 
s) CPU Caches: L1 Data 32 KiB (x2) L1 Instruction 32 KiB (x2) L2 Unified 512 KiB (x2) L3 Unified 32768 KiB (x1) Load Average: 2.85, 2.84, 1.65 ---------------------------------------------------------------------------------------------- Benchmark Time CPU Iterations UserCounters... ---------------------------------------------------------------------------------------------- QT_4bit/iterations:20 702046 ns 701319 ns 20 code_size=64 QT_4bit_uniform/iterations:20 595889 ns 595880 ns 20 code_size=64 QT_6bit/iterations:20 1287503 ns 1287542 ns 20 code_size=96 QT_8bit/iterations:20 511811 ns 511804 ns 20 code_size=128 QT_8bit_direct/iterations:20 152977 ns 152970 ns 20 code_size=128 QT_8bit_direct_signed/iterations:20 185578 ns 185572 ns 20 code_size=128 QT_8bit_uniform/iterations:20 454412 ns 454408 ns 20 code_size=128 QT_bf16/iterations:20 51331 ns 51324 ns 20 code_size=256 QT_fp16/iterations:20 390658 ns 390649 ns 20 code_size=256 ``` Pull Request resolved: facebookresearch#3793 Reviewed By: junjieqi Differential Revision: D63147599 Pulled By: mengdilin fbshipit-source-id: 03165b5acb3b0647a69f7db144ab76efda2fee11
blevesearch · Sep 23, 2024 · 149c1f4 · 149c1f4
1 parent 0660b23
commit 149c1f4
Show file tree

Hide file tree

Showing 3 changed files with 106 additions and 2 deletions.
diff --git a/.github/actions/build_cmake/action.yml b/.github/actions/build_cmake/action.yml
@@ -32,7 +32,7 @@ runs:
         conda update -y -q conda
         echo "$CONDA/bin" >> $GITHUB_PATH
 
-        conda install -y -q python=3.11 cmake make swig numpy scipy pytest
+        conda install -y -q python=3.11 cmake make swig numpy scipy pytest gflags
 
         # install base packages for ARM64
         if [ "${{ runner.arch }}" = "ARM64" ]; then
@@ -143,6 +143,11 @@ runs:
       run: |
         export GTEST_OUTPUT="xml:$(realpath .)/test-results/googletest/"
         make -C build test
+    - name: C++ perf benchmarks
+      shell: bash  # runs as `bash -eo pipefail`, so a failing benchmark fails this step
+      if: inputs.rocm == 'OFF'
+      run: |
+        for bench in ./build/perf_tests/bench*; do if [ -f "$bench" ] && [ -x "$bench" ]; then "$bench" -v; fi; done
     - name: Install Python extension
       shell: bash
       working-directory: build/faiss/python

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -105,11 +105,12 @@ add_subdirectory(demos)
 add_subdirectory(benchs)
 add_subdirectory(tutorial/cpp)
 
+
 # CTest must be included in the top level to enable `make test` target.
 include(CTest)
 if(BUILD_TESTING)
   add_subdirectory(tests)
-
+  add_subdirectory(perf_tests)
   if(FAISS_ENABLE_GPU)
     if(FAISS_ENABLE_ROCM)
       add_subdirectory(faiss/gpu-rocm/test)

diff --git a/perf_tests/CMakeLists.txt b/perf_tests/CMakeLists.txt
@@ -0,0 +1,98 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# @lint-ignore-every LINEWRAP
+project(faiss_perf_tests)
+# Build only the Google Benchmark library, not its own test suite.
+set(BENCHMARK_ENABLE_TESTING OFF)
+
+# Pin a release tag so builds are reproducible; `main` moves between CI runs.
+include(FetchContent)
+FetchContent_Declare(googlebenchmark
+        GIT_REPOSITORY https://github.com/google/benchmark.git
+        GIT_TAG v1.9.0) # release tags also export benchmark::benchmark
+FetchContent_MakeAvailable(googlebenchmark)
+
+find_package(Threads REQUIRED)
+find_package(OpenMP REQUIRED)
+find_package(gflags REQUIRED)
+
+# Static/shared helper library built from utils.cpp.
+add_library(faiss_perf_tests_utils utils.cpp)
+# Two levels above this project's directory, so that headers resolve as
+# `#include <faiss/perf_tests/utils.h>` (or any other faiss header).
+target_include_directories(faiss_perf_tests_utils
+  PRIVATE ${PROJECT_SOURCE_DIR}/../..)
+
+# link_to_faiss_lib(<target>)
+#
+# Links <target> (PRIVATE) against the faiss library that matches
+# FAISS_OPT_LEVEL: faiss_avx2, faiss_avx512 or faiss_sve, and plain faiss for
+# any other value. The optimized levels also get the compiler flags that
+# enable the corresponding instruction set on <target> itself.
+#
+# For "sve", flags are added for the Debug and Release configurations only,
+# and none are added on Windows.
+function(link_to_faiss_lib target)
+  if(FAISS_OPT_LEVEL STREQUAL "avx2")
+    if(NOT WIN32)
+      target_compile_options(${target} PRIVATE $<$<COMPILE_LANGUAGE:CXX>:-mavx2 -mfma>)
+    else()
+      target_compile_options(${target} PRIVATE $<$<COMPILE_LANGUAGE:CXX>:/arch:AVX2>)
+    endif()
+    target_link_libraries(${target} PRIVATE faiss_avx2)
+  elseif(FAISS_OPT_LEVEL STREQUAL "avx512")
+    if(NOT WIN32)
+      target_compile_options(${target} PRIVATE $<$<COMPILE_LANGUAGE:CXX>:-mavx2 -mfma -mavx512f -mavx512cd -mavx512vl -mavx512dq -mavx512bw>)
+    else()
+      target_compile_options(${target} PRIVATE $<$<COMPILE_LANGUAGE:CXX>:/arch:AVX512>)
+    endif()
+    target_link_libraries(${target} PRIVATE faiss_avx512)
+  elseif(FAISS_OPT_LEVEL STREQUAL "sve")
+    if(NOT WIN32)
+      # CMAKE_CXX_FLAGS plus the per-configuration flags are inspected for an
+      # existing -march option, separately for Debug and Release builds.
+      foreach(config DEBUG RELEASE)
+        set(cxx_flags "${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_${config}} ")
+        if("${cxx_flags}" MATCHES "(^| )-march=native")
+          # Do nothing, expect SVE to be enabled by -march=native
+        elseif("${cxx_flags}" MATCHES "(^| )(-march=armv[0-9]+(\\.[1-9]+)?-[^+ ](\\+[^+$ ]+)*)")
+          # Add +sve
+          target_compile_options(${target} PRIVATE $<$<AND:$<COMPILE_LANGUAGE:CXX>,$<CONFIG:${config}>>:${CMAKE_MATCH_2}+sve>)
+        elseif(NOT "${cxx_flags}" MATCHES "(^| )-march=armv")
+          # No valid -march, so specify -march=armv8-a+sve as the default
+          target_compile_options(${target} PRIVATE $<$<AND:$<COMPILE_LANGUAGE:CXX>,$<CONFIG:${config}>>:-march=armv8-a+sve>)
+        endif()
+      endforeach()
+    else()
+      # TODO: support Windows
+    endif()
+    target_link_libraries(${target} PRIVATE faiss_sve)
+  else()
+    # Any other FAISS_OPT_LEVEL: link the generic build, no extra flags.
+    target_link_libraries(${target} PRIVATE faiss)
+  endif()
+endfunction()
+
+link_to_faiss_lib(faiss_perf_tests_utils)
+
+# One benchmark executable per source file, named after the file.
+set(FAISS_PERF_TEST_SRC
+  bench_no_multithreading_rcq_search.cpp
+  bench_scalar_quantizer_accuracy.cpp
+  bench_scalar_quantizer_decode.cpp
+  bench_scalar_quantizer_distance.cpp
+  bench_scalar_quantizer_encode.cpp
+)
+foreach(bench_src IN LISTS FAISS_PERF_TEST_SRC)
+  get_filename_component(bench_target ${bench_src} NAME_WE)
+  add_executable(${bench_target} ${bench_src})
+  link_to_faiss_lib(${bench_target})
+  target_link_libraries(${bench_target} PRIVATE
+    faiss_perf_tests_utils OpenMP::OpenMP_CXX benchmark::benchmark gflags)
+  # `#include <faiss/perf_tests/utils.h>` or any other headers
+  target_include_directories(${bench_target} PRIVATE ${PROJECT_SOURCE_DIR}/../..)
+endforeach()