Add: BLAS with zero-stride

ashvardanian committed Jan 19, 2025
1 parent 8d7f61d commit c39768e
Showing 6 changed files with 73 additions and 14 deletions.
21 changes: 12 additions & 9 deletions .vscode/settings.json
@@ -2,6 +2,8 @@
"cSpell.words": [
"ashvardanian",
"blas",
"blasint",
"cblas",
"CCCL",
"constexpr",
"cublas",
@@ -27,8 +29,13 @@
"*.ipp": "cpp",
"*.metal": "cpp",
"*.tcc": "cpp",
"__bit_reference": "cpp",
"__config": "cpp",
"__debug": "cpp",
"__hash_table": "cpp",
"__nullptr": "cpp",
"__split_buffer": "cpp",
"__tree": "cpp",
"algorithm": "cpp",
"any": "cpp",
"array": "cpp",
@@ -63,13 +70,16 @@
"deque": "cpp",
"exception": "cpp",
"expected": "cpp",
"filesystem": "cpp",
"format": "cpp",
"forward_list": "cpp",
"fstream": "cpp",
"functional": "cpp",
"future": "cpp",
"hash_map": "cpp",
"hash_set": "cpp",
"initializer_list": "cpp",
"inplace_vector": "cpp",
"iomanip": "cpp",
"iosfwd": "cpp",
"iostream": "cpp",
@@ -106,6 +116,7 @@
"string_view": "cpp",
"strstream": "cpp",
"system_error": "cpp",
"text_encoding": "cpp",
"thread": "cpp",
"tuple": "cpp",
"type_traits": "cpp",
@@ -116,14 +127,6 @@
"utility": "cpp",
"valarray": "cpp",
"variant": "cpp",
"vector": "cpp",
"inplace_vector": "cpp",
"format": "cpp",
"text_encoding": "cpp",
"__bit_reference": "cpp",
"__hash_table": "cpp",
"__split_buffer": "cpp",
"__tree": "cpp",
"filesystem": "cpp"
"vector": "cpp"
}
}
9 changes: 6 additions & 3 deletions CMakeLists.txt
@@ -1,6 +1,7 @@
# Let's use a recent CMake version:
# 3.16+ for native sanitizers support
# 3.17+ for `FindCUDAToolkit`
# 3.18+ for the `BLAS::BLAS` target
# 3.25.2 for CUDA20 support
# The good news is that Ubuntu 24.04 comes with 3.28!
cmake_minimum_required(VERSION 3.25.2 FATAL_ERROR)
@@ -59,6 +60,7 @@ if(ENABLE_CUDA)
set(USE_NVIDIA_CCCL ON)
set(USE_INTEL_TBB OFF) # Prioritize CUDA acceleration
elseif(CMAKE_SYSTEM_NAME STREQUAL "Linux")
set(USE_NVIDIA_CCCL OFF) # Can't compile CCCL w/o CUDA
set(USE_INTEL_TBB ON) # Default to TBB on Linux without CUDA
endif()

@@ -69,6 +71,7 @@ message(STATUS "USE_NVIDIA_CCCL: ${USE_NVIDIA_CCCL}")
# Dependencies
# ------------------------------------------------------------------------------
find_package(Threads REQUIRED)
find_package(BLAS REQUIRED)
find_package(OpenMP QUIET)
find_package(OpenCL QUIET)

@@ -144,14 +147,14 @@ set(CMAKE_CUDA_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O2")
set(CMAKE_GCC_FLAGS "${CMAKE_GCC_FLAGS} -march=native -fopenmp")

add_executable(reduce_bench reduce_bench.cpp)
-target_link_libraries(reduce_bench PRIVATE benchmark::benchmark fmt::fmt Threads::Threads)
+target_link_libraries(reduce_bench PRIVATE benchmark::benchmark fmt::fmt Threads::Threads BLAS::BLAS)

if(USE_INTEL_TBB)
-    target_link_libraries(reduce_bench tbb)
+    target_link_libraries(reduce_bench PRIVATE TBB::tbb)
endif()

if(USE_NVIDIA_CCCL)
-    target_link_libraries(reduce_bench CUDA::cudart CUDA::cublas)
+    target_link_libraries(reduce_bench PRIVATE CUDA::cudart CUDA::cublas)
endif()

if(OpenMP_FOUND)
1 change: 1 addition & 0 deletions README.md
@@ -30,6 +30,7 @@ Examples were collected from early 2010s until 2019 and later updated in 2022.
Install dependencies:

```sh
sudo apt install libblas-dev # For the reference BLAS on Linux (or libopenblas-dev for OpenBLAS)
sudo apt install libnuma1 libnuma-dev # For NUMA allocators on Linux
```

5 changes: 5 additions & 0 deletions reduce_bench.cpp
@@ -39,10 +39,12 @@
/**
* Platform-specific includes:
* - CPU kernels with AVX2, AVX-512, and OpenMP acceleration
* - BLAS kernels linking to `cblas_sdot`
* - CUDA kernels with CUB, Thrust, and manual implementations
* - OpenCL kernels with manual implementations
* - Dysfunctional Metal kernels for Apple devices
*/
#include "reduce_blas.hpp"
#include "reduce_cpu.hpp"

#if defined(__OPENCL__)
@@ -325,6 +327,9 @@ int main(int argc, char **argv) {
register_("std::accumulate/f64", stl_accumulate_gt<double> {}, dataset);
register_("serial/f32/openmp", openmp_t {}, dataset);

//! BLAS struggles with zero-strided arguments!
//! register_("blas/f32", blas_dot_t {}, dataset);

#if defined(__cpp_lib_execution)
register_("std::reduce<par>/f32", stl_par_reduce_gt<float> {}, dataset);
register_("std::reduce<par>/f64", stl_par_reduce_gt<double> {}, dataset);
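For context on the registration disabled above: a hedged workaround, sketched here rather than taken from the commit, is to dot the input against a genuine unit-stride vector of ones instead of the zero-stride scalar trick. The `blas_dot_ones_t` name and its owning buffer are hypothetical; the `cblas_sdot` call and `blasint` type match the ones `reduce_blas.hpp` below relies on:

```cpp
#include <cstddef> // `std::size_t`
#include <vector>  // `std::vector`

#include <cblas.h> // `cblas_sdot`, `blasint` (as defined by OpenBLAS)

// Hypothetical alternative, not part of this commit: trade O(n) extra
// memory for a unit-stride "ones" operand that any BLAS handles reliably.
class blas_dot_ones_t {
    float const *begin_ = nullptr;
    float const *end_ = nullptr;
    std::vector<float> ones_; // materialized, unlike the zero-stride trick

  public:
    blas_dot_ones_t(float const *b, float const *e)
        : begin_(b), end_(e), ones_(static_cast<std::size_t>(e - b), 1.0f) {}

    float operator()() const noexcept {
        blasint n = static_cast<blasint>(end_ - begin_);
        return cblas_sdot(n, begin_, 1, ones_.data(), 1);
    }
};
```

The extra O(n) allocation defeats the point of a pure reduction benchmark, which is presumably why the registration stays commented out instead.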
39 changes: 39 additions & 0 deletions reduce_blas.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
/**
 * @date 19/01/2025
 * @file reduce_blas.hpp
 * @brief BLAS-based reductions
 * @author Ash Vardanian
 */
#pragma once
#include <cblas.h> // `cblas_sdot`
#include <limits> // `std::numeric_limits`
#include <stdexcept> // `std::length_error`

namespace ashvardanian::reduce {

/**
 * @brief Using the BLAS dot-product interface to accumulate a vector.
 *
 * BLAS interfaces have a convenient "stride" parameter that can be used to
 * apply the kernel to various data layouts. In particular, if we set the stride
 * to @b zero, we can fool the kernel into thinking that a scalar is a vector.
 */
class blas_dot_t {
    float const *const begin_ = nullptr;
    float const *const end_ = nullptr;

  public:
    blas_dot_t() = default;
    blas_dot_t(float const *b, float const *e) : begin_(b), end_(e) {
        constexpr std::size_t max_length = static_cast<std::size_t>(std::numeric_limits<blasint>::max());
        if (static_cast<std::size_t>(end_ - begin_) > max_length)
            throw std::length_error("BLAS not configured for 64-bit sizes");
    }

    float operator()() const noexcept {
        float repeated_ones[1] = {1.0f};
        return cblas_sdot(static_cast<blasint>(end_ - begin_), begin_, 1, &repeated_ones[0], 0);
    }
};

} // namespace ashvardanian::reduce
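A minimal usage sketch for the class above — assuming a CBLAS implementation is linked as in the CMake changes; the buffer size and values are illustrative only:

```cpp
#include <cassert> // `assert`
#include <vector>  // `std::vector`

#include "reduce_blas.hpp"

int main() {
    using ashvardanian::reduce::blas_dot_t;
    std::vector<float> data(1024, 0.5f); // 1024 halves sum to 512
    blas_dot_t reduce {data.data(), data.data() + data.size()};
    float sum = reduce(); // `cblas_sdot` against a zero-stride 1.0f
    assert(sum == 512.0f); // exactly representable in `float`
    return 0;
}
```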
12 changes: 10 additions & 2 deletions reduce_cublas.cuh
Expand Up @@ -18,6 +18,16 @@ using namespace nvcuda;

namespace ashvardanian::reduce {

/**
 * @brief Using the cuBLAS dot-product interface to accumulate a vector.
 * @see https://docs.nvidia.com/cuda/cublas/#cublas-t-dot
 *
 * BLAS interfaces have a convenient "stride" parameter that can be used to
 * apply the kernel to various data layouts. In particular, if we set the stride
 * to @b zero, we can fool the kernel into thinking that a scalar is a vector.
 */
struct cuda_blas_dot_t {};

struct cuda_blas_gemm_t {

// We view this input array as a wide 2D matrix.
@@ -64,8 +74,6 @@ struct cuda_blas_gemm_t {
}
};

-struct cuda_blas_dot_t {};

__global__ void cu_reduce_tensors( //
float const *inputs, unsigned int input_size, float *sums_per_row, unsigned int columns) {

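The `cuda_blas_dot_t` placeholder above ships empty. One possible shape for it, sketched under the assumption that the caller manages a `cublasHandle_t` and device buffers, and using the documented `cublasSdot` signature; whether cuBLAS tolerates the zero-stride operand is precisely what the benchmark would reveal (error handling omitted for brevity):

```cpp
#include <cublas_v2.h> // `cublasSdot`, `cublasHandle_t`

// Hypothetical sketch, not part of this commit: reduce a device vector by
// dotting it against a single device-resident 1.0f read with zero stride.
struct cuda_blas_dot_sketch_t {
    cublasHandle_t handle = nullptr;     // created and destroyed by the caller
    float const *device_begin = nullptr; // `count` floats in device memory
    float const *device_one = nullptr;   // one 1.0f in device memory
    int count = 0;

    float operator()() const {
        float result = 0.0f; // default pointer mode returns the result on the host
        cublasSdot(handle, count, device_begin, 1, device_one, 0, &result);
        return result;
    }
};
```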
