Add: BLAS with zero-stride

ashvardanian committed Jan 19, 2025
1 parent 8d7f61d commit c39768e
Showing 6 changed files with 73 additions and 14 deletions.
21 changes: 12 additions & 9 deletions .vscode/settings.json
@@ -2,6 +2,8 @@
"cSpell.words": [
"ashvardanian",
"blas",
"blasint",
"cblas",
"CCCL",
"constexpr",
"cublas",
@@ -27,8 +29,13 @@
"*.ipp": "cpp",
"*.metal": "cpp",
"*.tcc": "cpp",
"__bit_reference": "cpp",
"__config": "cpp",
"__debug": "cpp",
"__hash_table": "cpp",
"__nullptr": "cpp",
"__split_buffer": "cpp",
"__tree": "cpp",
"algorithm": "cpp",
"any": "cpp",
"array": "cpp",
@@ -63,13 +70,16 @@
"deque": "cpp",
"exception": "cpp",
"expected": "cpp",
"filesystem": "cpp",
"format": "cpp",
"forward_list": "cpp",
"fstream": "cpp",
"functional": "cpp",
"future": "cpp",
"hash_map": "cpp",
"hash_set": "cpp",
"initializer_list": "cpp",
"inplace_vector": "cpp",
"iomanip": "cpp",
"iosfwd": "cpp",
"iostream": "cpp",
@@ -106,6 +116,7 @@
"string_view": "cpp",
"strstream": "cpp",
"system_error": "cpp",
"text_encoding": "cpp",
"thread": "cpp",
"tuple": "cpp",
"type_traits": "cpp",
@@ -116,14 +127,6 @@
"utility": "cpp",
"valarray": "cpp",
"variant": "cpp",
"vector": "cpp",
"inplace_vector": "cpp",
"format": "cpp",
"text_encoding": "cpp",
"__bit_reference": "cpp",
"__hash_table": "cpp",
"__split_buffer": "cpp",
"__tree": "cpp",
"filesystem": "cpp"
"vector": "cpp"
}
}
9 changes: 6 additions & 3 deletions CMakeLists.txt
@@ -1,6 +1,7 @@
# Let's use a recent CMake version:
# 3.16+ for native sanitizers support
# 3.17+ for `FindCUDAToolkit`
# 3.18+ for the `BLAS::BLAS` target
# 3.25.2 for CUDA20 support
# The good news is that Ubuntu 24.04 comes with 3.28!
cmake_minimum_required(VERSION 3.25.2 FATAL_ERROR)
@@ -59,6 +60,7 @@ if(ENABLE_CUDA)
set(USE_NVIDIA_CCCL ON)
set(USE_INTEL_TBB OFF) # Prioritize CUDA acceleration
elseif(CMAKE_SYSTEM_NAME STREQUAL "Linux")
set(USE_NVIDIA_CCCL OFF) # Can't compile CCCL w/o CUDA
set(USE_INTEL_TBB ON) # Default to TBB on Linux without CUDA
endif()

@@ -69,6 +71,7 @@ message(STATUS "USE_NVIDIA_CCCL: ${USE_NVIDIA_CCCL}")
# Dependencies
# ------------------------------------------------------------------------------
find_package(Threads REQUIRED)
find_package(BLAS REQUIRED)
find_package(OpenMP QUIET)
find_package(OpenCL QUIET)

@@ -144,14 +147,14 @@ set(CMAKE_CUDA_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O2")
set(CMAKE_GCC_FLAGS "${CMAKE_GCC_FLAGS} -march=native -fopenmp")

add_executable(reduce_bench reduce_bench.cpp)
-target_link_libraries(reduce_bench PRIVATE benchmark::benchmark fmt::fmt Threads::Threads)
+target_link_libraries(reduce_bench PRIVATE benchmark::benchmark fmt::fmt Threads::Threads BLAS::BLAS)

if(USE_INTEL_TBB)
-    target_link_libraries(reduce_bench tbb)
+    target_link_libraries(reduce_bench PRIVATE TBB::tbb)
endif()

if(USE_NVIDIA_CCCL)
-    target_link_libraries(reduce_bench CUDA::cudart CUDA::cublas)
+    target_link_libraries(reduce_bench PRIVATE CUDA::cudart CUDA::cublas)
endif()

if(OpenMP_FOUND)
1 change: 1 addition & 0 deletions README.md
@@ -30,6 +30,7 @@ Examples were collected from early 2010s until 2019 and later updated in 2022.
Install dependencies:

```sh
sudo apt install libblas-dev # For the reference BLAS on Linux (or libopenblas-dev for OpenBLAS)
sudo apt install libnuma1 libnuma-dev # For NUMA allocators on Linux
```

5 changes: 5 additions & 0 deletions reduce_bench.cpp
@@ -39,10 +39,12 @@
/**
* Platform-specific includes:
* - CPU kernels with AVX2, AVX-512, and OpenMP acceleration
* - BLAS kernels linking to `cblas_sdot`
* - CUDA kernels with CUB, Thrust, and manual implementations
* - OpenCL kernels with manual implementations
* - Dysfunctional Metal kernels for Apple devices
*/
#include "reduce_blas.hpp"
#include "reduce_cpu.hpp"

#if defined(__OPENCL__)
@@ -325,6 +327,9 @@ int main(int argc, char **argv) {
register_("std::accumulate/f64", stl_accumulate_gt<double> {}, dataset);
register_("serial/f32/openmp", openmp_t {}, dataset);

//! BLAS struggles with zero-strided arguments!
//! register_("blas/f32", blas_dot_t {}, dataset);

#if defined(__cpp_lib_execution)
register_("std::reduce<par>/f32", stl_par_reduce_gt<float> {}, dataset);
register_("std::reduce<par>/f64", stl_par_reduce_gt<double> {}, dataset);
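For context on the registration disabled above: a hedged workaround, sketched here rather than taken from the commit, is to dot the input against a genuine unit-stride vector of ones instead of the zero-stride scalar trick. The `blas_dot_ones_t` name and its owning buffer are hypothetical; the `cblas_sdot` call and `blasint` type match the ones `reduce_blas.hpp` below relies on:

```cpp
#include <cstddef> // `std::size_t`
#include <vector>  // `std::vector`

#include <cblas.h> // `cblas_sdot`, `blasint` (as defined by OpenBLAS)

// Hypothetical alternative, not part of this commit: trade O(n) extra
// memory for a unit-stride "ones" operand that any BLAS handles reliably.
class blas_dot_ones_t {
    float const *begin_ = nullptr;
    float const *end_ = nullptr;
    std::vector<float> ones_; // materialized, unlike the zero-stride trick

  public:
    blas_dot_ones_t(float const *b, float const *e)
        : begin_(b), end_(e), ones_(static_cast<std::size_t>(e - b), 1.0f) {}

    float operator()() const noexcept {
        blasint n = static_cast<blasint>(end_ - begin_);
        return cblas_sdot(n, begin_, 1, ones_.data(), 1);
    }
};
```

The extra O(n) allocation defeats the point of a pure reduction benchmark, which is presumably why the registration stays commented out instead.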
39 changes: 39 additions & 0 deletions reduce_blas.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
/**
 * @date 19/01/2025
 * @file reduce_blas.hpp
 * @brief BLAS-based reductions
 * @author Ash Vardanian
 */
#pragma once
#include <cblas.h> // `cblas_sdot`
#include <limits> // `std::numeric_limits`
#include <stdexcept> // `std::length_error`

namespace ashvardanian::reduce {

/**
 * @brief Using the BLAS dot-product interface to accumulate a vector.
 *
 * BLAS interfaces have a convenient "stride" parameter that can be used to
 * apply the kernel to various data layouts. In particular, if we set the stride
 * to @b zero, we can fool the kernel into thinking that a scalar is a vector.
 */
class blas_dot_t {
    float const *const begin_ = nullptr;
    float const *const end_ = nullptr;

  public:
    blas_dot_t() = default;
    blas_dot_t(float const *b, float const *e) : begin_(b), end_(e) {
        constexpr std::size_t max_length = static_cast<std::size_t>(std::numeric_limits<blasint>::max());
        if (static_cast<std::size_t>(end_ - begin_) > max_length)
            throw std::length_error("BLAS not configured for 64-bit sizes");
    }

    float operator()() const noexcept {
        float repeated_ones[1] = {1.0f};
        return cblas_sdot(static_cast<blasint>(end_ - begin_), begin_, 1, &repeated_ones[0], 0);
    }
};

} // namespace ashvardanian::reduce
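A minimal usage sketch for the class above — assuming a CBLAS implementation is linked as in the CMake changes; the buffer size and values are illustrative only:

```cpp
#include <cassert> // `assert`
#include <vector>  // `std::vector`

#include "reduce_blas.hpp"

int main() {
    using ashvardanian::reduce::blas_dot_t;
    std::vector<float> data(1024, 0.5f); // 1024 halves sum to 512
    blas_dot_t reduce {data.data(), data.data() + data.size()};
    float sum = reduce(); // `cblas_sdot` against a zero-stride 1.0f
    assert(sum == 512.0f); // exactly representable in `float`
    return 0;
}
```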
12 changes: 10 additions & 2 deletions reduce_cublas.cuh
Expand Up @@ -18,6 +18,16 @@ using namespace nvcuda;

namespace ashvardanian::reduce {

/**
 * @brief Using the cuBLAS dot-product interface to accumulate a vector.
 * @see https://docs.nvidia.com/cuda/cublas/#cublas-t-dot
 *
 * BLAS interfaces have a convenient "stride" parameter that can be used to
 * apply the kernel to various data layouts. In particular, if we set the stride
 * to @b zero, we can fool the kernel into thinking that a scalar is a vector.
 */
struct cuda_blas_dot_t {};

struct cuda_blas_gemm_t {

// We view this input array as a wide 2D matrix.
@@ -64,8 +74,6 @@ struct cuda_blas_gemm_t {
}
};

-struct cuda_blas_dot_t {};

__global__ void cu_reduce_tensors( //
float const *inputs, unsigned int input_size, float *sums_per_row, unsigned int columns) {

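The `cuda_blas_dot_t` placeholder above ships empty. One possible shape for it, sketched under the assumption that the caller manages a `cublasHandle_t` and device buffers, and using the documented `cublasSdot` signature; whether cuBLAS tolerates the zero-stride operand is precisely what the benchmark would reveal (error handling omitted for brevity):

```cpp
#include <cublas_v2.h> // `cublasSdot`, `cublasHandle_t`

// Hypothetical sketch, not part of this commit: reduce a device vector by
// dotting it against a single device-resident 1.0f read with zero stride.
struct cuda_blas_dot_sketch_t {
    cublasHandle_t handle = nullptr;     // created and destroyed by the caller
    float const *device_begin = nullptr; // `count` floats in device memory
    float const *device_one = nullptr;   // one 1.0f in device memory
    int count = 0;

    float operator()() const {
        float result = 0.0f; // default pointer mode returns the result on the host
        cublasSdot(handle, count, device_begin, 1, device_one, 0, &result);
        return result;
    }
};
```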
