Skip to content

Commit

Permalink
Fix illegal acces mean/stdev, sum add Kahan Summation (#2223)
Browse files Browse the repository at this point in the history
This PR addresses #2204 and #2205. 


* fixes illegal access / test coverage for mean row-wise kernel
* fixes illegal access / test coverage for stdev row-wise kernel
* modified sum kernels to utilize Kahan/Neumaier summation per thread, also increase load per thread to benefit from this

FYI, @tfeher

Authors:
  - Malte Förster (https://github.com/mfoerste4)

Approvers:
  - Tamas Bela Feher (https://github.com/tfeher)

URL: #2223
  • Loading branch information
mfoerste4 authored Mar 16, 2024
1 parent 36484f4 commit 7335267
Show file tree
Hide file tree
Showing 7 changed files with 236 additions and 126 deletions.
4 changes: 2 additions & 2 deletions cpp/include/raft/stats/detail/mean.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021-2023, NVIDIA CORPORATION.
* Copyright (c) 2021-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -43,7 +43,7 @@ RAFT_KERNEL meanKernelRowMajor(Type* mu, const Type* data, IdxType D, IdxType N)
__syncthreads();
raft::myAtomicAdd(smu + thisColId, thread_data);
__syncthreads();
if (threadIdx.x < ColsPerBlk) raft::myAtomicAdd(mu + colId, smu[thisColId]);
if (threadIdx.x < ColsPerBlk && colId < D) raft::myAtomicAdd(mu + colId, smu[thisColId]);
}

template <typename Type, typename IdxType, int TPB>
Expand Down
4 changes: 2 additions & 2 deletions cpp/include/raft/stats/detail/stddev.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021-2023, NVIDIA CORPORATION.
* Copyright (c) 2021-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -45,7 +45,7 @@ RAFT_KERNEL stddevKernelRowMajor(Type* std, const Type* data, IdxType D, IdxType
__syncthreads();
raft::myAtomicAdd(sstd + thisColId, thread_data);
__syncthreads();
if (threadIdx.x < ColsPerBlk) raft::myAtomicAdd(std + colId, sstd[thisColId]);
if (threadIdx.x < ColsPerBlk && colId < D) raft::myAtomicAdd(std + colId, sstd[thisColId]);
}

template <typename Type, typename IdxType, int TPB>
Expand Down
78 changes: 63 additions & 15 deletions cpp/include/raft/stats/detail/sum.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -34,30 +34,72 @@ RAFT_KERNEL sumKernelRowMajor(Type* mu, const Type* data, IdxType D, IdxType N)
IdxType thisRowId = threadIdx.x / ColsPerBlk;
IdxType colId = thisColId + ((IdxType)blockIdx.y * ColsPerBlk);
IdxType rowId = thisRowId + ((IdxType)blockIdx.x * RowsPerBlkPerIter);
Type thread_data = Type(0);
Type thread_sum = Type(0);
const IdxType stride = RowsPerBlkPerIter * gridDim.x;
for (IdxType i = rowId; i < N; i += stride)
thread_data += (colId < D) ? data[i * D + colId] : Type(0);
for (IdxType i = rowId; i < N; i += stride) {
thread_sum += (colId < D) ? data[i * D + colId] : Type(0);
}
__shared__ Type smu[ColsPerBlk];
if (threadIdx.x < ColsPerBlk) smu[threadIdx.x] = Type(0);
__syncthreads();
raft::myAtomicAdd(smu + thisColId, thread_data);
raft::myAtomicAdd(smu + thisColId, thread_sum);
__syncthreads();
if (threadIdx.x < ColsPerBlk && colId < D) raft::myAtomicAdd(mu + colId, smu[thisColId]);
}

template <typename Type, typename IdxType, int TPB, int ColsPerBlk = 32>
RAFT_KERNEL sumKahanKernelRowMajor(Type* mu, const Type* data, IdxType D, IdxType N)
{
constexpr int RowsPerBlkPerIter = TPB / ColsPerBlk;
IdxType thisColId = threadIdx.x % ColsPerBlk;
IdxType thisRowId = threadIdx.x / ColsPerBlk;
IdxType colId = thisColId + ((IdxType)blockIdx.y * ColsPerBlk);
IdxType rowId = thisRowId + ((IdxType)blockIdx.x * RowsPerBlkPerIter);
Type thread_sum = Type(0);
Type thread_c = Type(0);
const IdxType stride = RowsPerBlkPerIter * gridDim.x;
for (IdxType i = rowId; i < N; i += stride) {
// KahanBabushkaNeumaierSum
const Type cur_value = (colId < D) ? data[i * D + colId] : Type(0);
const Type t = thread_sum + cur_value;
if (abs(thread_sum) >= abs(cur_value)) {
thread_c += (thread_sum - t) + cur_value;
} else {
thread_c += (cur_value - t) + thread_sum;
}
thread_sum = t;
}
thread_sum += thread_c;
__shared__ Type smu[ColsPerBlk];
if (threadIdx.x < ColsPerBlk) smu[threadIdx.x] = Type(0);
__syncthreads();
raft::myAtomicAdd(smu + thisColId, thread_sum);
__syncthreads();
if (threadIdx.x < ColsPerBlk && colId < D) raft::myAtomicAdd(mu + colId, smu[thisColId]);
}

template <typename Type, typename IdxType, int TPB>
RAFT_KERNEL sumKernelColMajor(Type* mu, const Type* data, IdxType D, IdxType N)
RAFT_KERNEL sumKahanKernelColMajor(Type* mu, const Type* data, IdxType D, IdxType N)
{
typedef cub::BlockReduce<Type, TPB> BlockReduce;
__shared__ typename BlockReduce::TempStorage temp_storage;
Type thread_data = Type(0);
Type thread_sum = Type(0);
Type thread_c = Type(0);
IdxType colStart = N * blockIdx.x;
for (IdxType i = threadIdx.x; i < N; i += TPB) {
IdxType idx = colStart + i;
thread_data += data[idx];
// KahanBabushkaNeumaierSum
IdxType idx = colStart + i;
const Type cur_value = data[idx];
const Type t = thread_sum + cur_value;
if (abs(thread_sum) >= abs(cur_value)) {
thread_c += (thread_sum - t) + cur_value;
} else {
thread_c += (cur_value - t) + thread_sum;
}
thread_sum = t;
}
Type acc = BlockReduce(temp_storage).Sum(thread_data);
thread_sum += thread_c;
Type acc = BlockReduce(temp_storage).Sum(thread_sum);
if (threadIdx.x == 0) { mu[blockIdx.x] = acc; }
}

Expand All @@ -66,15 +108,21 @@ void sum(Type* output, const Type* input, IdxType D, IdxType N, bool rowMajor, c
{
static const int TPB = 256;
if (rowMajor) {
static const int RowsPerThread = 4;
static const int ColsPerBlk = 32;
static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread;
dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk), raft::ceildiv(D, (IdxType)ColsPerBlk));
static const int ColsPerBlk = 8;
static const int MinRowsPerThread = 16;
static const int MinRowsPerBlk = (TPB / ColsPerBlk) * MinRowsPerThread;
static const int MaxBlocksDimX = 8192;

const IdxType grid_y = raft::ceildiv(D, (IdxType)ColsPerBlk);
const IdxType grid_x =
raft::min((IdxType)MaxBlocksDimX, raft::ceildiv(N, (IdxType)MinRowsPerBlk));

dim3 grid(grid_x, grid_y);
RAFT_CUDA_TRY(cudaMemset(output, 0, sizeof(Type) * D));
sumKernelRowMajor<Type, IdxType, TPB, ColsPerBlk>
sumKahanKernelRowMajor<Type, IdxType, TPB, ColsPerBlk>
<<<grid, TPB, 0, stream>>>(output, input, D, N);
} else {
sumKernelColMajor<Type, IdxType, TPB><<<D, TPB, 0, stream>>>(output, input, D, N);
sumKahanKernelColMajor<Type, IdxType, TPB><<<D, TPB, 0, stream>>>(output, input, D, N);
}
RAFT_CUDA_TRY(cudaPeekAtLastError());
}
Expand Down
66 changes: 33 additions & 33 deletions cpp/test/stats/mean.cu
Original file line number Diff line number Diff line change
Expand Up @@ -95,39 +95,39 @@ class MeanTest : public ::testing::TestWithParam<MeanInputs<T>> {
// Note: For 1024 samples, 256 experiments, a mean of 1.0 with stddev=1.0, the
// measured mean (of a normal distribution) will fall outside of an epsilon of
// 0.15 only 4/10000 times. (epsilon of 0.1 will fail 30/100 times)
const std::vector<MeanInputs<float>> inputsf = {{0.15f, 1.f, 1024, 32, true, false, 1234ULL},
{0.15f, 1.f, 1024, 64, true, false, 1234ULL},
{0.15f, 1.f, 1024, 128, true, false, 1234ULL},
{0.15f, 1.f, 1024, 256, true, false, 1234ULL},
{0.15f, -1.f, 1024, 32, false, false, 1234ULL},
{0.15f, -1.f, 1024, 64, false, false, 1234ULL},
{0.15f, -1.f, 1024, 128, false, false, 1234ULL},
{0.15f, -1.f, 1024, 256, false, false, 1234ULL},
{0.15f, 1.f, 1024, 32, true, true, 1234ULL},
{0.15f, 1.f, 1024, 64, true, true, 1234ULL},
{0.15f, 1.f, 1024, 128, true, true, 1234ULL},
{0.15f, 1.f, 1024, 256, true, true, 1234ULL},
{0.15f, -1.f, 1024, 32, false, true, 1234ULL},
{0.15f, -1.f, 1024, 64, false, true, 1234ULL},
{0.15f, -1.f, 1024, 128, false, true, 1234ULL},
{0.15f, -1.f, 1024, 256, false, true, 1234ULL}};

const std::vector<MeanInputs<double>> inputsd = {{0.15, 1.0, 1024, 32, true, false, 1234ULL},
{0.15, 1.0, 1024, 64, true, false, 1234ULL},
{0.15, 1.0, 1024, 128, true, false, 1234ULL},
{0.15, 1.0, 1024, 256, true, false, 1234ULL},
{0.15, -1.0, 1024, 32, false, false, 1234ULL},
{0.15, -1.0, 1024, 64, false, false, 1234ULL},
{0.15, -1.0, 1024, 128, false, false, 1234ULL},
{0.15, -1.0, 1024, 256, false, false, 1234ULL},
{0.15, 1.0, 1024, 32, true, true, 1234ULL},
{0.15, 1.0, 1024, 64, true, true, 1234ULL},
{0.15, 1.0, 1024, 128, true, true, 1234ULL},
{0.15, 1.0, 1024, 256, true, true, 1234ULL},
{0.15, -1.0, 1024, 32, false, true, 1234ULL},
{0.15, -1.0, 1024, 64, false, true, 1234ULL},
{0.15, -1.0, 1024, 128, false, true, 1234ULL},
{0.15, -1.0, 1024, 256, false, true, 1234ULL}};
const std::vector<MeanInputs<float>> inputsf = {
{0.15f, 1.f, 1024, 32, true, false, 1234ULL}, {0.15f, 1.f, 1024, 64, true, false, 1234ULL},
{0.15f, 1.f, 1024, 128, true, false, 1234ULL}, {0.15f, 1.f, 1024, 256, true, false, 1234ULL},
{0.15f, -1.f, 1024, 32, false, false, 1234ULL}, {0.15f, -1.f, 1024, 64, false, false, 1234ULL},
{0.15f, -1.f, 1024, 128, false, false, 1234ULL}, {0.15f, -1.f, 1024, 256, false, false, 1234ULL},
{0.15f, 1.f, 1024, 32, true, true, 1234ULL}, {0.15f, 1.f, 1024, 64, true, true, 1234ULL},
{0.15f, 1.f, 1024, 128, true, true, 1234ULL}, {0.15f, 1.f, 1024, 256, true, true, 1234ULL},
{0.15f, -1.f, 1024, 32, false, true, 1234ULL}, {0.15f, -1.f, 1024, 64, false, true, 1234ULL},
{0.15f, -1.f, 1024, 128, false, true, 1234ULL}, {0.15f, -1.f, 1024, 256, false, true, 1234ULL},
{0.15f, -1.f, 1030, 1, false, false, 1234ULL}, {0.15f, -1.f, 1030, 60, true, false, 1234ULL},
{2.0f, -1.f, 31, 120, false, false, 1234ULL}, {2.0f, -1.f, 1, 130, true, false, 1234ULL},
{0.15f, -1.f, 1030, 1, false, true, 1234ULL}, {0.15f, -1.f, 1030, 60, true, true, 1234ULL},
{2.0f, -1.f, 31, 120, false, true, 1234ULL}, {2.0f, -1.f, 1, 130, false, true, 1234ULL},
{2.0f, -1.f, 1, 1, false, false, 1234ULL}, {2.0f, -1.f, 1, 1, false, true, 1234ULL},
{2.0f, -1.f, 7, 23, false, false, 1234ULL}, {2.0f, -1.f, 7, 23, false, true, 1234ULL},
{2.0f, -1.f, 17, 5, false, false, 1234ULL}, {2.0f, -1.f, 17, 5, false, true, 1234ULL}};

const std::vector<MeanInputs<double>> inputsd = {
{0.15, 1.0, 1024, 32, true, false, 1234ULL}, {0.15, 1.0, 1024, 64, true, false, 1234ULL},
{0.15, 1.0, 1024, 128, true, false, 1234ULL}, {0.15, 1.0, 1024, 256, true, false, 1234ULL},
{0.15, -1.0, 1024, 32, false, false, 1234ULL}, {0.15, -1.0, 1024, 64, false, false, 1234ULL},
{0.15, -1.0, 1024, 128, false, false, 1234ULL}, {0.15, -1.0, 1024, 256, false, false, 1234ULL},
{0.15, 1.0, 1024, 32, true, true, 1234ULL}, {0.15, 1.0, 1024, 64, true, true, 1234ULL},
{0.15, 1.0, 1024, 128, true, true, 1234ULL}, {0.15, 1.0, 1024, 256, true, true, 1234ULL},
{0.15, -1.0, 1024, 32, false, true, 1234ULL}, {0.15, -1.0, 1024, 64, false, true, 1234ULL},
{0.15, -1.0, 1024, 128, false, true, 1234ULL}, {0.15, -1.0, 1024, 256, false, true, 1234ULL},
{0.15, -1.0, 1030, 1, false, false, 1234ULL}, {0.15, -1.0, 1030, 60, true, false, 1234ULL},
{2.0, -1.0, 31, 120, false, false, 1234ULL}, {2.0, -1.0, 1, 130, true, false, 1234ULL},
{0.15, -1.0, 1030, 1, false, true, 1234ULL}, {0.15, -1.0, 1030, 60, true, true, 1234ULL},
{2.0, -1.0, 31, 120, false, true, 1234ULL}, {2.0, -1.0, 1, 130, false, true, 1234ULL},
{2.0, -1.0, 1, 1, false, false, 1234ULL}, {2.0, -1.0, 1, 1, false, true, 1234ULL},
{2.0, -1.0, 7, 23, false, false, 1234ULL}, {2.0, -1.0, 7, 23, false, true, 1234ULL},
{2.0, -1.0, 17, 5, false, false, 1234ULL}, {2.0, -1.0, 17, 5, false, true, 1234ULL}};

typedef MeanTest<float> MeanTestF;
TEST_P(MeanTestF, Result)
Expand Down
66 changes: 27 additions & 39 deletions cpp/test/stats/minmax.cu
Original file line number Diff line number Diff line change
Expand Up @@ -145,45 +145,33 @@ class MinMaxTest : public ::testing::TestWithParam<MinMaxInputs<T>> {
rmm::device_uvector<T> minmax_ref;
};

const std::vector<MinMaxInputs<float>> inputsf = {{0.00001f, 1024, 32, 1234ULL},
{0.00001f, 1024, 64, 1234ULL},
{0.00001f, 1024, 128, 1234ULL},
{0.00001f, 1024, 256, 1234ULL},
{0.00001f, 1024, 512, 1234ULL},
{0.00001f, 1024, 1024, 1234ULL},
{0.00001f, 4096, 32, 1234ULL},
{0.00001f, 4096, 64, 1234ULL},
{0.00001f, 4096, 128, 1234ULL},
{0.00001f, 4096, 256, 1234ULL},
{0.00001f, 4096, 512, 1234ULL},
{0.00001f, 4096, 1024, 1234ULL},
{0.00001f, 8192, 32, 1234ULL},
{0.00001f, 8192, 64, 1234ULL},
{0.00001f, 8192, 128, 1234ULL},
{0.00001f, 8192, 256, 1234ULL},
{0.00001f, 8192, 512, 1234ULL},
{0.00001f, 8192, 1024, 1234ULL},
{0.00001f, 1024, 8192, 1234ULL}};

const std::vector<MinMaxInputs<double>> inputsd = {{0.0000001, 1024, 32, 1234ULL},
{0.0000001, 1024, 64, 1234ULL},
{0.0000001, 1024, 128, 1234ULL},
{0.0000001, 1024, 256, 1234ULL},
{0.0000001, 1024, 512, 1234ULL},
{0.0000001, 1024, 1024, 1234ULL},
{0.0000001, 4096, 32, 1234ULL},
{0.0000001, 4096, 64, 1234ULL},
{0.0000001, 4096, 128, 1234ULL},
{0.0000001, 4096, 256, 1234ULL},
{0.0000001, 4096, 512, 1234ULL},
{0.0000001, 4096, 1024, 1234ULL},
{0.0000001, 8192, 32, 1234ULL},
{0.0000001, 8192, 64, 1234ULL},
{0.0000001, 8192, 128, 1234ULL},
{0.0000001, 8192, 256, 1234ULL},
{0.0000001, 8192, 512, 1234ULL},
{0.0000001, 8192, 1024, 1234ULL},
{0.0000001, 1024, 8192, 1234ULL}};
const std::vector<MinMaxInputs<float>> inputsf = {
{0.00001f, 1024, 32, 1234ULL}, {0.00001f, 1024, 64, 1234ULL}, {0.00001f, 1024, 128, 1234ULL},
{0.00001f, 1024, 256, 1234ULL}, {0.00001f, 1024, 512, 1234ULL}, {0.00001f, 1024, 1024, 1234ULL},
{0.00001f, 4096, 32, 1234ULL}, {0.00001f, 4096, 64, 1234ULL}, {0.00001f, 4096, 128, 1234ULL},
{0.00001f, 4096, 256, 1234ULL}, {0.00001f, 4096, 512, 1234ULL}, {0.00001f, 4096, 1024, 1234ULL},
{0.00001f, 8192, 32, 1234ULL}, {0.00001f, 8192, 64, 1234ULL}, {0.00001f, 8192, 128, 1234ULL},
{0.00001f, 8192, 256, 1234ULL}, {0.00001f, 8192, 512, 1234ULL}, {0.00001f, 8192, 1024, 1234ULL},
{0.00001f, 1024, 8192, 1234ULL}, {0.00001f, 1023, 5, 1234ULL}, {0.00001f, 1025, 30, 1234ULL},
{0.00001f, 2047, 65, 1234ULL}, {0.00001f, 2049, 22, 1234ULL}, {0.00001f, 31, 644, 1234ULL},
{0.00001f, 33, 999, 1234ULL}, {0.00001f, 1, 1, 1234ULL}, {0.00001f, 7, 23, 1234ULL},
{0.00001f, 17, 5, 1234ULL}};

const std::vector<MinMaxInputs<double>> inputsd = {
{0.0000001, 1024, 32, 1234ULL}, {0.0000001, 1024, 64, 1234ULL},
{0.0000001, 1024, 128, 1234ULL}, {0.0000001, 1024, 256, 1234ULL},
{0.0000001, 1024, 512, 1234ULL}, {0.0000001, 1024, 1024, 1234ULL},
{0.0000001, 4096, 32, 1234ULL}, {0.0000001, 4096, 64, 1234ULL},
{0.0000001, 4096, 128, 1234ULL}, {0.0000001, 4096, 256, 1234ULL},
{0.0000001, 4096, 512, 1234ULL}, {0.0000001, 4096, 1024, 1234ULL},
{0.0000001, 8192, 32, 1234ULL}, {0.0000001, 8192, 64, 1234ULL},
{0.0000001, 8192, 128, 1234ULL}, {0.0000001, 8192, 256, 1234ULL},
{0.0000001, 8192, 512, 1234ULL}, {0.0000001, 8192, 1024, 1234ULL},
{0.0000001, 1024, 8192, 1234ULL}, {0.0000001, 1023, 5, 1234ULL},
{0.0000001, 1025, 30, 1234ULL}, {0.0000001, 2047, 65, 1234ULL},
{0.0000001, 2049, 22, 1234ULL}, {0.0000001, 31, 644, 1234ULL},
{0.0000001, 33, 999, 1234ULL}, {0.0000001, 1, 1, 1234ULL},
{0.0000001, 7, 23, 1234ULL}, {0.0000001, 17, 5, 1234ULL}};

typedef MinMaxTest<float> MinMaxTestF;
TEST_P(MinMaxTestF, Result)
Expand Down
Loading

0 comments on commit 7335267

Please sign in to comment.