Skip to content

Commit

Permalink
Fix support for different input and output types in linalg::reduce (#296
Browse files Browse the repository at this point in the history
)

Closes #165 

Uses a C++17 `if constexpr` to discard at compile time a code path that doesn't support different input and output types, and adds a test for such a case (the test won't compile without that `constexpr` keyword).

Authors:
  - Louis Sugy (https://github.com/Nyrio)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)
  - Brad Rees (https://github.com/BradReesWork)

URL: #296
  • Loading branch information
Nyrio authored Jul 21, 2021
1 parent 35411a0 commit 14ff641
Show file tree
Hide file tree
Showing 3 changed files with 79 additions and 41 deletions.
4 changes: 2 additions & 2 deletions cpp/include/raft/linalg/strided_reduction.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -147,8 +147,8 @@ void stridedReduction(OutType *dots, const InType *data, IdxType D, IdxType N,

///@todo: this complication should go away once we have eliminated the need
/// for atomics in stridedKernel (redesign for this is already underway)
if (std::is_same<ReduceLambda, raft::Sum<OutType>>::value &&
std::is_same<InType, OutType>::value)
if constexpr (std::is_same<ReduceLambda, raft::Sum<OutType>>::value &&
std::is_same<InType, OutType>::value)
stridedSummationKernel<InType>
<<<nblks, thrds, shmemSize, stream>>>(dots, data, D, N, init, main_op);
else
Expand Down
81 changes: 58 additions & 23 deletions cpp/test/linalg/reduce.cu
Original file line number Diff line number Diff line change
Expand Up @@ -25,43 +25,48 @@
namespace raft {
namespace linalg {

template <typename T>
template <typename InType, typename OutType>
struct ReduceInputs {
T tolerance;
OutType tolerance;
int rows, cols;
bool rowMajor, alongRows;
unsigned long long int seed;
};

template <typename T>
::std::ostream &operator<<(::std::ostream &os, const ReduceInputs<T> &dims) {
template <typename InType, typename OutType>
::std::ostream &operator<<(::std::ostream &os,
const ReduceInputs<InType, OutType> &dims) {
return os;
}

// Or else, we get the following compilation error
// for an extended __device__ lambda cannot have private or protected access
// within its class
template <typename T>
void reduceLaunch(T *dots, const T *data, int cols, int rows, bool rowMajor,
bool alongRows, bool inplace, cudaStream_t stream) {
reduce(dots, data, cols, rows, (T)0, rowMajor, alongRows, stream, inplace,
[] __device__(T in, int i) { return in * in; });
template <typename InType, typename OutType>
void reduceLaunch(OutType *dots, const InType *data, int cols, int rows,
bool rowMajor, bool alongRows, bool inplace,
cudaStream_t stream) {
reduce(
dots, data, cols, rows, (OutType)0, rowMajor, alongRows, stream, inplace,
[] __device__(InType in, int i) { return static_cast<OutType>(in * in); });
}

template <typename T>
class ReduceTest : public ::testing::TestWithParam<ReduceInputs<T>> {
template <typename InType, typename OutType>
class ReduceTest
: public ::testing::TestWithParam<ReduceInputs<InType, OutType>> {
protected:
void SetUp() override {
CUDA_CHECK(cudaStreamCreate(&stream));
params = ::testing::TestWithParam<ReduceInputs<T>>::GetParam();
params =
::testing::TestWithParam<ReduceInputs<InType, OutType>>::GetParam();
raft::random::Rng r(params.seed);
int rows = params.rows, cols = params.cols;
int len = rows * cols;
outlen = params.alongRows ? rows : cols;
raft::allocate(data, len);
raft::allocate(dots_exp, outlen);
raft::allocate(dots_act, outlen);
r.uniform(data, len, T(-1.0), T(1.0), stream);
r.uniform(data, len, InType(-1.0), InType(1.0), stream);
naiveReduction(dots_exp, data, cols, rows, params.rowMajor,
params.alongRows, stream);

Expand All @@ -84,13 +89,14 @@ class ReduceTest : public ::testing::TestWithParam<ReduceInputs<T>> {
}

protected:
ReduceInputs<T> params;
T *data, *dots_exp, *dots_act;
ReduceInputs<InType, OutType> params;
InType *data;
OutType *dots_exp, *dots_act;
int outlen;
cudaStream_t stream;
};

const std::vector<ReduceInputs<float>> inputsf = {
const std::vector<ReduceInputs<float, float>> inputsff = {
{0.000002f, 1024, 32, true, true, 1234ULL},
{0.000002f, 1024, 64, true, true, 1234ULL},
{0.000002f, 1024, 128, true, true, 1234ULL},
Expand All @@ -108,7 +114,7 @@ const std::vector<ReduceInputs<float>> inputsf = {
{0.000002f, 1024, 128, false, false, 1234ULL},
{0.000002f, 1024, 256, false, false, 1234ULL}};

const std::vector<ReduceInputs<double>> inputsd = {
const std::vector<ReduceInputs<double, double>> inputsdd = {
{0.000000001, 1024, 32, true, true, 1234ULL},
{0.000000001, 1024, 64, true, true, 1234ULL},
{0.000000001, 1024, 128, true, true, 1234ULL},
Expand All @@ -126,21 +132,50 @@ const std::vector<ReduceInputs<double>> inputsd = {
{0.000000001, 1024, 128, false, false, 1234ULL},
{0.000000001, 1024, 256, false, false, 1234ULL}};

typedef ReduceTest<float> ReduceTestF;
TEST_P(ReduceTestF, Result) {
const std::vector<ReduceInputs<float, double>> inputsfd = {
{0.000002f, 1024, 32, true, true, 1234ULL},
{0.000002f, 1024, 64, true, true, 1234ULL},
{0.000002f, 1024, 128, true, true, 1234ULL},
{0.000002f, 1024, 256, true, true, 1234ULL},
{0.000002f, 1024, 32, true, false, 1234ULL},
{0.000002f, 1024, 64, true, false, 1234ULL},
{0.000002f, 1024, 128, true, false, 1234ULL},
{0.000002f, 1024, 256, true, false, 1234ULL},
{0.000002f, 1024, 32, false, true, 1234ULL},
{0.000002f, 1024, 64, false, true, 1234ULL},
{0.000002f, 1024, 128, false, true, 1234ULL},
{0.000002f, 1024, 256, false, true, 1234ULL},
{0.000002f, 1024, 32, false, false, 1234ULL},
{0.000002f, 1024, 64, false, false, 1234ULL},
{0.000002f, 1024, 128, false, false, 1234ULL},
{0.000002f, 1024, 256, false, false, 1234ULL}};

typedef ReduceTest<float, float> ReduceTestFF;
TEST_P(ReduceTestFF, Result) {
ASSERT_TRUE(devArrMatch(dots_exp, dots_act, outlen,
raft::CompareApprox<float>(params.tolerance)));
}

typedef ReduceTest<double> ReduceTestD;
TEST_P(ReduceTestD, Result) {
typedef ReduceTest<double, double> ReduceTestDD;
TEST_P(ReduceTestDD, Result) {
ASSERT_TRUE(devArrMatch(dots_exp, dots_act, outlen,
raft::CompareApprox<double>(params.tolerance)));
}

INSTANTIATE_TEST_CASE_P(ReduceTests, ReduceTestF, ::testing::ValuesIn(inputsf));
typedef ReduceTest<float, double> ReduceTestFD;
TEST_P(ReduceTestFD, Result) {
ASSERT_TRUE(devArrMatch(dots_exp, dots_act, outlen,
raft::CompareApprox<double>(params.tolerance)));
}

INSTANTIATE_TEST_CASE_P(ReduceTests, ReduceTestFF,
::testing::ValuesIn(inputsff));

INSTANTIATE_TEST_CASE_P(ReduceTests, ReduceTestDD,
::testing::ValuesIn(inputsdd));

INSTANTIATE_TEST_CASE_P(ReduceTests, ReduceTestD, ::testing::ValuesIn(inputsd));
INSTANTIATE_TEST_CASE_P(ReduceTests, ReduceTestFD,
::testing::ValuesIn(inputsfd));

} // end namespace linalg
} // end namespace raft
35 changes: 19 additions & 16 deletions cpp/test/linalg/reduce.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
* limitations under the License.
*/

#pragma once

#include <cublas_v2.h>
#include <raft/linalg/cublas_wrappers.h>
#include <thrust/device_vector.h>
Expand All @@ -23,52 +25,53 @@
namespace raft {
namespace linalg {

template <typename Type>
__global__ void naiveCoalescedReductionKernel(Type *dots, const Type *data,
template <typename InType, typename OutType>
__global__ void naiveCoalescedReductionKernel(OutType *dots, const InType *data,
int D, int N) {
Type acc = (Type)0;
OutType acc = (OutType)0;
int rowStart = threadIdx.x + blockIdx.x * blockDim.x;
if (rowStart < N) {
for (int i = 0; i < D; ++i) {
acc += data[rowStart * D + i] * data[rowStart * D + i];
acc +=
static_cast<OutType>(data[rowStart * D + i] * data[rowStart * D + i]);
}
dots[rowStart] = 2 * acc;
}
}

template <typename Type>
void naiveCoalescedReduction(Type *dots, const Type *data, int D, int N,
template <typename InType, typename OutType>
void naiveCoalescedReduction(OutType *dots, const InType *data, int D, int N,
cudaStream_t stream) {
static const int TPB = 64;
int nblks = raft::ceildiv(N, TPB);
naiveCoalescedReductionKernel<Type>
naiveCoalescedReductionKernel<InType, OutType>
<<<nblks, TPB, 0, stream>>>(dots, data, D, N);
CUDA_CHECK(cudaPeekAtLastError());
}

template <typename Type>
void unaryAndGemv(Type *dots, const Type *data, int D, int N,
template <typename InType, typename OutType>
void unaryAndGemv(OutType *dots, const InType *data, int D, int N,
cudaStream_t stream) {
//computes a MLCommon unary op on data (squares it), then computes Ax
//(A input matrix and x column vector) to sum columns
thrust::device_vector<Type> sq(D * N);
thrust::device_vector<OutType> sq(D * N);
raft::linalg::unaryOp(
thrust::raw_pointer_cast(sq.data()), data, D * N,
[] __device__(Type v) { return v * v; }, stream);
[] __device__(InType v) { return static_cast<OutType>(v * v); }, stream);
cublasHandle_t handle;
CUBLAS_CHECK(cublasCreate(&handle));
thrust::device_vector<Type> ones(N, 1); //column vector [1...1]
Type alpha = 1, beta = 0;
thrust::device_vector<OutType> ones(N, 1); //column vector [1...1]
OutType alpha = 1, beta = 0;
CUBLAS_CHECK(raft::linalg::cublasgemv(
handle, CUBLAS_OP_N, D, N, &alpha, thrust::raw_pointer_cast(sq.data()), D,
thrust::raw_pointer_cast(ones.data()), 1, &beta, dots, 1, stream));
CUDA_CHECK(cudaDeviceSynchronize());
CUBLAS_CHECK(cublasDestroy(handle));
}

template <typename Type>
void naiveReduction(Type *dots, const Type *data, int D, int N, bool rowMajor,
bool alongRows, cudaStream_t stream) {
template <typename InType, typename OutType>
void naiveReduction(OutType *dots, const InType *data, int D, int N,
bool rowMajor, bool alongRows, cudaStream_t stream) {
if (rowMajor && alongRows) {
naiveCoalescedReduction(dots, data, D, N, stream);
} else if (rowMajor && !alongRows) {
Expand Down

0 comments on commit 14ff641

Please sign in to comment.