From 3a4ec66f46b38314fbe990f3a12f83c83e75c9b2 Mon Sep 17 00:00:00 2001 From: Mahesh Doijade Date: Wed, 12 May 2021 22:03:33 +0530 Subject: [PATCH 01/17] Refactor fusedL2NN to use pairwiseDistance class. invert block y/x dir usage in all contraction based kernels so that n is along x dir and m is along y dir blocks --- cpp/include/raft/distance/cosine.cuh | 4 +- cpp/include/raft/distance/euclidean.cuh | 9 +- cpp/include/raft/distance/fused_l2_nn.cuh | 298 ++++-------------- cpp/include/raft/distance/l1.cuh | 4 +- .../raft/distance/pairwise_distance_base.cuh | 30 +- cpp/include/raft/linalg/contractions.cuh | 8 +- 6 files changed, 100 insertions(+), 253 deletions(-) diff --git a/cpp/include/raft/distance/cosine.cuh b/cpp/include/raft/distance/cosine.cuh index 5a212ce64c..a1793777f2 100644 --- a/cpp/include/raft/distance/cosine.cuh +++ b/cpp/include/raft/distance/cosine.cuh @@ -61,8 +61,8 @@ void cosineImpl(const DataT *x, const DataT *y, const DataT *xn, typedef typename std::conditional::type KPolicy; - dim3 grid(raft::ceildiv(m, KPolicy::Mblk), - raft::ceildiv(n, KPolicy::Nblk)); + dim3 grid(raft::ceildiv(n, KPolicy::Nblk), + raft::ceildiv(m, KPolicy::Mblk)); dim3 blk(KPolicy::Nthreads); // Accumulation operation lambda diff --git a/cpp/include/raft/distance/euclidean.cuh b/cpp/include/raft/distance/euclidean.cuh index f3f946ad7b..76a721c202 100644 --- a/cpp/include/raft/distance/euclidean.cuh +++ b/cpp/include/raft/distance/euclidean.cuh @@ -60,8 +60,8 @@ void euclideanExpImpl(const DataT *x, const DataT *y, const DataT *xn, typedef typename std::conditional::type KPolicy; - dim3 grid(raft::ceildiv(m, KPolicy::Mblk), - raft::ceildiv(n, KPolicy::Nblk)); + dim3 grid(raft::ceildiv(n, KPolicy::Nblk), + raft::ceildiv(m, KPolicy::Mblk)); dim3 blk(KPolicy::Nthreads); // Accumulation operation lambda @@ -229,8 +229,9 @@ void euclideanUnExpImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k, typedef typename std::conditional::type KPolicy; - dim3 grid(raft::ceildiv(m, KPolicy::Mblk), - raft::ceildiv(n, KPolicy::Nblk)); + + dim3 grid(raft::ceildiv(n, KPolicy::Nblk), + raft::ceildiv(m, KPolicy::Mblk)); dim3 blk(KPolicy::Nthreads); // Accumulation operation lambda diff --git a/cpp/include/raft/distance/fused_l2_nn.cuh b/cpp/include/raft/distance/fused_l2_nn.cuh index 000d856841..2dd32d93bc 100644 --- a/cpp/include/raft/distance/fused_l2_nn.cuh +++ b/cpp/include/raft/distance/fused_l2_nn.cuh @@ -21,6 +21,7 @@ #include #include #include +#include namespace raft { namespace distance { @@ -68,120 +69,52 @@ struct MinReduceOp { DI void init(DataT* out, DataT maxVal) { *out = maxVal; } }; -template > -struct FusedL2NN : public BaseClass { - private: - typedef Policy P; - - const DataT* xn; - const DataT* yn; - OutT* min; - int* mutex; - - DataT *sxNorm, *syNorm; - cub::KeyValuePair* sRed; - - DataT maxVal; - - DataT acc[P::AccRowsPerTh][P::AccColsPerTh]; - - ReduceOpT redOp; - KVPReduceOpT pairRedOp; - -#if (ENABLE_MEMCPY_ASYNC == 1) - DataT zeros[P::Veclen]; - nvcuda::experimental::pipeline pipe; -#endif - - static const DataT Two = (DataT)2.0; - static constexpr size_t SizeAndAlign = P::Veclen * sizeof(DataT); - - public: - DI FusedL2NN(OutT* _min, const DataT* _x, const DataT* _y, const DataT* _xn, - const DataT* _yn, IdxT _m, IdxT _n, IdxT _k, char* _smem, - DataT _mv, int* _mut, ReduceOpT op, KVPReduceOpT pair_op) - : BaseClass(_x, _y, _m, _n, _k, _smem), - xn(_xn), - yn(_yn), - min(_min), - mutex(_mut), - sxNorm((DataT*)_smem), - syNorm(&(sxNorm[P::Mblk])), - sRed((cub::KeyValuePair*)_smem), - maxVal(_mv), - redOp(op), - pairRedOp(pair_op) { -#if (ENABLE_MEMCPY_ASYNC == 1) -#pragma unroll - for (int i = 0; i < P::Veclen; ++i) { - zeros[i] = BaseClass::Zero; - } -#endif - } - - DI void run() { - prolog(); - loop(); - __syncthreads(); // so that we can safely reuse smem - epilog(); - } - - private: - DI void prolog() { - this->ldgXY(0); -#pragma unroll - for (int i = 0; i < P::AccRowsPerTh; ++i) { -#pragma unroll - for (int j = 0; j < P::AccColsPerTh; ++j) { - acc[i][j] = BaseClass::Zero; - } - } - this->stsXY(); - __syncthreads(); - this->pageWr ^= 1; +template +__global__ void initKernel(OutT* min, IdxT m, DataT maxVal, ReduceOpT redOp) { + auto tid = IdxT(blockIdx.x) * blockDim.x + threadIdx.x; + if (tid < m) { + redOp.init(min + tid, maxVal); } +} - DI void loop() { - for (int kidx = P::Kblk; kidx < this->k; kidx += P::Kblk) { - this->ldgXY(kidx); - accumulate(); // on the previous k-block - this->stsXY(); - __syncthreads(); - this->pageWr ^= 1; - this->pageRd ^= 1; - } - accumulate(); // last iteration - } +template +void fusedL2NNImpl(OutT* min, const DataT* x, const DataT* y, const DataT* xn, + const DataT* yn, IdxT m, IdxT n, IdxT k, int* workspace, + ReduceOpT redOp, KVPReduceOpT pairRedOp, bool sqrt, + bool initOutBuffer, cudaStream_t stream) { + typedef typename linalg::Policy4x4::Policy P; + dim3 grid(raft::ceildiv(n, P::Nblk), + raft::ceildiv(m, P::Mblk)); + dim3 blk(P::Nthreads); + auto nblks = raft::ceildiv(m, P::Nthreads); + constexpr auto maxVal = std::numeric_limits::max(); + typedef cub::KeyValuePair KVPair; + + // Accumulation operation lambda + auto core_lambda = [] __device__(DataT &acc, DataT & x, DataT & y) { + acc += x * y; + }; + + int *mutex = workspace; + // epilogue operation lambda for final value calculation + auto epilog_lambda = [sqrt, min, mutex, m, n, redOp, pairRedOp] __device__( + DataT acc[P::AccRowsPerTh][P::AccColsPerTh], + DataT * regxn, DataT * regyn) { + extern __shared__ char smem[]; + KVPair *sRed = (KVPair*)smem; + + ReduceOpT red_op(redOp); + KVPReduceOpT pairRed_op(pairRedOp); - DI void epilog() { - for (int i = threadIdx.x; i < P::Mblk; i += P::Nthreads) { - auto idx = blockIdx.x * P::Mblk + i; - sxNorm[i] = idx < this->m ? xn[idx] : maxVal; - } - for (int i = threadIdx.x; i < P::Nblk; i += P::Nthreads) { - auto idx = blockIdx.y * P::Nblk + i; - syNorm[i] = idx < this->n ? yn[idx] : maxVal; - } - __syncthreads(); - DataT regxn[P::AccRowsPerTh], regyn[P::AccColsPerTh]; -#pragma unroll - for (int i = 0; i < P::AccRowsPerTh; ++i) { - regxn[i] = sxNorm[i * P::AccThRows + this->accrowid]; - } -#pragma unroll - for (int i = 0; i < P::AccColsPerTh; ++i) { - regyn[i] = syNorm[i * P::AccThCols + this->acccolid]; - } #pragma unroll for (int i = 0; i < P::AccRowsPerTh; ++i) { #pragma unroll for (int j = 0; j < P::AccColsPerTh; ++j) { - acc[i][j] = regxn[i] + regyn[j] - Two * acc[i][j]; + acc[i][j] = regxn[i] + regyn[j] - (DataT)2.0 * acc[i][j]; } } - if (Sqrt) { + if (sqrt) { #pragma unroll for (int i = 0; i < P::AccRowsPerTh; ++i) { #pragma unroll @@ -190,175 +123,82 @@ struct FusedL2NN : public BaseClass { } } } + // reduce - cub::KeyValuePair val[P::AccRowsPerTh]; + KVPair val[P::AccRowsPerTh]; auto lid = raft::laneId(); + const auto acccolid = threadIdx.x % P::AccThCols; + const auto accrowid = threadIdx.x / P::AccThCols; #pragma unroll for (int i = 0; i < P::AccRowsPerTh; ++i) { val[i] = {-1, maxVal}; #pragma unroll for (int j = 0; j < P::AccColsPerTh; ++j) { - auto tmpkey = this->acccolid + j * P::AccThCols + blockIdx.y * P::Nblk; - cub::KeyValuePair tmp = {tmpkey, acc[i][j]}; - if (tmpkey < this->n) + auto tmpkey = acccolid + j * P::AccThCols + blockIdx.x * P::Nblk; + KVPair tmp = {tmpkey, acc[i][j]}; + if (tmpkey < n) val[i] = - pairRedOp(this->accrowid + i * P::AccThRows + blockIdx.x * P::Mblk, + pairRed_op(accrowid + i * P::AccThRows + blockIdx.y * P::Mblk, tmp, val[i]); } - __syncthreads(); #pragma unroll for (int j = P::AccThCols / 2; j > 0; j >>= 1) { auto tmpkey = raft::shfl(val[i].key, lid + j); auto tmpvalue = raft::shfl(val[i].value, lid + j); - cub::KeyValuePair tmp = {tmpkey, tmpvalue}; + KVPair tmp = {tmpkey, tmpvalue}; val[i] = - pairRedOp(this->accrowid + i * P::AccThRows + blockIdx.x * P::Mblk, + pairRed_op(accrowid + i * P::AccThRows + blockIdx.y * P::Mblk, tmp, val[i]); } } + __syncthreads(); if (lid % P::AccThCols == 0) { #pragma unroll for (int i = 0; i < P::AccRowsPerTh; ++i) { - sRed[i * P::AccThCols + this->accrowid] = val[i]; + sRed[i * P::AccThCols + accrowid] = val[i]; } } __syncthreads(); - updateResults(); - } - - /* - * todo: From Volta onwards see if "coalesced" atomicCAS approach as - * written below helps improve perf - * ``` - * auto tid = threadIdx.x; - * auto rid = IdxT(blockIdx.x) * P::Mblk + tid; - * if (rid < m) { - * auto val = sRed[i]; - * while (atomicCAS(mutex + rid, 0, 1) == 1) - * ; - * __threadfence(); - * redOp(rid, min + rid, val); - * __threadfence(); - * atomicCAS(mutex + rid, 1, 0); - * } - * ``` - */ - DI void updateResults() { // for now have first lane from each warp update a unique output row. This // will resolve hang issues with pre-Volta architectures auto nWarps = blockDim.x / raft::WarpSize; - auto lid = raft::laneId(); - auto ridx = IdxT(blockIdx.x) * P::Mblk; + auto ridx = IdxT(blockIdx.y) * P::Mblk; if (lid == 0) { for (int i = threadIdx.x / raft::WarpSize; i < P::Mblk; i += nWarps) { auto rid = ridx + i; - if (rid < this->m) { + if (rid < m) { auto val = sRed[i]; while (atomicCAS(mutex + rid, 0, 1) == 1) ; __threadfence(); - redOp(rid, min + rid, val); + red_op(rid, min + rid, val); __threadfence(); atomicCAS(mutex + rid, 1, 0); } } } - } - - DI void accumulate() { -#pragma unroll - for (int ki = 0; ki < P::Kblk; ki += P::Veclen) { - this->ldsXY(ki); -#pragma unroll - for (int i = 0; i < P::AccRowsPerTh; ++i) { -#pragma unroll - for (int j = 0; j < P::AccColsPerTh; ++j) { -#pragma unroll - for (int v = 0; v < P::Veclen; ++v) { - acc[i][j] += this->regx[i][v] * this->regy[j][v]; - } - } - } - } - } - -#if (ENABLE_MEMCPY_ASYNC == 1) - DI void ldgXY(IdxT kidx) { - auto koffset = kidx + this->scolid; - auto offset = - this->pageWr * P::SmemPage + this->srowid * P::SmemStride + this->scolid; - auto* saddrx = this->sx + offset; - for (int i = 0; i < P::LdgPerThX; ++i) { - auto* sax = saddrx + i * P::LdgRowsX * P::SmemStride; - auto* gax = this->x + i * P::LdgRowsX * this->k + koffset; - auto inside = - koffset < this->k && (this->xrowid + i * P::LdgRowsX) < this->m; - __pipeline_memcpy_async(sax, inside ? gax : nullptr, SizeAndAlign, - inside ? 0 : SizeAndAlign); - } - auto* saddry = this->sy + offset; - for (int i = 0; i < P::LdgPerThY; ++i) { - auto* say = saddry + i * P::LdgRowsY * P::SmemStride; - auto* gay = this->y + i * P::LdgRowsY * this->k + koffset; - auto inside = - koffset < this->k && (this->yrowid + i * P::LdgRowsY) < this->n; - __pipeline_memcpy_async(say, inside ? gay : nullptr, SizeAndAlign, - inside ? 0 : SizeAndAlign); - } - pipe.commit(); - } + }; - DI void stsXY() { pipe.wait_prior<0>(); } -#endif // ENABLE_MEMCPY_ASYNC -}; // struct FusedL2NN - -template -__global__ __launch_bounds__(Policy::Nthreads, 2) void fusedL2NNkernel( - OutT* min, const DataT* x, const DataT* y, const DataT* xn, const DataT* yn, - IdxT m, IdxT n, IdxT k, DataT maxVal, int* mutex, ReduceOpT redOp, - KVPReduceOpT pairRedOp) { - extern __shared__ char smem[]; - FusedL2NN obj( - min, x, y, xn, yn, m, n, k, smem, maxVal, mutex, redOp, pairRedOp); - obj.run(); -} - -template -__global__ void initKernel(OutT* min, IdxT m, DataT maxVal, ReduceOpT redOp) { - auto tid = IdxT(blockIdx.x) * blockDim.x + threadIdx.x; - if (tid < m) { - redOp.init(min + tid, maxVal); - } -} - -template -void fusedL2NNImpl(OutT* min, const DataT* x, const DataT* y, const DataT* xn, - const DataT* yn, IdxT m, IdxT n, IdxT k, int* workspace, - ReduceOpT redOp, KVPReduceOpT pairRedOp, bool sqrt, - bool initOutBuffer, cudaStream_t stream) { - typedef typename linalg::Policy4x4::Policy Policy; - dim3 grid(raft::ceildiv(m, Policy::Mblk), - raft::ceildiv(n, Policy::Nblk)); - dim3 blk(Policy::Nthreads); - auto nblks = raft::ceildiv(m, Policy::Nthreads); - auto maxVal = std::numeric_limits::max(); CUDA_CHECK(cudaMemsetAsync(workspace, 0, sizeof(int) * m, stream)); if (initOutBuffer) { initKernel - <<>>(min, m, maxVal, redOp); + <<>>(min, m, maxVal, redOp); CUDA_CHECK(cudaGetLastError()); } - if (sqrt) { - fusedL2NNkernel - <<>>( - min, x, y, xn, yn, m, n, k, maxVal, workspace, redOp, pairRedOp); - } else { - fusedL2NNkernel - <<>>( - min, x, y, xn, yn, m, n, k, maxVal, workspace, redOp, pairRedOp); - } + + IdxT lda = k, ldb = k, ldd = n; + + auto fin_op = [] __device__(DataT d_val, int g_d_idx) { + return d_val; + }; + + pairwiseDistanceMatKernel + <<>>(x, y, xn, yn, m, n, k, lda, + ldb, ldd, nullptr, core_lambda, + epilog_lambda, fin_op); + CUDA_CHECK(cudaGetLastError()); } diff --git a/cpp/include/raft/distance/l1.cuh b/cpp/include/raft/distance/l1.cuh index ce4fbb33e3..c6232ae169 100644 --- a/cpp/include/raft/distance/l1.cuh +++ b/cpp/include/raft/distance/l1.cuh @@ -53,8 +53,8 @@ static void l1Impl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k, typedef typename std::conditional::type KPolicy; - dim3 grid(raft::ceildiv(m, KPolicy::Mblk), - raft::ceildiv(n, KPolicy::Nblk)); + dim3 grid(raft::ceildiv(n, KPolicy::Nblk), + raft::ceildiv(m, KPolicy::Mblk)); dim3 blk(KPolicy::Nthreads); // Accumulation operation lambda diff --git a/cpp/include/raft/distance/pairwise_distance_base.cuh b/cpp/include/raft/distance/pairwise_distance_base.cuh index 4e1605b887..1f49a5bdf6 100644 --- a/cpp/include/raft/distance/pairwise_distance_base.cuh +++ b/cpp/include/raft/distance/pairwise_distance_base.cuh @@ -56,6 +56,7 @@ namespace distance { template > struct PairwiseDistances : public BaseClass { @@ -147,13 +148,15 @@ struct PairwiseDistances : public BaseClass { // Load x & y norms required by this threadblock in shmem buffer for (int i = threadIdx.x; i < P::Mblk; i += P::Nthreads) { - auto idx = blockIdx.x * P::Mblk + i; + auto idx = blockIdx.y * P::Mblk + i; sxNorm[i] = idx < this->m ? xn[idx] : 0; } + for (int i = threadIdx.x; i < P::Nblk; i += P::Nthreads) { - auto idx = blockIdx.y * P::Nblk + i; + auto idx = blockIdx.x * P::Nblk + i; syNorm[i] = idx < this->n ? yn[idx] : 0; } + __syncthreads(); DataT regxn[P::AccRowsPerTh], regyn[P::AccColsPerTh]; @@ -171,16 +174,18 @@ struct PairwiseDistances : public BaseClass { epilog_op(acc, nullptr, nullptr); } - IdxT startx = blockIdx.x * P::Mblk + this->accrowid; - IdxT starty = blockIdx.y * P::Nblk + this->acccolid; + if (writeOut) { + IdxT starty = blockIdx.y * P::Mblk + this->accrowid; + IdxT startx = blockIdx.x * P::Nblk + this->acccolid; #pragma unroll - for (int i = 0; i < P::AccRowsPerTh; ++i) { - auto rowId = startx + i * P::AccThRows; + for (int i = 0; i < P::AccRowsPerTh; ++i) { + auto rowId = starty + i * P::AccThRows; #pragma unroll - for (int j = 0; j < P::AccColsPerTh; ++j) { - auto colId = starty + j * P::AccThCols; - if (rowId < this->m && colId < this->n) { - dOutput[rowId * this->n + colId] = fin_op(acc[i][j], 0); + for (int j = 0; j < P::AccColsPerTh; ++j) { + auto colId = startx + j * P::AccThCols; + if (rowId < this->m && colId < this->n) { + dOutput[rowId * this->n + colId] = fin_op(acc[i][j], 0); + } } } } @@ -219,7 +224,8 @@ struct PairwiseDistances : public BaseClass { */ template + typename EpilogueLambda, typename FinalLambda, bool isRowMajor = true, + bool writeOut = true> __global__ __launch_bounds__( Policy::Nthreads, 2) void pairwiseDistanceMatKernel(const DataT* x, const DataT* y, @@ -231,7 +237,7 @@ __global__ __launch_bounds__( extern __shared__ char smem[]; PairwiseDistances + EpilogueLambda, FinalLambda, isRowMajor, writeOut> obj(x, y, m, n, k, lda, ldb, ldd, _xn, _yn, dOutput, smem, core_op, epilog_op, fin_op); obj.run(); diff --git a/cpp/include/raft/linalg/contractions.cuh b/cpp/include/raft/linalg/contractions.cuh index 86d608ea87..c590abb142 100644 --- a/cpp/include/raft/linalg/contractions.cuh +++ b/cpp/include/raft/linalg/contractions.cuh @@ -293,13 +293,13 @@ struct Contractions_NT { pageWr(0), pageRd(0) { if (isRowMajor) { - xrowid = IdxT(blockIdx.x) * P::Mblk + srowid; - yrowid = IdxT(blockIdx.y) * P::Nblk + srowid; + xrowid = IdxT(blockIdx.y) * P::Mblk + srowid; + yrowid = IdxT(blockIdx.x) * P::Nblk + srowid; x = _x + xrowid * lda; y = _y + yrowid * ldb; } else { - xrowid = IdxT(blockIdx.x) * P::Mblk; - yrowid = IdxT(blockIdx.y) * P::Nblk; + xrowid = IdxT(blockIdx.y) * P::Mblk; + yrowid = IdxT(blockIdx.x) * P::Nblk; x = _x + xrowid + srowid * lda; y = _y + yrowid + srowid * ldb; } From 76f9a72b4be77fa2433bc0509002b2c388c422a2 Mon Sep 17 00:00:00 2001 From: Mahesh Doijade Date: Mon, 17 May 2021 21:18:08 +0530 Subject: [PATCH 02/17] -- add grid stride support to pairwise distance based cosine, l2, l1 kernels. --add launch config generator function to launch optimal grid size kernel for these pairwise dist kernels --- cpp/include/raft/distance/cosine.cuh | 3 +- cpp/include/raft/distance/euclidean.cuh | 7 +- cpp/include/raft/distance/l1.cuh | 3 +- .../raft/distance/pairwise_distance_base.cuh | 87 ++++++++++++++++--- 4 files changed, 78 insertions(+), 22 deletions(-) diff --git a/cpp/include/raft/distance/cosine.cuh b/cpp/include/raft/distance/cosine.cuh index a1793777f2..a214e5cddf 100644 --- a/cpp/include/raft/distance/cosine.cuh +++ b/cpp/include/raft/distance/cosine.cuh @@ -61,8 +61,7 @@ void cosineImpl(const DataT *x, const DataT *y, const DataT *xn, typedef typename std::conditional::type KPolicy; - dim3 grid(raft::ceildiv(n, KPolicy::Nblk), - raft::ceildiv(m, KPolicy::Mblk)); + dim3 grid = launchConfigGenerator(m, n); dim3 blk(KPolicy::Nthreads); // Accumulation operation lambda diff --git a/cpp/include/raft/distance/euclidean.cuh b/cpp/include/raft/distance/euclidean.cuh index 76a721c202..69c1b1015c 100644 --- a/cpp/include/raft/distance/euclidean.cuh +++ b/cpp/include/raft/distance/euclidean.cuh @@ -60,8 +60,7 @@ void euclideanExpImpl(const DataT *x, const DataT *y, const DataT *xn, typedef typename std::conditional::type KPolicy; - dim3 grid(raft::ceildiv(n, KPolicy::Nblk), - raft::ceildiv(m, KPolicy::Mblk)); + dim3 grid = launchConfigGenerator(m, n); dim3 blk(KPolicy::Nthreads); // Accumulation operation lambda @@ -230,8 +229,7 @@ void euclideanUnExpImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k, typedef typename std::conditional::type KPolicy; - dim3 grid(raft::ceildiv(n, KPolicy::Nblk), - raft::ceildiv(m, KPolicy::Mblk)); + dim3 grid = launchConfigGenerator(m, n); dim3 blk(KPolicy::Nthreads); // Accumulation operation lambda @@ -256,6 +254,7 @@ void euclideanUnExpImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k, }; if (isRowMajor) { + pairwiseDistanceMatKernel diff --git a/cpp/include/raft/distance/l1.cuh b/cpp/include/raft/distance/l1.cuh index c6232ae169..b6439da5a9 100644 --- a/cpp/include/raft/distance/l1.cuh +++ b/cpp/include/raft/distance/l1.cuh @@ -53,8 +53,7 @@ static void l1Impl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k, typedef typename std::conditional::type KPolicy; - dim3 grid(raft::ceildiv(n, KPolicy::Nblk), - raft::ceildiv(m, KPolicy::Mblk)); + dim3 grid = launchConfigGenerator(m, n); dim3 blk(KPolicy::Nthreads); // Accumulation operation lambda diff --git a/cpp/include/raft/distance/pairwise_distance_base.cuh b/cpp/include/raft/distance/pairwise_distance_base.cuh index 1f49a5bdf6..4875dbed52 100644 --- a/cpp/include/raft/distance/pairwise_distance_base.cuh +++ b/cpp/include/raft/distance/pairwise_distance_base.cuh @@ -16,6 +16,8 @@ #pragma once #include #include +#include +#include namespace raft { namespace distance { @@ -64,14 +66,12 @@ struct PairwiseDistances : public BaseClass { typedef Policy P; const DataT* xn; const DataT* yn; - DataT* sxNorm; - DataT* syNorm; + const DataT *const yBase; OutT* dOutput; char* smem; CoreLambda core_op; EpilogueLambda epilog_op; FinalLambda fin_op; - AccT acc[P::AccRowsPerTh][P::AccColsPerTh]; public: @@ -82,10 +82,9 @@ struct PairwiseDistances : public BaseClass { char* _smem, CoreLambda _core_op, EpilogueLambda _epilog_op, FinalLambda _fin_op) : BaseClass(_x, _y, _m, _n, _k, _lda, _ldb, _ldd, _smem), - sxNorm((DataT*)_smem), - syNorm(&(sxNorm[P::Mblk])), xn(_xn), yn(_yn), + yBase(_y), dOutput(_dOutput), smem(_smem), core_op(_core_op), @@ -93,13 +92,54 @@ struct PairwiseDistances : public BaseClass { fin_op(_fin_op) {} DI void run() { - prolog(); - loop(); - epilog(); + + for(auto gridStrideY = blockIdx.y * P::Mblk; gridStrideY < this->m; + gridStrideY += P::Mblk * gridDim.y) { + for (auto gridStrideX = blockIdx.x * P::Nblk; gridStrideX < this->n; + gridStrideX += P::Nblk * gridDim.x) { + prolog(gridStrideX, gridStrideY); + loop(); + epilog(gridStrideX, gridStrideY); + } + } } private: - DI void prolog() { + DI void updateIndicesY() { + const auto stride = P::Nblk * gridDim.x; + if (isRowMajor) { + this->y += stride * this->ldb; + } else { + this->y += stride; + } + this->yrowid += stride; + this->pageWr = 0; + this->pageRd = 0; + } + + DI void updateIndicesXY() { + const auto stride = P::Mblk * gridDim.y; + if (isRowMajor) { + this->x += stride * this->lda; + this->yrowid = IdxT(blockIdx.x) * P::Nblk + this->srowid; + this->y = yBase + this->yrowid * this->ldb; + } else { + this->x += stride; + this->yrowid = IdxT(blockIdx.x) * P::Nblk; + this->y = yBase + this->yrowid + this->srowid * this->ldb; + } + this->xrowid += stride; + this->pageWr = 0; + this->pageRd = 0; + } + + DI void prolog(IdxT gridStrideX, IdxT gridStrideY) { + if (gridStrideX > blockIdx.x * P::Nblk) { + updateIndicesY(); + } else if (gridStrideY > blockIdx.y * P::Mblk) { + updateIndicesXY(); + } + this->ldgXY(0); #pragma unroll for (int i = 0; i < P::AccRowsPerTh; ++i) { @@ -142,18 +182,21 @@ struct PairwiseDistances : public BaseClass { } } - DI void epilog() { + DI void epilog(IdxT gridStrideX, IdxT gridStrideY) { if (useNorms) { __syncthreads(); // so that we can safely reuse smem + DataT* sxNorm = (DataT*)smem; + DataT* syNorm = (&sxNorm[P::Mblk]); + // Load x & y norms required by this threadblock in shmem buffer for (int i = threadIdx.x; i < P::Mblk; i += P::Nthreads) { - auto idx = blockIdx.y * P::Mblk + i; + auto idx = gridStrideY + i; sxNorm[i] = idx < this->m ? xn[idx] : 0; } for (int i = threadIdx.x; i < P::Nblk; i += P::Nthreads) { - auto idx = blockIdx.x * P::Nblk + i; + auto idx = gridStrideX + i; syNorm[i] = idx < this->n ? yn[idx] : 0; } @@ -175,8 +218,9 @@ struct PairwiseDistances : public BaseClass { } if (writeOut) { - IdxT starty = blockIdx.y * P::Mblk + this->accrowid; - IdxT startx = blockIdx.x * P::Nblk + this->acccolid; + IdxT starty = gridStrideY + this->accrowid; + IdxT startx = gridStrideX + this->acccolid; + #pragma unroll for (int i = 0; i < P::AccRowsPerTh; ++i) { auto rowId = starty + i * P::AccThRows; @@ -243,5 +287,20 @@ __global__ __launch_bounds__( obj.run(); } +template +dim3 launchConfigGenerator(IdxT m, IdxT n) { + const auto numSMs = raft::getMultiProcessorCount(); + // multiply by 2 as per launch bounds for pairwise dist kernels. + int minGridSize = numSMs * 2; + dim3 grid; + int yChunks = raft::ceildiv(m, P::Mblk); + int xChunks = raft::ceildiv(n, P::Nblk); + grid.y = yChunks > minGridSize ? minGridSize : yChunks; + grid.x = (minGridSize - grid.y) <= 0 ? 1 : xChunks; + grid.x = grid.x > (minGridSize + 1 - grid.y) ? (minGridSize + 1 - grid.y) : grid.x; + return grid; +} + + }; // namespace distance }; // namespace raft \ No newline at end of file From af890856742afc3a2172328a43adc2bb1319df72 Mon Sep 17 00:00:00 2001 From: Mahesh Doijade Date: Wed, 19 May 2021 18:04:05 +0530 Subject: [PATCH 03/17] --Add grid stride based fusedL2NN kernel, this gives approx 1.67x speed up over previous version. -- improve logic of the grid launch config generator for x-dir blocks --- cpp/include/raft/distance/cosine.cuh | 3 +- cpp/include/raft/distance/euclidean.cuh | 7 +- cpp/include/raft/distance/fused_l2_nn.cuh | 191 +++++++++++------- cpp/include/raft/distance/l1.cuh | 3 +- .../raft/distance/pairwise_distance_base.cuh | 38 +++- 5 files changed, 158 insertions(+), 84 deletions(-) diff --git a/cpp/include/raft/distance/cosine.cuh b/cpp/include/raft/distance/cosine.cuh index a214e5cddf..6f776591b6 100644 --- a/cpp/include/raft/distance/cosine.cuh +++ b/cpp/include/raft/distance/cosine.cuh @@ -72,7 +72,8 @@ void cosineImpl(const DataT *x, const DataT *y, const DataT *xn, // epilogue operation lambda for final value calculation auto epilog_lambda = [] __device__( AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, DataT * regyn) { + DataT * regxn, DataT * regyn, + IdxT gridStrideX, IdxT gridStrideY) { #pragma unroll for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { #pragma unroll diff --git a/cpp/include/raft/distance/euclidean.cuh b/cpp/include/raft/distance/euclidean.cuh index 69c1b1015c..3f2aabb0fd 100644 --- a/cpp/include/raft/distance/euclidean.cuh +++ b/cpp/include/raft/distance/euclidean.cuh @@ -71,7 +71,8 @@ void euclideanExpImpl(const DataT *x, const DataT *y, const DataT *xn, // epilogue operation lambda for final value calculation auto epilog_lambda = [sqrt] __device__( AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, DataT * regyn) { + DataT * regxn, DataT * regyn, + IdxT gridStrideX, IdxT gridStrideY) { #pragma unroll for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { #pragma unroll @@ -241,7 +242,8 @@ void euclideanUnExpImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k, // epilogue operation lambda for final value calculation auto epilog_lambda = [sqrt] __device__( AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, DataT * regyn) { + DataT * regxn, DataT * regyn, + IdxT gridStrideX, IdxT gridStrideY) { if (sqrt) { #pragma unroll for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { @@ -254,7 +256,6 @@ void euclideanUnExpImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k, }; if (isRowMajor) { - pairwiseDistanceMatKernel diff --git a/cpp/include/raft/distance/fused_l2_nn.cuh b/cpp/include/raft/distance/fused_l2_nn.cuh index 2dd32d93bc..2d91268d52 100644 --- a/cpp/include/raft/distance/fused_l2_nn.cuh +++ b/cpp/include/raft/distance/fused_l2_nn.cuh @@ -77,34 +77,64 @@ __global__ void initKernel(OutT* min, IdxT m, DataT maxVal, ReduceOpT redOp) { } } -template -void fusedL2NNImpl(OutT* min, const DataT* x, const DataT* y, const DataT* xn, - const DataT* yn, IdxT m, IdxT n, IdxT k, int* workspace, - ReduceOpT redOp, KVPReduceOpT pairRedOp, bool sqrt, - bool initOutBuffer, cudaStream_t stream) { - typedef typename linalg::Policy4x4::Policy P; - dim3 grid(raft::ceildiv(n, P::Nblk), - raft::ceildiv(m, P::Mblk)); - dim3 blk(P::Nthreads); - auto nblks = raft::ceildiv(m, P::Nthreads); - constexpr auto maxVal = std::numeric_limits::max(); - typedef cub::KeyValuePair KVPair; +// TODO: specialize this function for MinAndDistanceReduceOp +// with atomicCAS of 64 bit which will eliminate mutex and shfls +template +DI void updateReducedVal(int *mutex, OutT *min, KVPair *val, + ReduceOpT red_op, IdxT m, IdxT gridStrideY) { + const auto lid = threadIdx.x % raft::WarpSize; + const auto accrowid = threadIdx.x / P::AccThCols; - // Accumulation operation lambda - auto core_lambda = [] __device__(DataT &acc, DataT & x, DataT & y) { - acc += x * y; - }; +#pragma unroll + for (int j = 0; j < (raft::WarpSize/P::AccThCols); j++) { + if (lid == 0) { +#pragma unroll + for (int i = 0; i < P::AccRowsPerTh; ++i) { + auto rid = gridStrideY + accrowid + j + i * P::AccThRows ; + if (rid < m) { + auto value = val[i]; + while (atomicCAS(mutex + rid, 0, 1) == 1) + ; + __threadfence(); + red_op(rid, min + rid, value); + __threadfence(); + atomicCAS(mutex + rid, 1, 0); + } + } + } + if (j < (raft::WarpSize/P::AccThCols) - 1) { +#pragma unroll + for (int i = 0; i < P::AccRowsPerTh; ++i) { + auto tmpkey = raft::shfl(val[i].key, (j + 1) * P::AccThCols); + auto tmpvalue = raft::shfl(val[i].value, (j + 1) * P::AccThCols); + val[i] = {tmpkey, tmpvalue}; + } + } + } +} + +template +__global__ __launch_bounds__(P::Nthreads, 2) void fusedL2NNkernel( + OutT* min, const DataT* x, const DataT* y, const DataT* xn, const DataT* yn, + IdxT m, IdxT n, IdxT k, DataT maxVal, int* mutex, ReduceOpT redOp, + KVPReduceOpT pairRedOp, CoreLambda core_op, FinalLambda fin_op) { + extern __shared__ char smem[]; + + typedef cub::KeyValuePair KVPair; + KVPair val[P::AccRowsPerTh]; +#pragma unroll + for (int i = 0; i < P::AccRowsPerTh; ++i) { + val[i] = {-1, maxVal}; + } - int *mutex = workspace; // epilogue operation lambda for final value calculation - auto epilog_lambda = [sqrt, min, mutex, m, n, redOp, pairRedOp] __device__( + auto epilog_lambda = [n, pairRedOp, &val, maxVal] __device__( DataT acc[P::AccRowsPerTh][P::AccColsPerTh], - DataT * regxn, DataT * regyn) { - extern __shared__ char smem[]; - KVPair *sRed = (KVPair*)smem; - - ReduceOpT red_op(redOp); + DataT * regxn, DataT * regyn, + IdxT gridStrideX, IdxT gridStrideY) { KVPReduceOpT pairRed_op(pairRedOp); #pragma unroll @@ -114,7 +144,7 @@ void fusedL2NNImpl(OutT* min, const DataT* x, const DataT* y, const DataT* xn, acc[i][j] = regxn[i] + regyn[j] - (DataT)2.0 * acc[i][j]; } } - if (sqrt) { + if (Sqrt) { #pragma unroll for (int i = 0; i < P::AccRowsPerTh; ++i) { #pragma unroll @@ -124,61 +154,81 @@ void fusedL2NNImpl(OutT* min, const DataT* x, const DataT* y, const DataT* xn, } } - // reduce - KVPair val[P::AccRowsPerTh]; - auto lid = raft::laneId(); + // intra thread reduce const auto acccolid = threadIdx.x % P::AccThCols; const auto accrowid = threadIdx.x / P::AccThCols; #pragma unroll for (int i = 0; i < P::AccRowsPerTh; ++i) { - val[i] = {-1, maxVal}; #pragma unroll for (int j = 0; j < P::AccColsPerTh; ++j) { - auto tmpkey = acccolid + j * P::AccThCols + blockIdx.x * P::Nblk; + auto tmpkey = acccolid + j * P::AccThCols + gridStrideX; KVPair tmp = {tmpkey, acc[i][j]}; - if (tmpkey < n) - val[i] = - pairRed_op(accrowid + i * P::AccThRows + blockIdx.y * P::Mblk, - tmp, val[i]); + if (tmpkey < n){ + val[i] = pairRed_op(accrowid + i * P::AccThRows + gridStrideY, tmp, val[i]); + } } + } + }; + + auto rowEpilog_lambda = [m, mutex, min, pairRedOp, redOp, &val, maxVal] + __device__(IdxT gridStrideY) { + KVPReduceOpT pairRed_op(pairRedOp); + ReduceOpT red_op(redOp); + + const auto accrowid = threadIdx.x / P::AccThCols; + const auto lid = raft::laneId(); + + // reduce +#pragma unroll + for (int i = 0; i < P::AccRowsPerTh; ++i) { #pragma unroll for (int j = P::AccThCols / 2; j > 0; j >>= 1) { auto tmpkey = raft::shfl(val[i].key, lid + j); auto tmpvalue = raft::shfl(val[i].value, lid + j); KVPair tmp = {tmpkey, tmpvalue}; - val[i] = - pairRed_op(accrowid + i * P::AccThRows + blockIdx.y * P::Mblk, - tmp, val[i]); + val[i] = pairRed_op(accrowid + i * P::AccThRows + gridStrideY, tmp, val[i]); } } - __syncthreads(); - if (lid % P::AccThCols == 0) { + + updateReducedVal(mutex, min, val, + red_op, m, gridStrideY); + + // reset the val array. #pragma unroll - for (int i = 0; i < P::AccRowsPerTh; ++i) { - sRed[i * P::AccThCols + accrowid] = val[i]; - } - } - __syncthreads(); - // for now have first lane from each warp update a unique output row. This - // will resolve hang issues with pre-Volta architectures - auto nWarps = blockDim.x / raft::WarpSize; - auto ridx = IdxT(blockIdx.y) * P::Mblk; - if (lid == 0) { - for (int i = threadIdx.x / raft::WarpSize; i < P::Mblk; i += nWarps) { - auto rid = ridx + i; - if (rid < m) { - auto val = sRed[i]; - while (atomicCAS(mutex + rid, 0, 1) == 1) - ; - __threadfence(); - red_op(rid, min + rid, val); - __threadfence(); - atomicCAS(mutex + rid, 1, 0); - } - } + for (int i = 0; i < P::AccRowsPerTh; ++i) { + val[i] = {-1, maxVal}; } }; + IdxT lda = k, ldb = k, ldd = n; + PairwiseDistances + obj(x, y, m, n, k, lda, ldb, ldd, xn, yn, nullptr, smem, core_op, + epilog_lambda, fin_op, rowEpilog_lambda); + obj.run(); +} + +template +void fusedL2NNImpl(OutT* min, const DataT* x, const DataT* y, const DataT* xn, + const DataT* yn, IdxT m, IdxT n, IdxT k, int* workspace, + ReduceOpT redOp, KVPReduceOpT pairRedOp, bool sqrt, + bool initOutBuffer, cudaStream_t stream) { + typedef typename linalg::Policy4x4::Policy P; + + dim3 grid = launchConfigGenerator(m, n); + dim3 blk(P::Nthreads); + auto nblks = raft::ceildiv(m, P::Nthreads); + constexpr auto maxVal = std::numeric_limits::max(); + typedef cub::KeyValuePair KVPair; + + // Accumulation operation lambda + auto core_lambda = [] __device__(DataT &acc, DataT & x, DataT & y) { + acc += x * y; + }; + CUDA_CHECK(cudaMemsetAsync(workspace, 0, sizeof(int) * m, stream)); if (initOutBuffer) { initKernel @@ -186,18 +236,21 @@ void fusedL2NNImpl(OutT* min, const DataT* x, const DataT* y, const DataT* xn, CUDA_CHECK(cudaGetLastError()); } - IdxT lda = k, ldb = k, ldd = n; - auto fin_op = [] __device__(DataT d_val, int g_d_idx) { return d_val; }; - pairwiseDistanceMatKernel - <<>>(x, y, xn, yn, m, n, k, lda, - ldb, ldd, nullptr, core_lambda, - epilog_lambda, fin_op); + if (sqrt) { + fusedL2NNkernel<<>>( + min, x, y, xn, yn, m, n, k, maxVal, workspace, redOp, pairRedOp, + core_lambda, fin_op); + } else { + fusedL2NNkernel<<>>( + min, x, y, xn, yn, m, n, k, maxVal, workspace, redOp, pairRedOp, + core_lambda, fin_op); + } CUDA_CHECK(cudaGetLastError()); } diff --git a/cpp/include/raft/distance/l1.cuh b/cpp/include/raft/distance/l1.cuh index b6439da5a9..e25eba2345 100644 --- a/cpp/include/raft/distance/l1.cuh +++ b/cpp/include/raft/distance/l1.cuh @@ -65,7 +65,8 @@ static void l1Impl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k, // epilogue operation lambda for final value calculation auto epilog_lambda = [] __device__( AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, DataT * regyn) { return; }; + DataT * regxn, DataT * regyn, + IdxT gridStrideX, IdxT gridStrideY) { return; }; if (isRowMajor) { pairwiseDistanceMatKernel> @@ -72,6 +74,8 @@ struct PairwiseDistances : public BaseClass { CoreLambda core_op; EpilogueLambda epilog_op; FinalLambda fin_op; + rowEpilogueLambda rowEpilog_op; + AccT acc[P::AccRowsPerTh][P::AccColsPerTh]; public: @@ -80,7 +84,8 @@ struct PairwiseDistances : public BaseClass { IdxT _k, IdxT _lda, IdxT _ldb, IdxT _ldd, const DataT* _xn, const DataT* _yn, OutT* _dOutput, char* _smem, CoreLambda _core_op, - EpilogueLambda _epilog_op, FinalLambda _fin_op) + EpilogueLambda _epilog_op, FinalLambda _fin_op, + rowEpilogueLambda _rowEpilog_op) : BaseClass(_x, _y, _m, _n, _k, _lda, _ldb, _ldd, _smem), xn(_xn), yn(_yn), @@ -89,7 +94,8 @@ struct PairwiseDistances : public BaseClass { smem(_smem), core_op(_core_op), epilog_op(_epilog_op), - fin_op(_fin_op) {} + fin_op(_fin_op), + rowEpilog_op(_rowEpilog_op) {} DI void run() { @@ -101,6 +107,7 @@ struct PairwiseDistances : public BaseClass { loop(); epilog(gridStrideX, gridStrideY); } + rowEpilog_op(gridStrideY); } } @@ -212,9 +219,9 @@ struct PairwiseDistances : public BaseClass { regyn[i] = syNorm[i * P::AccThCols + (threadIdx.x % P::AccThCols)]; } - epilog_op(acc, regxn, regyn); + epilog_op(acc, regxn, regyn, gridStrideX, gridStrideY); } else { - epilog_op(acc, nullptr, nullptr); + epilog_op(acc, nullptr, nullptr, gridStrideX, gridStrideY); } if (writeOut) { @@ -266,10 +273,12 @@ struct PairwiseDistances : public BaseClass { * @param epilog_op the epilogue lambda * @param fin_op the final gemm epilogue lambda */ + template + typename EpilogueLambda, + typename FinalLambda, + bool isRowMajor = true, bool writeOut = true> __global__ __launch_bounds__( Policy::Nthreads, 2) void pairwiseDistanceMatKernel(const DataT* x, const DataT* y, @@ -279,11 +288,13 @@ __global__ __launch_bounds__( EpilogueLambda epilog_op, FinalLambda fin_op) { extern __shared__ char smem[]; + auto rowEpilog = [] __device__ (IdxT starty) { return; }; PairwiseDistances + EpilogueLambda, FinalLambda, decltype(rowEpilog), + isRowMajor, writeOut> obj(x, y, m, n, k, lda, ldb, ldd, _xn, _yn, dOutput, smem, core_op, - epilog_op, fin_op); + epilog_op, fin_op, rowEpilog); obj.run(); } @@ -297,7 +308,14 @@ dim3 launchConfigGenerator(IdxT m, IdxT n) { int xChunks = raft::ceildiv(n, P::Nblk); grid.y = yChunks > minGridSize ? minGridSize : yChunks; grid.x = (minGridSize - grid.y) <= 0 ? 1 : xChunks; - grid.x = grid.x > (minGridSize + 1 - grid.y) ? (minGridSize + 1 - grid.y) : grid.x; + if (grid.x != 1) { + int i = 1; + while(grid.y * i < minGridSize) { + i++; + } + grid.x = i >= xChunks ? xChunks : i; + } + return grid; } From 9c71c4ac753bb4852fb1f21500aba9bfac931404 Mon Sep 17 00:00:00 2001 From: Mahesh Doijade Date: Wed, 19 May 2021 19:29:37 +0530 Subject: [PATCH 04/17] Add note on reason to use thread 0 from each warp to write final reduced val for pre-volta arch --- cpp/include/raft/distance/fused_l2_nn.cuh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/include/raft/distance/fused_l2_nn.cuh b/cpp/include/raft/distance/fused_l2_nn.cuh index 2d91268d52..8a3ac19a0e 100644 --- a/cpp/include/raft/distance/fused_l2_nn.cuh +++ b/cpp/include/raft/distance/fused_l2_nn.cuh @@ -86,6 +86,8 @@ DI void updateReducedVal(int *mutex, OutT *min, KVPair *val, const auto lid = threadIdx.x % raft::WarpSize; const auto accrowid = threadIdx.x / P::AccThCols; + // for now have first lane from each warp update a unique output row. This + // will resolve hang issues with pre-Volta architectures #pragma unroll for (int j = 0; j < (raft::WarpSize/P::AccThCols); j++) { if (lid == 0) { From 4d76b57676c2de8098065d9e98b1ff347e67c7dc Mon Sep 17 00:00:00 2001 From: Mahesh Doijade Date: Wed, 19 May 2021 20:41:15 +0530 Subject: [PATCH 05/17] fix clangformat and copyright year --- cpp/include/raft/distance/cosine.cuh | 4 +- cpp/include/raft/distance/euclidean.cuh | 8 +-- cpp/include/raft/distance/fused_l2_nn.cuh | 67 ++++++++++--------- cpp/include/raft/distance/l1.cuh | 4 +- .../raft/distance/pairwise_distance_base.cuh | 25 +++---- 5 files changed, 53 insertions(+), 55 deletions(-) diff --git a/cpp/include/raft/distance/cosine.cuh b/cpp/include/raft/distance/cosine.cuh index 6f776591b6..cc5bffd1f4 100644 --- a/cpp/include/raft/distance/cosine.cuh +++ b/cpp/include/raft/distance/cosine.cuh @@ -72,8 +72,8 @@ void cosineImpl(const DataT *x, const DataT *y, const DataT *xn, // epilogue operation lambda for final value calculation auto epilog_lambda = [] __device__( AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, DataT * regyn, - IdxT gridStrideX, IdxT gridStrideY) { + DataT * regxn, DataT * regyn, IdxT gridStrideX, + IdxT gridStrideY) { #pragma unroll for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { #pragma unroll diff --git a/cpp/include/raft/distance/euclidean.cuh b/cpp/include/raft/distance/euclidean.cuh index 3f2aabb0fd..bdf2d262d5 100644 --- a/cpp/include/raft/distance/euclidean.cuh +++ b/cpp/include/raft/distance/euclidean.cuh @@ -71,8 +71,8 @@ void euclideanExpImpl(const DataT *x, const DataT *y, const DataT *xn, // epilogue operation lambda for final value calculation auto epilog_lambda = [sqrt] __device__( AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, DataT * regyn, - IdxT gridStrideX, IdxT gridStrideY) { + DataT * regxn, DataT * regyn, IdxT gridStrideX, + IdxT gridStrideY) { #pragma unroll for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { #pragma unroll @@ -242,8 +242,8 @@ void euclideanUnExpImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k, // epilogue operation lambda for final value calculation auto epilog_lambda = [sqrt] __device__( AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, DataT * regyn, - IdxT gridStrideX, IdxT gridStrideY) { + DataT * regxn, DataT * regyn, IdxT gridStrideX, + IdxT gridStrideY) { if (sqrt) { #pragma unroll for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { diff --git a/cpp/include/raft/distance/fused_l2_nn.cuh b/cpp/include/raft/distance/fused_l2_nn.cuh index 8a3ac19a0e..da40f896e0 100644 --- a/cpp/include/raft/distance/fused_l2_nn.cuh +++ b/cpp/include/raft/distance/fused_l2_nn.cuh @@ -20,8 +20,8 @@ #include #include #include -#include #include +#include namespace raft { namespace distance { @@ -80,20 +80,20 @@ __global__ void initKernel(OutT* min, IdxT m, DataT maxVal, ReduceOpT redOp) { // TODO: specialize this function for MinAndDistanceReduceOp // with atomicCAS of 64 bit which will eliminate mutex and shfls template -DI void updateReducedVal(int *mutex, OutT *min, KVPair *val, - ReduceOpT red_op, IdxT m, IdxT gridStrideY) { + typename ReduceOpT> +DI void updateReducedVal(int* mutex, OutT* min, KVPair* val, ReduceOpT red_op, + IdxT m, IdxT gridStrideY) { const auto lid = threadIdx.x % raft::WarpSize; const auto accrowid = threadIdx.x / P::AccThCols; // for now have first lane from each warp update a unique output row. This // will resolve hang issues with pre-Volta architectures #pragma unroll - for (int j = 0; j < (raft::WarpSize/P::AccThCols); j++) { + for (int j = 0; j < (raft::WarpSize / P::AccThCols); j++) { if (lid == 0) { #pragma unroll for (int i = 0; i < P::AccRowsPerTh; ++i) { - auto rid = gridStrideY + accrowid + j + i * P::AccThRows ; + auto rid = gridStrideY + accrowid + j + i * P::AccThRows; if (rid < m) { auto value = val[i]; while (atomicCAS(mutex + rid, 0, 1) == 1) @@ -105,7 +105,7 @@ DI void updateReducedVal(int *mutex, OutT *min, KVPair *val, } } } - if (j < (raft::WarpSize/P::AccThCols) - 1) { + if (j < (raft::WarpSize / P::AccThCols) - 1) { #pragma unroll for (int i = 0; i < P::AccRowsPerTh; ++i) { auto tmpkey = raft::shfl(val[i].key, (j + 1) * P::AccThCols); @@ -116,9 +116,9 @@ DI void updateReducedVal(int *mutex, OutT *min, KVPair *val, } } -template +template __global__ __launch_bounds__(P::Nthreads, 2) void fusedL2NNkernel( OutT* min, const DataT* x, const DataT* y, const DataT* xn, const DataT* yn, IdxT m, IdxT n, IdxT k, DataT maxVal, int* mutex, ReduceOpT redOp, @@ -135,8 +135,8 @@ __global__ __launch_bounds__(P::Nthreads, 2) void fusedL2NNkernel( // epilogue operation lambda for final value calculation auto epilog_lambda = [n, pairRedOp, &val, maxVal] __device__( DataT acc[P::AccRowsPerTh][P::AccColsPerTh], - DataT * regxn, DataT * regyn, - IdxT gridStrideX, IdxT gridStrideY) { + DataT * regxn, DataT * regyn, IdxT gridStrideX, + IdxT gridStrideY) { KVPReduceOpT pairRed_op(pairRedOp); #pragma unroll @@ -163,17 +163,18 @@ __global__ __launch_bounds__(P::Nthreads, 2) void fusedL2NNkernel( for (int i = 0; i < P::AccRowsPerTh; ++i) { #pragma unroll for (int j = 0; j < P::AccColsPerTh; ++j) { - auto tmpkey = acccolid + j * P::AccThCols + gridStrideX; + auto tmpkey = acccolid + j * P::AccThCols + gridStrideX; KVPair tmp = {tmpkey, acc[i][j]}; - if (tmpkey < n){ - val[i] = pairRed_op(accrowid + i * P::AccThRows + gridStrideY, tmp, val[i]); + if (tmpkey < n) { + val[i] = + pairRed_op(accrowid + i * P::AccThRows + gridStrideY, tmp, val[i]); } } } }; - auto rowEpilog_lambda = [m, mutex, min, pairRedOp, redOp, &val, maxVal] - __device__(IdxT gridStrideY) { + auto rowEpilog_lambda = [m, mutex, min, pairRedOp, redOp, &val, + maxVal] __device__(IdxT gridStrideY) { KVPReduceOpT pairRed_op(pairRedOp); ReduceOpT red_op(redOp); @@ -188,12 +189,13 @@ __global__ __launch_bounds__(P::Nthreads, 2) void fusedL2NNkernel( auto tmpkey = raft::shfl(val[i].key, lid + j); auto tmpvalue = raft::shfl(val[i].value, lid + j); KVPair tmp = {tmpkey, tmpvalue}; - val[i] = pairRed_op(accrowid + i * P::AccThRows + gridStrideY, tmp, val[i]); + val[i] = + pairRed_op(accrowid + i * P::AccThRows + gridStrideY, tmp, val[i]); } } - updateReducedVal(mutex, min, val, - red_op, m, gridStrideY); + updateReducedVal(mutex, min, val, red_op, + m, gridStrideY); // reset the val array. #pragma unroll @@ -204,9 +206,8 @@ __global__ __launch_bounds__(P::Nthreads, 2) void fusedL2NNkernel( IdxT lda = k, ldb = k, ldd = n; PairwiseDistances + decltype(epilog_lambda), FinalLambda, + decltype(rowEpilog_lambda), true, false> obj(x, y, m, n, k, lda, ldb, ldd, xn, yn, nullptr, smem, core_op, epilog_lambda, fin_op, rowEpilog_lambda); obj.run(); @@ -227,7 +228,7 @@ void fusedL2NNImpl(OutT* min, const DataT* x, const DataT* y, const DataT* xn, typedef cub::KeyValuePair KVPair; // Accumulation operation lambda - auto core_lambda = [] __device__(DataT &acc, DataT & x, DataT & y) { + auto core_lambda = [] __device__(DataT & acc, DataT & x, DataT & y) { acc += x * y; }; @@ -238,20 +239,20 @@ void fusedL2NNImpl(OutT* min, const DataT* x, const DataT* y, const DataT* xn, CUDA_CHECK(cudaGetLastError()); } - auto fin_op = [] __device__(DataT d_val, int g_d_idx) { - return d_val; - }; + auto fin_op = [] __device__(DataT d_val, int g_d_idx) { return d_val; }; if (sqrt) { fusedL2NNkernel<<>>( - min, x, y, xn, yn, m, n, k, maxVal, workspace, redOp, pairRedOp, - core_lambda, fin_op); + decltype(core_lambda), decltype(fin_op)> + <<>>(min, x, y, xn, yn, m, n, k, maxVal, + workspace, redOp, pairRedOp, + core_lambda, fin_op); } else { fusedL2NNkernel<<>>( - min, x, y, xn, yn, m, n, k, maxVal, workspace, redOp, pairRedOp, - core_lambda, fin_op); + decltype(core_lambda), decltype(fin_op)> + <<>>(min, x, y, xn, yn, m, n, k, maxVal, + workspace, redOp, pairRedOp, + core_lambda, fin_op); } CUDA_CHECK(cudaGetLastError()); diff --git a/cpp/include/raft/distance/l1.cuh b/cpp/include/raft/distance/l1.cuh index e25eba2345..734a17606d 100644 --- a/cpp/include/raft/distance/l1.cuh +++ b/cpp/include/raft/distance/l1.cuh @@ -65,8 +65,8 @@ static void l1Impl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k, // epilogue operation lambda for final value calculation auto epilog_lambda = [] __device__( AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, DataT * regyn, - IdxT gridStrideX, IdxT gridStrideY) { return; }; + DataT * regxn, DataT * regyn, IdxT gridStrideX, + IdxT gridStrideY) { return; }; if (isRowMajor) { pairwiseDistanceMatKernel -#include #include #include +#include +#include namespace raft { namespace distance { @@ -68,7 +68,7 @@ struct PairwiseDistances : public BaseClass { typedef Policy P; const DataT* xn; const DataT* yn; - const DataT *const yBase; + const DataT* const yBase; OutT* dOutput; char* smem; CoreLambda core_op; @@ -98,11 +98,10 @@ struct PairwiseDistances : public BaseClass { rowEpilog_op(_rowEpilog_op) {} DI void run() { - - for(auto gridStrideY = blockIdx.y * P::Mblk; gridStrideY < this->m; - gridStrideY += P::Mblk * gridDim.y) { + for (auto gridStrideY = blockIdx.y * P::Mblk; gridStrideY < this->m; + gridStrideY += P::Mblk * gridDim.y) { for (auto gridStrideX = blockIdx.x * P::Nblk; gridStrideX < this->n; - gridStrideX += P::Nblk * gridDim.x) { + gridStrideX += P::Nblk * gridDim.x) { prolog(gridStrideX, gridStrideY); loop(); epilog(gridStrideX, gridStrideY); @@ -276,9 +275,8 @@ struct PairwiseDistances : public BaseClass { template + typename EpilogueLambda, typename FinalLambda, bool isRowMajor = true, + bool writeOut = true> __global__ __launch_bounds__( Policy::Nthreads, 2) void pairwiseDistanceMatKernel(const DataT* x, const DataT* y, @@ -288,7 +286,7 @@ __global__ __launch_bounds__( EpilogueLambda epilog_op, FinalLambda fin_op) { extern __shared__ char smem[]; - auto rowEpilog = [] __device__ (IdxT starty) { return; }; + auto rowEpilog = [] __device__(IdxT starty) { return; }; PairwiseDistances +template dim3 launchConfigGenerator(IdxT m, IdxT n) { const auto numSMs = raft::getMultiProcessorCount(); // multiply by 2 as per launch bounds for pairwise dist kernels. @@ -310,7 +308,7 @@ dim3 launchConfigGenerator(IdxT m, IdxT n) { grid.x = (minGridSize - grid.y) <= 0 ? 1 : xChunks; if (grid.x != 1) { int i = 1; - while(grid.y * i < minGridSize) { + while (grid.y * i < minGridSize) { i++; } grid.x = i >= xChunks ? xChunks : i; @@ -319,6 +317,5 @@ dim3 launchConfigGenerator(IdxT m, IdxT n) { return grid; } - }; // namespace distance }; // namespace raft \ No newline at end of file From 4ada29e3144f9bab9f67710ab0b23c2fc1c62f21 Mon Sep 17 00:00:00 2001 From: Mahesh Doijade Date: Thu, 20 May 2021 17:07:48 +0530 Subject: [PATCH 06/17] --Add additional Mblk + Nblk shmem for storing norms, and reuse xNorm for subsequent gridStrideX variations. this overall improves perf of fusedL2NN to 1.85x over previous version. --Also remove checking keys only check values in fusedL2nn test case, as it may happen a row has multiple keys with same min val --- cpp/include/raft/distance/cosine.cuh | 14 ++++++++------ cpp/include/raft/distance/euclidean.cuh | 14 ++++++++------ cpp/include/raft/distance/fused_l2_nn.cuh | 13 +++++++------ .../raft/distance/pairwise_distance_base.cuh | 12 ++++++------ cpp/test/distance/fused_l2_nn.cu | 2 -- 5 files changed, 29 insertions(+), 26 deletions(-) diff --git a/cpp/include/raft/distance/cosine.cuh b/cpp/include/raft/distance/cosine.cuh index cc5bffd1f4..94d8fa07b5 100644 --- a/cpp/include/raft/distance/cosine.cuh +++ b/cpp/include/raft/distance/cosine.cuh @@ -83,20 +83,22 @@ void cosineImpl(const DataT *x, const DataT *y, const DataT *xn, } }; + size_t shmemSize = + KPolicy::SmemSize + ((KPolicy::Mblk + KPolicy::Nblk) * sizeof(DataT)); if (isRowMajor) { pairwiseDistanceMatKernel - <<>>(x, y, xn, yn, m, n, k, lda, - ldb, ldd, dOutput, core_lambda, - epilog_lambda, fin_op); + <<>>(x, y, xn, yn, m, n, k, lda, ldb, ldd, + dOutput, core_lambda, epilog_lambda, + fin_op); } else { pairwiseDistanceMatKernel - <<>>(x, y, xn, yn, m, n, k, lda, - ldb, ldd, dOutput, core_lambda, - epilog_lambda, fin_op); + <<>>(x, y, xn, yn, m, n, k, lda, ldb, ldd, + dOutput, core_lambda, epilog_lambda, + fin_op); } CUDA_CHECK(cudaGetLastError()); diff --git a/cpp/include/raft/distance/euclidean.cuh b/cpp/include/raft/distance/euclidean.cuh index bdf2d262d5..70526a8797 100644 --- a/cpp/include/raft/distance/euclidean.cuh +++ b/cpp/include/raft/distance/euclidean.cuh @@ -91,20 +91,22 @@ void euclideanExpImpl(const DataT *x, const DataT *y, const DataT *xn, } }; + size_t shmemSize = + KPolicy::SmemSize + ((KPolicy::Mblk + KPolicy::Nblk) * sizeof(DataT)); if (isRowMajor) { pairwiseDistanceMatKernel - <<>>(x, y, xn, yn, m, n, k, lda, - ldb, ldd, dOutput, core_lambda, - epilog_lambda, fin_op); + <<>>(x, y, xn, yn, m, n, k, lda, ldb, ldd, + dOutput, core_lambda, epilog_lambda, + fin_op); } else { pairwiseDistanceMatKernel - <<>>(x, y, xn, yn, m, n, k, lda, - ldb, ldd, dOutput, core_lambda, - epilog_lambda, fin_op); + <<>>(x, y, xn, yn, m, n, k, lda, ldb, ldd, + dOutput, core_lambda, epilog_lambda, + fin_op); } CUDA_CHECK(cudaGetLastError()); diff --git a/cpp/include/raft/distance/fused_l2_nn.cuh b/cpp/include/raft/distance/fused_l2_nn.cuh index da40f896e0..d85540497b 100644 --- a/cpp/include/raft/distance/fused_l2_nn.cuh +++ b/cpp/include/raft/distance/fused_l2_nn.cuh @@ -241,18 +241,19 @@ void fusedL2NNImpl(OutT* min, const DataT* x, const DataT* y, const DataT* xn, auto fin_op = [] __device__(DataT d_val, int g_d_idx) { return d_val; }; + size_t shmemSize = P::SmemSize + ((P::Mblk + P::Nblk) * sizeof(DataT)); if (sqrt) { fusedL2NNkernel - <<>>(min, x, y, xn, yn, m, n, k, maxVal, - workspace, redOp, pairRedOp, - core_lambda, fin_op); + <<>>(min, x, y, xn, yn, m, n, k, maxVal, + workspace, redOp, pairRedOp, + core_lambda, fin_op); } else { fusedL2NNkernel - <<>>(min, x, y, xn, yn, m, n, k, maxVal, - workspace, redOp, pairRedOp, - core_lambda, fin_op); + <<>>(min, x, y, xn, yn, m, n, k, maxVal, + workspace, redOp, pairRedOp, + core_lambda, fin_op); } CUDA_CHECK(cudaGetLastError()); diff --git a/cpp/include/raft/distance/pairwise_distance_base.cuh b/cpp/include/raft/distance/pairwise_distance_base.cuh index 605fb9e428..3c6431a999 100644 --- a/cpp/include/raft/distance/pairwise_distance_base.cuh +++ b/cpp/include/raft/distance/pairwise_distance_base.cuh @@ -190,15 +190,15 @@ struct PairwiseDistances : public BaseClass { DI void epilog(IdxT gridStrideX, IdxT gridStrideY) { if (useNorms) { - __syncthreads(); // so that we can safely reuse smem - - DataT* sxNorm = (DataT*)smem; + DataT* sxNorm = (DataT*)(&smem[P::SmemSize]); DataT* syNorm = (&sxNorm[P::Mblk]); // Load x & y norms required by this threadblock in shmem buffer - for (int i = threadIdx.x; i < P::Mblk; i += P::Nthreads) { - auto idx = gridStrideY + i; - sxNorm[i] = idx < this->m ? xn[idx] : 0; + if (gridStrideX == blockIdx.x * P::Nblk) { + for (int i = threadIdx.x; i < P::Mblk; i += P::Nthreads) { + auto idx = gridStrideY + i; + sxNorm[i] = idx < this->m ? xn[idx] : 0; + } } for (int i = threadIdx.x; i < P::Nblk; i += P::Nthreads) { diff --git a/cpp/test/distance/fused_l2_nn.cu b/cpp/test/distance/fused_l2_nn.cu index d4e39a0b5e..4573a070b6 100644 --- a/cpp/test/distance/fused_l2_nn.cu +++ b/cpp/test/distance/fused_l2_nn.cu @@ -164,7 +164,6 @@ struct CompareApproxAbsKVP { typedef typename cub::KeyValuePair KVP; CompareApproxAbsKVP(T eps_) : eps(eps_) {} bool operator()(const KVP &a, const KVP &b) const { - if (a.key != b.key) return false; T diff = raft::abs(raft::abs(a.value) - raft::abs(b.value)); T m = std::max(raft::abs(a.value), raft::abs(b.value)); T ratio = m >= eps ? diff / m : diff; @@ -179,7 +178,6 @@ template struct CompareExactKVP { typedef typename cub::KeyValuePair KVP; bool operator()(const KVP &a, const KVP &b) const { - if (a.key != b.key) return false; if (a.value != b.value) return false; return true; } From 2e804c2c099eb63517e2c506b997301d6f9d5280 Mon Sep 17 00:00:00 2001 From: Mahesh Doijade Date: Mon, 24 May 2021 18:57:05 +0530 Subject: [PATCH 07/17] Use cudaOccupancyMaxActiveBlocksPerSM instead of hard-coded launch bound in launchConfigGenerator. --Use constexpr in shmemSize. --- cpp/include/raft/distance/cosine.cuh | 31 +++++---- cpp/include/raft/distance/euclidean.cuh | 68 +++++++++++-------- cpp/include/raft/distance/fused_l2_nn.cuh | 29 ++++---- cpp/include/raft/distance/l1.cuh | 32 +++++---- .../raft/distance/pairwise_distance_base.cuh | 11 +-- 5 files changed, 101 insertions(+), 70 deletions(-) diff --git a/cpp/include/raft/distance/cosine.cuh b/cpp/include/raft/distance/cosine.cuh index 94d8fa07b5..ed9bd28b7f 100644 --- a/cpp/include/raft/distance/cosine.cuh +++ b/cpp/include/raft/distance/cosine.cuh @@ -61,7 +61,6 @@ void cosineImpl(const DataT *x, const DataT *y, const DataT *xn, typedef typename std::conditional::type KPolicy; - dim3 grid = launchConfigGenerator(m, n); dim3 blk(KPolicy::Nthreads); // Accumulation operation lambda @@ -83,22 +82,26 @@ void cosineImpl(const DataT *x, const DataT *y, const DataT *xn, } }; - size_t shmemSize = + constexpr size_t shmemSize = KPolicy::SmemSize + ((KPolicy::Mblk + KPolicy::Nblk) * sizeof(DataT)); if (isRowMajor) { - pairwiseDistanceMatKernel - <<>>(x, y, xn, yn, m, n, k, lda, ldb, ldd, - dOutput, core_lambda, epilog_lambda, - fin_op); + auto cosineRowMajor = + pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, shmemSize, cosineRowMajor); + cosineRowMajor<<>>( + x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, + fin_op); } else { - pairwiseDistanceMatKernel - <<>>(x, y, xn, yn, m, n, k, lda, ldb, ldd, - dOutput, core_lambda, epilog_lambda, - fin_op); + auto cosineColMajor = + pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, shmemSize, cosineColMajor); + cosineColMajor<<>>( + x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, + fin_op); } CUDA_CHECK(cudaGetLastError()); diff --git a/cpp/include/raft/distance/euclidean.cuh b/cpp/include/raft/distance/euclidean.cuh index 70526a8797..484da0e5bf 100644 --- a/cpp/include/raft/distance/euclidean.cuh +++ b/cpp/include/raft/distance/euclidean.cuh @@ -60,7 +60,6 @@ void euclideanExpImpl(const DataT *x, const DataT *y, const DataT *xn, typedef typename std::conditional::type KPolicy; - dim3 grid = launchConfigGenerator(m, n); dim3 blk(KPolicy::Nthreads); // Accumulation operation lambda @@ -91,22 +90,29 @@ void euclideanExpImpl(const DataT *x, const DataT *y, const DataT *xn, } }; - size_t shmemSize = + constexpr size_t shmemSize = KPolicy::SmemSize + ((KPolicy::Mblk + KPolicy::Nblk) * sizeof(DataT)); if (isRowMajor) { - pairwiseDistanceMatKernel - <<>>(x, y, xn, yn, m, n, k, lda, ldb, ldd, - dOutput, core_lambda, epilog_lambda, - fin_op); + auto euclideanExpRowMajor = + pairwiseDistanceMatKernel; + dim3 grid = + launchConfigGenerator(m, n, shmemSize, euclideanExpRowMajor); + + euclideanExpRowMajor<<>>( + x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, + fin_op); } else { - pairwiseDistanceMatKernel - <<>>(x, y, xn, yn, m, n, k, lda, ldb, ldd, - dOutput, core_lambda, epilog_lambda, - fin_op); + auto euclideanExpColMajor = + pairwiseDistanceMatKernel; + dim3 grid = + launchConfigGenerator(m, n, shmemSize, euclideanExpColMajor); + euclideanExpColMajor<<>>( + x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, + fin_op); } CUDA_CHECK(cudaGetLastError()); @@ -232,7 +238,6 @@ void euclideanUnExpImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k, typedef typename std::conditional::type KPolicy; - dim3 grid = launchConfigGenerator(m, n); dim3 blk(KPolicy::Nthreads); // Accumulation operation lambda @@ -258,19 +263,28 @@ void euclideanUnExpImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k, }; if (isRowMajor) { - pairwiseDistanceMatKernel - <<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, - epilog_lambda, fin_op); + auto euclideanUnExpRowMajor = + pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, + euclideanUnExpRowMajor); + + euclideanUnExpRowMajor<<>>( + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, + epilog_lambda, fin_op); + } else { - pairwiseDistanceMatKernel - <<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, - epilog_lambda, fin_op); + auto euclideanUnExpColMajor = + pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, + euclideanUnExpColMajor); + + euclideanUnExpColMajor<<>>( + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, + epilog_lambda, fin_op); } CUDA_CHECK(cudaGetLastError()); diff --git a/cpp/include/raft/distance/fused_l2_nn.cuh b/cpp/include/raft/distance/fused_l2_nn.cuh index d85540497b..b96a536e38 100644 --- a/cpp/include/raft/distance/fused_l2_nn.cuh +++ b/cpp/include/raft/distance/fused_l2_nn.cuh @@ -221,7 +221,6 @@ void fusedL2NNImpl(OutT* min, const DataT* x, const DataT* y, const DataT* xn, bool initOutBuffer, cudaStream_t stream) { typedef typename linalg::Policy4x4::Policy P; - dim3 grid = launchConfigGenerator(m, n); dim3 blk(P::Nthreads); auto nblks = raft::ceildiv(m, P::Nthreads); constexpr auto maxVal = std::numeric_limits::max(); @@ -241,19 +240,25 @@ void fusedL2NNImpl(OutT* min, const DataT* x, const DataT* y, const DataT* xn, auto fin_op = [] __device__(DataT d_val, int g_d_idx) { return d_val; }; - size_t shmemSize = P::SmemSize + ((P::Mblk + P::Nblk) * sizeof(DataT)); + constexpr size_t shmemSize = + P::SmemSize + ((P::Mblk + P::Nblk) * sizeof(DataT)); if (sqrt) { - fusedL2NNkernel - <<>>(min, x, y, xn, yn, m, n, k, maxVal, - workspace, redOp, pairRedOp, - core_lambda, fin_op); + auto fusedL2NNSqrt = + fusedL2NNkernel; + dim3 grid = launchConfigGenerator

(m, n, shmemSize, fusedL2NNSqrt); + + fusedL2NNSqrt<<>>( + min, x, y, xn, yn, m, n, k, maxVal, workspace, redOp, pairRedOp, + core_lambda, fin_op); } else { - fusedL2NNkernel - <<>>(min, x, y, xn, yn, m, n, k, maxVal, - workspace, redOp, pairRedOp, - core_lambda, fin_op); + auto fusedL2NN = + fusedL2NNkernel; + dim3 grid = launchConfigGenerator

(m, n, shmemSize, fusedL2NN); + fusedL2NN<<>>(min, x, y, xn, yn, m, n, k, + maxVal, workspace, redOp, + pairRedOp, core_lambda, fin_op); } CUDA_CHECK(cudaGetLastError()); diff --git a/cpp/include/raft/distance/l1.cuh b/cpp/include/raft/distance/l1.cuh index 734a17606d..6ab084f041 100644 --- a/cpp/include/raft/distance/l1.cuh +++ b/cpp/include/raft/distance/l1.cuh @@ -53,7 +53,6 @@ static void l1Impl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k, typedef typename std::conditional::type KPolicy; - dim3 grid = launchConfigGenerator(m, n); dim3 blk(KPolicy::Nthreads); // Accumulation operation lambda @@ -69,19 +68,26 @@ static void l1Impl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k, IdxT gridStrideY) { return; }; if (isRowMajor) { - pairwiseDistanceMatKernel - <<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, - epilog_lambda, fin_op); + auto l1RowMajor = + pairwiseDistanceMatKernel; + dim3 grid = + launchConfigGenerator(m, n, KPolicy::SmemSize, l1RowMajor); + + l1RowMajor<<>>( + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, + epilog_lambda, fin_op); } else { - pairwiseDistanceMatKernel - <<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, - epilog_lambda, fin_op); + auto l1ColMajor = + pairwiseDistanceMatKernel; + dim3 grid = + launchConfigGenerator(m, n, KPolicy::SmemSize, l1ColMajor); + l1ColMajor<<>>( + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, + epilog_lambda, fin_op); } CUDA_CHECK(cudaGetLastError()); diff --git a/cpp/include/raft/distance/pairwise_distance_base.cuh b/cpp/include/raft/distance/pairwise_distance_base.cuh index 3c6431a999..aa8c93a0b9 100644 --- a/cpp/include/raft/distance/pairwise_distance_base.cuh +++ b/cpp/include/raft/distance/pairwise_distance_base.cuh @@ -296,12 +296,15 @@ __global__ __launch_bounds__( obj.run(); } -template -dim3 launchConfigGenerator(IdxT m, IdxT n) { +template +dim3 launchConfigGenerator(IdxT m, IdxT n, size_t sMemSize, T func) { const auto numSMs = raft::getMultiProcessorCount(); - // multiply by 2 as per launch bounds for pairwise dist kernels. - int minGridSize = numSMs * 2; + int numBlocksPerSm = 0; dim3 grid; + + CUDA_CHECK(cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &numBlocksPerSm, func, P::Nthreads, sMemSize)); + int minGridSize = numSMs * numBlocksPerSm; int yChunks = raft::ceildiv(m, P::Mblk); int xChunks = raft::ceildiv(n, P::Nblk); grid.y = yChunks > minGridSize ? minGridSize : yChunks; From 69b316df52d337d6bd2480ba7544e298a5a22565 Mon Sep 17 00:00:00 2001 From: Mahesh Doijade Date: Tue, 1 Jun 2021 14:27:23 +0530 Subject: [PATCH 08/17] initialize regx and regy during each prolog call --- cpp/include/raft/distance/pairwise_distance_base.cuh | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/cpp/include/raft/distance/pairwise_distance_base.cuh b/cpp/include/raft/distance/pairwise_distance_base.cuh index aa8c93a0b9..25d499fa88 100644 --- a/cpp/include/raft/distance/pairwise_distance_base.cuh +++ b/cpp/include/raft/distance/pairwise_distance_base.cuh @@ -18,6 +18,7 @@ #include #include #include +#include namespace raft { namespace distance { @@ -147,13 +148,22 @@ struct PairwiseDistances : public BaseClass { } this->ldgXY(0); + typedef TxN_t VecType; + VecType zeros; + zeros.fill(BaseClass::Zero); #pragma unroll for (int i = 0; i < P::AccRowsPerTh; ++i) { + zeros.store(&(this->regx[i][0]), 0); #pragma unroll for (int j = 0; j < P::AccColsPerTh; ++j) { acc[i][j] = BaseClass::Zero; } } +#pragma unroll + for (int j = 0; j < P::AccColsPerTh; ++j) { + zeros.store(&(this->regy[j][0]), 0); + } + this->stsXY(); __syncthreads(); this->pageWr ^= 1; From 9a30a872b63a807faafce2acf0faca123c2ebbbc Mon Sep 17 00:00:00 2001 From: Mahesh Doijade Date: Tue, 1 Jun 2021 21:25:21 +0530 Subject: [PATCH 09/17] initialize ldgX, ldgY in prolog --- .../raft/distance/pairwise_distance_base.cuh | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/cpp/include/raft/distance/pairwise_distance_base.cuh b/cpp/include/raft/distance/pairwise_distance_base.cuh index 25d499fa88..503397bac9 100644 --- a/cpp/include/raft/distance/pairwise_distance_base.cuh +++ b/cpp/include/raft/distance/pairwise_distance_base.cuh @@ -147,13 +147,23 @@ struct PairwiseDistances : public BaseClass { updateIndicesXY(); } - this->ldgXY(0); typedef TxN_t VecType; VecType zeros; zeros.fill(BaseClass::Zero); +#pragma unroll + for (int j = 0; j < P::LdgPerThX; ++j) { + zeros.store(&this->ldgDataX[j][0], 0); + } +#pragma unroll + for (int j = 0; j < P::LdgPerThY; ++j) { + zeros.store(&this->ldgDataY[j][0], 0); + } + + this->ldgXY(0); + #pragma unroll for (int i = 0; i < P::AccRowsPerTh; ++i) { - zeros.store(&(this->regx[i][0]), 0); + zeros.store(&this->regx[i][0], 0); #pragma unroll for (int j = 0; j < P::AccColsPerTh; ++j) { acc[i][j] = BaseClass::Zero; @@ -161,7 +171,7 @@ struct PairwiseDistances : public BaseClass { } #pragma unroll for (int j = 0; j < P::AccColsPerTh; ++j) { - zeros.store(&(this->regy[j][0]), 0); + zeros.store(&this->regy[j][0], 0); } this->stsXY(); From 9c4d5a0ecf6710540283a78e4b1bd5fa57e9572e Mon Sep 17 00:00:00 2001 From: Mahesh Doijade Date: Thu, 3 Jun 2021 20:12:00 +0530 Subject: [PATCH 10/17] add syncthreads post epilog calc for non-norm distance metrics to make sure next grid stride doesn't pollute shmem before completion of this calculation --- cpp/include/raft/distance/pairwise_distance_base.cuh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cpp/include/raft/distance/pairwise_distance_base.cuh b/cpp/include/raft/distance/pairwise_distance_base.cuh index 503397bac9..69e0ad0bf1 100644 --- a/cpp/include/raft/distance/pairwise_distance_base.cuh +++ b/cpp/include/raft/distance/pairwise_distance_base.cuh @@ -241,6 +241,10 @@ struct PairwiseDistances : public BaseClass { epilog_op(acc, regxn, regyn, gridStrideX, gridStrideY); } else { epilog_op(acc, nullptr, nullptr, gridStrideX, gridStrideY); + // This sync is needed for making sure next grid stride of + // non-norm based metrics doesn't make shmem dirty until current + // this iteration is complete. + __syncthreads(); } if (writeOut) { From 4fb00e6b040578805d477986dbf8073feded9598 Mon Sep 17 00:00:00 2001 From: Mahesh Doijade Date: Fri, 4 Jun 2021 12:12:43 +0530 Subject: [PATCH 11/17] remove syncthreads in epilog and instead use ping-pong buffers in next iteration of grid stride --- .../raft/distance/pairwise_distance_base.cuh | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/cpp/include/raft/distance/pairwise_distance_base.cuh b/cpp/include/raft/distance/pairwise_distance_base.cuh index 69e0ad0bf1..5d216120dc 100644 --- a/cpp/include/raft/distance/pairwise_distance_base.cuh +++ b/cpp/include/raft/distance/pairwise_distance_base.cuh @@ -120,8 +120,11 @@ struct PairwiseDistances : public BaseClass { this->y += stride; } this->yrowid += stride; - this->pageWr = 0; - this->pageRd = 0; + // This is needed for making sure next grid stride of + // non-norm based metrics uses previously accumulated buffer so + // it doesn't make shmem dirty until previous iteration + // is complete. + this->pageRd = this->pageWr; } DI void updateIndicesXY() { @@ -136,8 +139,7 @@ struct PairwiseDistances : public BaseClass { this->y = yBase + this->yrowid + this->srowid * this->ldb; } this->xrowid += stride; - this->pageWr = 0; - this->pageRd = 0; + this->pageRd = this->pageWr; } DI void prolog(IdxT gridStrideX, IdxT gridStrideY) { @@ -241,10 +243,6 @@ struct PairwiseDistances : public BaseClass { epilog_op(acc, regxn, regyn, gridStrideX, gridStrideY); } else { epilog_op(acc, nullptr, nullptr, gridStrideX, gridStrideY); - // This sync is needed for making sure next grid stride of - // non-norm based metrics doesn't make shmem dirty until current - // this iteration is complete. - __syncthreads(); } if (writeOut) { From b5b3c51298c614a650d8b9be8a7694a669fec353 Mon Sep 17 00:00:00 2001 From: Mahesh Doijade Date: Fri, 4 Jun 2021 21:17:43 +0530 Subject: [PATCH 12/17] use ping-pong buffers for safely grid striding --- cpp/include/raft/distance/pairwise_distance_base.cuh | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/cpp/include/raft/distance/pairwise_distance_base.cuh b/cpp/include/raft/distance/pairwise_distance_base.cuh index 5d216120dc..d5a434f2fa 100644 --- a/cpp/include/raft/distance/pairwise_distance_base.cuh +++ b/cpp/include/raft/distance/pairwise_distance_base.cuh @@ -120,11 +120,6 @@ struct PairwiseDistances : public BaseClass { this->y += stride; } this->yrowid += stride; - // This is needed for making sure next grid stride of - // non-norm based metrics uses previously accumulated buffer so - // it doesn't make shmem dirty until previous iteration - // is complete. - this->pageRd = this->pageWr; } DI void updateIndicesXY() { @@ -139,7 +134,6 @@ struct PairwiseDistances : public BaseClass { this->y = yBase + this->yrowid + this->srowid * this->ldb; } this->xrowid += stride; - this->pageRd = this->pageWr; } DI void prolog(IdxT gridStrideX, IdxT gridStrideY) { @@ -191,6 +185,11 @@ struct PairwiseDistances : public BaseClass { this->pageRd ^= 1; } accumulate(); // last iteration + // This is needed for making sure next grid stride of + // non-norm based metrics uses previously accumulated buffer so + // it doesn't make shmem dirty until previous iteration + // is complete. + this->pageRd ^= 1; } DI void accumulate() { From 0f2c03d17d507bd091be4a7dc560c9690ac8dece Mon Sep 17 00:00:00 2001 From: Mahesh Doijade Date: Mon, 7 Jun 2021 18:42:30 +0530 Subject: [PATCH 13/17] fix build failure of mst and knn test by adding cuda stream arg to rmm::device_buffer --- cpp/test/mst.cu | 18 +++++++++--------- cpp/test/spatial/knn.cu | 6 ++++-- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/cpp/test/mst.cu b/cpp/test/mst.cu index 215c6f6548..b0007c73fd 100644 --- a/cpp/test/mst.cu +++ b/cpp/test/mst.cu @@ -198,15 +198,15 @@ class MSTTest MSTTestInput>::GetParam(); iterations = mst_input.iterations; - csr_d.offsets = - rmm::device_buffer(mst_input.csr_h.offsets.data(), - mst_input.csr_h.offsets.size() * sizeof(edge_t)); - csr_d.indices = - rmm::device_buffer(mst_input.csr_h.indices.data(), - mst_input.csr_h.indices.size() * sizeof(vertex_t)); - csr_d.weights = - rmm::device_buffer(mst_input.csr_h.weights.data(), - mst_input.csr_h.weights.size() * sizeof(weight_t)); + csr_d.offsets = rmm::device_buffer( + mst_input.csr_h.offsets.data(), + mst_input.csr_h.offsets.size() * sizeof(edge_t), handle.get_stream()); + csr_d.indices = rmm::device_buffer( + mst_input.csr_h.indices.data(), + mst_input.csr_h.indices.size() * sizeof(vertex_t), handle.get_stream()); + csr_d.weights = rmm::device_buffer( + mst_input.csr_h.weights.data(), + mst_input.csr_h.weights.size() * sizeof(weight_t), handle.get_stream()); } void TearDown() override {} diff --git a/cpp/test/spatial/knn.cu b/cpp/test/spatial/knn.cu index cfd4ecc9d1..2b1ef89f7a 100644 --- a/cpp/test/spatial/knn.cu +++ b/cpp/test/spatial/knn.cu @@ -107,11 +107,13 @@ class KNNTest : public ::testing::TestWithParam { } } rmm::device_buffer input_d = rmm::device_buffer( - row_major_input.data(), row_major_input.size() * sizeof(float)); + row_major_input.data(), row_major_input.size() * sizeof(float), + handle_.get_stream()); float *input_ptr = static_cast(input_d.data()); rmm::device_buffer labels_d = rmm::device_buffer( - params_.labels.data(), params_.labels.size() * sizeof(int)); + params_.labels.data(), params_.labels.size() * sizeof(int), + handle_.get_stream()); int *labels_ptr = static_cast(labels_d.data()); raft::allocate(input_, rows_ * cols_, true); From 484b0820f0cc3ce3c33b1a0ebb90a505513aa7ea Mon Sep 17 00:00:00 2001 From: Mahesh Doijade Date: Mon, 7 Jun 2021 18:59:34 +0530 Subject: [PATCH 14/17] temp commit for test rerun --- cpp/test/mst.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/test/mst.cu b/cpp/test/mst.cu index b0007c73fd..d7aa76500b 100644 --- a/cpp/test/mst.cu +++ b/cpp/test/mst.cu @@ -58,7 +58,7 @@ weight_t prims(CSRHost &csr_h) { auto n_vertices = csr_h.offsets.size() - 1; bool active_vertex[n_vertices]; - // bool mst_set[csr_h.n_edges]; + // bool mst_set[csr_h.n_edges]; weight_t curr_edge[n_vertices]; for (auto i = 0; i < n_vertices; i++) { From 04f656f97ee82b75046501615fa2927781880a6c Mon Sep 17 00:00:00 2001 From: Mahesh Doijade Date: Mon, 7 Jun 2021 19:14:01 +0530 Subject: [PATCH 15/17] use ucx-py version 0.21 to temp resolve ci build failures --- ci/gpu/build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index b4eef5ebd0..1707b72f44 100644 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -51,7 +51,7 @@ gpuci_conda_retry install -c conda-forge -c rapidsai -c rapidsai-nightly -c nvid "rmm=${MINOR_VERSION}" \ "dask-cudf=${MINOR_VERSION}" \ "dask-cuda=${MINOR_VERSION}" \ - "ucx-py=${MINOR_VERSION}" \ + "ucx-py=0.21" \ "rapids-build-env=${MINOR_VERSION}.*" \ "rapids-notebook-env=${MINOR_VERSION}.*" \ "rapids-doc-env=${MINOR_VERSION}.*" From 4f5b2cf4fcf5eb090c8e687405a656436df6a9e0 Mon Sep 17 00:00:00 2001 From: Mahesh Doijade Date: Wed, 9 Jun 2021 14:03:02 +0530 Subject: [PATCH 16/17] Add lds and sts inline ptx instructions to force vector instruction generation --- .../raft/common/device_loads_stores.cuh | 92 ++++++++++++------- 1 file changed, 59 insertions(+), 33 deletions(-) diff --git a/cpp/include/raft/common/device_loads_stores.cuh b/cpp/include/raft/common/device_loads_stores.cuh index 7f9462d9ea..e87aa2debf 100644 --- a/cpp/include/raft/common/device_loads_stores.cuh +++ b/cpp/include/raft/common/device_loads_stores.cuh @@ -24,27 +24,43 @@ namespace raft { * @defgroup SmemStores Shared memory store operations * @{ * @brief Stores to shared memory (both vectorized and non-vectorized forms) - * @param[out] addr shared memory address + * @param[out] addr shared memory address (should be aligned to vector size) * @param[in] x data to be stored at this address */ -DI void sts(float* addr, const float& x) { *addr = x; } -DI void sts(float* addr, const float (&x)[1]) { *addr = x[0]; } +DI void sts(float* addr, const float& x) { + auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("st.shared.f32 [%0], {%1};" : : "l"(s1), "f"(x)); +} +DI void sts(float* addr, const float (&x)[1]) { + auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("st.shared.f32 [%0], {%1};" : : "l"(s1), "f"(x[0])); +} DI void sts(float* addr, const float (&x)[2]) { - float2 v2 = make_float2(x[0], x[1]); - auto* s2 = reinterpret_cast(addr); - *s2 = v2; + auto s2 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("st.shared.v2.f32 [%0], {%1, %2};" + : + : "l"(s2), "f"(x[0]), "f"(x[1])); } DI void sts(float* addr, const float (&x)[4]) { - float4 v4 = make_float4(x[0], x[1], x[2], x[3]); - auto* s4 = reinterpret_cast(addr); - *s4 = v4; + auto s4 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("st.shared.v4.f32 [%0], {%1, %2, %3, %4};" + : + : "l"(s4), "f"(x[0]), "f"(x[1]), "f"(x[2]), "f"(x[3])); +} + +DI void sts(double* addr, const double& x) { + auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("st.shared.f64 [%0], {%1};" : : "l"(s1), "d"(x)); +} +DI void sts(double* addr, const double (&x)[1]) { + auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("st.shared.f64 [%0], {%1};" : : "l"(s1), "d"(x[0])); } -DI void sts(double* addr, const double& x) { *addr = x; } -DI void sts(double* addr, const double (&x)[1]) { *addr = x[0]; } DI void sts(double* addr, const double (&x)[2]) { - double2 v2 = make_double2(x[0], x[1]); - auto* s2 = reinterpret_cast(addr); - *s2 = v2; + auto s2 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("st.shared.v2.f64 [%0], {%1, %2};" + : + : "l"(s2), "d"(x[0]), "d"(x[1])); } /** @} */ @@ -55,29 +71,39 @@ DI void sts(double* addr, const double (&x)[2]) { * @param[out] x the data to be loaded * @param[in] addr shared memory address from where to load */ -DI void lds(float& x, float* addr) { x = *addr; } -DI void lds(float (&x)[1], float* addr) { x[0] = *addr; } +DI void lds(float& x, float* addr) { + auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("ld.shared.f32 {%0}, [%1];" : "=f"(x) : "l"(s1)); +} +DI void lds(float (&x)[1], float* addr) { + auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("ld.shared.f32 {%0}, [%1];" : "=f"(x[0]) : "l"(s1)); +} DI void lds(float (&x)[2], float* addr) { - auto* s2 = reinterpret_cast(addr); - auto v2 = *s2; - x[0] = v2.x; - x[1] = v2.y; + auto s2 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("ld.shared.v2.f32 {%0, %1}, [%2];" + : "=f"(x[0]), "=f"(x[1]) + : "l"(s2)); } DI void lds(float (&x)[4], float* addr) { - auto* s4 = reinterpret_cast(addr); - auto v4 = *s4; - x[0] = v4.x; - x[1] = v4.y; - x[2] = v4.z; - x[3] = v4.w; -} -DI void lds(double& x, double* addr) { x = *addr; } -DI void lds(double (&x)[1], double* addr) { x[0] = *addr; } + auto s4 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("ld.shared.v4.f32 {%0, %1, %2, %3}, [%4];" + : "=f"(x[0]), "=f"(x[1]), "=f"(x[2]), "=f"(x[3]) + : "l"(s4)); +} +DI void lds(double& x, double* addr) { + auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("ld.shared.f64 {%0}, [%1];" : "=d"(x) : "l"(s1)); +} +DI void lds(double (&x)[1], double* addr) { + auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("ld.shared.f64 {%0}, [%1];" : "=d"(x[0]) : "l"(s1)); +} DI void lds(double (&x)[2], double* addr) { - auto* s2 = reinterpret_cast(addr); - auto v2 = *s2; - x[0] = v2.x; - x[1] = v2.y; + auto s2 = __cvta_generic_to_shared(reinterpret_cast(addr)); + asm volatile("ld.shared.v2.f64 {%0, %1}, [%2];" + : "=d"(x[0]), "=d"(x[1]) + : "l"(s2)); } /** @} */ From 923e9a114df90b02f275131e0a2a33629c587e92 Mon Sep 17 00:00:00 2001 From: Mahesh Doijade Date: Fri, 11 Jun 2021 21:48:28 +0530 Subject: [PATCH 17/17] document the shmem ptr alignment requirement for vector load/stores --- cpp/include/raft/common/device_loads_stores.cuh | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/cpp/include/raft/common/device_loads_stores.cuh b/cpp/include/raft/common/device_loads_stores.cuh index e87aa2debf..bb2b019ecb 100644 --- a/cpp/include/raft/common/device_loads_stores.cuh +++ b/cpp/include/raft/common/device_loads_stores.cuh @@ -24,6 +24,10 @@ namespace raft { * @defgroup SmemStores Shared memory store operations * @{ * @brief Stores to shared memory (both vectorized and non-vectorized forms) + * requires the given shmem pointer to be aligned by the vector + length, like for float4 lds/sts shmem pointer should be aligned + by 16 bytes else it might silently fail or can also give + runtime error. * @param[out] addr shared memory address (should be aligned to vector size) * @param[in] x data to be stored at this address */ @@ -68,8 +72,13 @@ DI void sts(double* addr, const double (&x)[2]) { * @defgroup SmemLoads Shared memory load operations * @{ * @brief Loads from shared memory (both vectorized and non-vectorized forms) + requires the given shmem pointer to be aligned by the vector + length, like for float4 lds/sts shmem pointer should be aligned + by 16 bytes else it might silently fail or can also give + runtime error. * @param[out] x the data to be loaded * @param[in] addr shared memory address from where to load + * (should be aligned to vector size) */ DI void lds(float& x, float* addr) { auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr));