make raft sources compilable with clang (#424)
This makes RAFT sources compilable with clang.
It also fixes some fragile code that relied on nvcc quirks: `static const` members that should be `static constexpr`, and an unescaped `%laneid` in inline PTX that should be `%%laneid`.

RAFT as a whole is still not compilable with clang, though, due to its dependencies:
1. cub has this issue before 1.14: NVIDIA/cub#335
2. libcudacxx has issues with atomics, which should be fixed in >= 1.7.0-ea (wasn't able to verify this yet)
3. libcudacxx has issues with variadic CUDA functions, which is apparently fixed by passing `-Xclang -fcuda-allow-variadic-functions` to clang (wasn't able to verify this yet)
4. cooperative_groups from CUDA does not work with clang 11.0 / 11.1, only with clang >= 13

EDIT: this is necessary to close #84

Authors:
  - Matt Joux (https://github.com/MatthiasKohl)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)
  - Artem M. Chirkin (https://github.com/achirkin)

URL: #424
MatthiasKohl authored Feb 16, 2022
1 parent ad3af3e commit b2a88c2
Showing 23 changed files with 824 additions and 72 deletions.
41 changes: 21 additions & 20 deletions cpp/include/raft/cluster/detail/kmeans.cuh
@@ -15,6 +15,7 @@
  */
 #pragma once
 
+#include <algorithm>
 #include <cmath>
 #include <cstdio>
 #include <ctime>
@@ -28,6 +29,7 @@
 #include <thrust/sequence.h>
 #include <thrust/sort.h>
 
+#include <raft/cuda_utils.cuh>
 #include <raft/cudart_utils.h>
 #include <raft/device_atomics.cuh>
 #include <raft/handle.hpp>
@@ -404,8 +406,8 @@ static int chooseNewCentroid(handle_t const& handle,
 //}
 
 RAFT_CHECK_CUDA(stream);
-obsIndex = max(obsIndex, 0);
-obsIndex = min(obsIndex, n - 1);
+obsIndex = std::max(obsIndex, static_cast<index_type_t>(0));
+obsIndex = std::min(obsIndex, n - 1);
 
 // Record new centroid position
 RAFT_CUDA_TRY(cudaMemcpyAsync(centroid,
@@ -467,7 +469,7 @@ static int initializeCentroids(handle_t const& handle,
 auto stream = handle.get_stream();
 auto thrust_exec_policy = handle.get_thrust_policy();
 
-constexpr index_type_t grid_lower_bound{65535};
+constexpr unsigned grid_lower_bound{65535};
 
 // -------------------------------------------------------
 // Implementation
@@ -477,12 +479,12 @@
 dim3 blockDim_warp{WARP_SIZE, 1, BSIZE_DIV_WSIZE};
 
 // CUDA grid dimensions
-dim3 gridDim_warp{min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound),
+dim3 gridDim_warp{std::min(ceildiv<unsigned>(d, WARP_SIZE), grid_lower_bound),
 1,
-min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound)};
+std::min(ceildiv<unsigned>(n, BSIZE_DIV_WSIZE), grid_lower_bound)};
 
 // CUDA grid dimensions
-dim3 gridDim_block{min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, grid_lower_bound), 1, 1};
+dim3 gridDim_block{std::min(ceildiv<unsigned>(n, BLOCK_SIZE), grid_lower_bound), 1, 1};
 
 // Assign observation vectors to code 0
 RAFT_CUDA_TRY(cudaMemsetAsync(codes, 0, n * sizeof(index_type_t), stream));
@@ -574,10 +576,10 @@ static int assignCentroids(handle_t const& handle,
 dim3 blockDim{WARP_SIZE, 1, BLOCK_SIZE / WARP_SIZE};
 
 dim3 gridDim;
-constexpr index_type_t grid_lower_bound{65535};
-gridDim.x = min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound);
-gridDim.y = min(k, grid_lower_bound);
-gridDim.z = min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound);
+constexpr unsigned grid_lower_bound{65535};
+gridDim.x = std::min(ceildiv<unsigned>(d, WARP_SIZE), grid_lower_bound);
+gridDim.y = std::min(static_cast<unsigned>(k), grid_lower_bound);
+gridDim.z = std::min(ceildiv<unsigned>(n, BSIZE_DIV_WSIZE), grid_lower_bound);
 
 computeDistances<<<gridDim, blockDim, 0, stream>>>(n, d, k, obs, centroids, dists);
 RAFT_CHECK_CUDA(stream);
@@ -587,7 +589,7 @@
 blockDim.x = BLOCK_SIZE;
 blockDim.y = 1;
 blockDim.z = 1;
-gridDim.x = min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, grid_lower_bound);
+gridDim.x = std::min(ceildiv<unsigned>(n, BLOCK_SIZE), grid_lower_bound);
 gridDim.y = 1;
 gridDim.z = 1;
 minDistances<<<gridDim, blockDim, 0, stream>>>(n, k, dists, codes, clusterSizes);
@@ -644,7 +646,7 @@ static int updateCentroids(handle_t const& handle,
 const value_type_t one = 1;
 const value_type_t zero = 0;
 
-constexpr index_type_t grid_lower_bound{65535};
+constexpr unsigned grid_lower_bound{65535};
 
 auto stream = handle.get_stream();
 auto cublas_h = handle.get_cublas_handle();
@@ -717,8 +719,8 @@ static int updateCentroids(handle_t const& handle,
 dim3 blockDim{WARP_SIZE, BLOCK_SIZE / WARP_SIZE, 1};
 
 // CUDA grid dimensions
-dim3 gridDim{min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound),
-min((k + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound),
+dim3 gridDim{std::min(ceildiv<unsigned>(d, WARP_SIZE), grid_lower_bound),
+std::min(ceildiv<unsigned>(k, BSIZE_DIV_WSIZE), grid_lower_bound),
 1};
 
 divideCentroids<<<gridDim, blockDim, 0, stream>>>(d, k, clusterSizes, centroids);
@@ -791,7 +793,7 @@ int kmeans(handle_t const& handle,
 // Current iteration
 index_type_t iter;
 
-constexpr index_type_t grid_lower_bound{65535};
+constexpr unsigned grid_lower_bound{65535};
 
 // Residual sum of squares at previous iteration
 value_type_t residualPrev = 0;
@@ -818,10 +820,9 @@
 
 dim3 blockDim{WARP_SIZE, 1, BLOCK_SIZE / WARP_SIZE};
 
-dim3 gridDim{
-min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound),
-1,
-min((n + BLOCK_SIZE / WARP_SIZE - 1) / (BLOCK_SIZE / WARP_SIZE), grid_lower_bound)};
+dim3 gridDim{std::min(ceildiv<unsigned>(d, WARP_SIZE), grid_lower_bound),
+1,
+std::min(ceildiv<unsigned>(n, BLOCK_SIZE / WARP_SIZE), grid_lower_bound)};
 
 CUDA_TRY(cudaMemsetAsync(work, 0, n * k * sizeof(value_type_t), stream));
 computeDistances<<<gridDim, blockDim, 0, stream>>>(n, d, 1, obs, centroids, work);
@@ -958,7 +959,7 @@ int kmeans(handle_t const& handle,
 // Allocate memory
 raft::spectral::matrix::vector_t<index_type_t> clusterSizes(handle, k);
 raft::spectral::matrix::vector_t<value_type_t> centroids(handle, d * k);
-raft::spectral::matrix::vector_t<value_type_t> work(handle, n * max(k, d));
+raft::spectral::matrix::vector_t<value_type_t> work(handle, n * std::max(k, d));
 raft::spectral::matrix::vector_t<index_type_t> work_int(handle, 2 * d * n);
 
 // Perform k-means
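Most of the changes above follow one pattern: unqualified `min`/`max` calls, which nvcc happens to resolve in host code via its CUDA headers, are replaced by `std::min`/`std::max` (hence the new `<algorithm>` includes), and hand-written `(x + y - 1) / y` rounding is replaced by `raft::ceildiv`. A minimal sketch of the pattern, with a stand-in `ceildiv` since only its call sites appear in this diff:

```cpp
#include <algorithm>

// Stand-in for raft::ceildiv (assumed semantics: round-up integer division).
template <typename I>
constexpr I ceildiv(I a, I b)
{
  return (a + b - 1) / b;
}

unsigned grid_size(unsigned n, unsigned block_size)
{
  // std::min deduces a single type from both arguments, so the bound must be
  // `unsigned` (matching dim3's fields) rather than a signed index type,
  // which is why grid_lower_bound changed from index_type_t to unsigned.
  constexpr unsigned grid_lower_bound{65535};
  return std::min(ceildiv(n, block_size), grid_lower_bound);
}
```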
8 changes: 4 additions & 4 deletions cpp/include/raft/cuda_utils.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2022, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -109,7 +109,7 @@ static const int WarpSize = 32;
 DI int laneId()
 {
 int id;
-asm("mov.s32 %0, %laneid;" : "=r"(id));
+asm("mov.s32 %0, %%laneid;" : "=r"(id));
 return id;
 }
 
@@ -228,13 +228,13 @@ DI T myAtomicMax(T* address, T val);
 
 DI float myAtomicMin(float* address, float val)
 {
-myAtomicReduce(address, val, fminf);
+myAtomicReduce<float(float, float)>(address, val, fminf);
 return *address;
 }
 
 DI float myAtomicMax(float* address, float val)
 {
-myAtomicReduce(address, val, fmaxf);
+myAtomicReduce<float(float, float)>(address, val, fmaxf);
 return *address;
 }
 
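Two of these fixes deserve a note. In inline PTX the asm string is a template where `%0`, `%1`, ... name operands, so a literal special register such as `%laneid` must be escaped as `%%laneid`; nvcc tolerated the unescaped form, clang does not. A minimal device-side sketch (assuming RAFT's `DI` macro expands to roughly `__device__ inline`):

```cuda
__device__ inline int lane_id()
{
  int id;
  // %%laneid emits a literal %laneid in the generated PTX; a single % would
  // be parsed as the start of an operand reference.
  asm("mov.s32 %0, %%laneid;" : "=r"(id));
  return id;
}
```

The `myAtomicReduce<float(float, float)>` change looks like the same class of fix: under clang, `fminf`/`fmaxf` can name both host and device overloads, so spelling out the function type removes the ambiguity when the template argument is deduced.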
4 changes: 3 additions & 1 deletion cpp/include/raft/label/detail/classlabels.cuh
@@ -24,6 +24,8 @@
 #include <rmm/device_scalar.hpp>
 #include <rmm/device_uvector.hpp>
 
+#include <algorithm>
+
 namespace raft {
 namespace label {
 namespace detail {
@@ -56,7 +58,7 @@ int getUniquelabels(rmm::device_uvector<value_t>& unique, value_t* y, size_t n,
 NULL, bytes, y, workspace.data(), n, 0, sizeof(value_t) * 8, stream);
 cub::DeviceSelect::Unique(
 NULL, bytes2, workspace.data(), workspace.data(), d_num_selected.data(), n, stream);
-bytes = max(bytes, bytes2);
+bytes = std::max(bytes, bytes2);
 rmm::device_uvector<char> cub_storage(bytes, stream);
 
 // Select Unique classes
2 changes: 1 addition & 1 deletion cpp/include/raft/linalg/detail/contractions.cuh
@@ -76,7 +76,7 @@ struct Contractions_NT {
 /** block of Y data loaded from global mem after `ldgXY()` */
 DataT ldgDataY[P::LdgPerThY][P::Veclen];
 
-static const DataT Zero = (DataT)0;
+static constexpr DataT Zero = (DataT)0;
 
 public:
 /**
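This is the `static const` quirk called out in the commit message: C++ only allows in-class initializers on `static const` members of integral or enum type, so for a floating-point `DataT` the old declaration is ill-formed. nvcc accepted it anyway; clang rejects it, and `static constexpr` is the conforming spelling. A minimal illustration (the struct name here is made up):

```cpp
template <typename DataT>
struct Tile {
  // static const DataT Zero = (DataT)0;  // ill-formed when DataT is float/double
  static constexpr DataT Zero = (DataT)0;  // OK for any literal type
};

static_assert(Tile<float>::Zero == 0.0f, "constexpr member is a compile-time constant");
```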
12 changes: 7 additions & 5 deletions cpp/include/raft/linalg/detail/qr.cuh
@@ -22,6 +22,8 @@
 #include <rmm/device_scalar.hpp>
 #include <rmm/device_uvector.hpp>
 
+#include <algorithm>
+
 namespace raft {
 namespace linalg {
 namespace detail {
@@ -37,7 +39,7 @@ void qrGetQ(const raft::handle_t& handle,
 cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle();
 
 int m = n_rows, n = n_cols;
-int k = min(m, n);
+int k = std::min(m, n);
 RAFT_CUDA_TRY(cudaMemcpyAsync(Q, M, sizeof(math_t) * m * n, cudaMemcpyDeviceToDevice, stream));
 
 rmm::device_uvector<math_t> tau(k, stream);
@@ -70,8 +72,8 @@ void qrGetQR(const raft::handle_t& handle,
 
 int m = n_rows, n = n_cols;
 rmm::device_uvector<math_t> R_full(m * n, stream);
-rmm::device_uvector<math_t> tau(min(m, n), stream);
-RAFT_CUDA_TRY(cudaMemsetAsync(tau.data(), 0, sizeof(math_t) * min(m, n), stream));
+rmm::device_uvector<math_t> tau(std::min(m, n), stream);
+RAFT_CUDA_TRY(cudaMemsetAsync(tau.data(), 0, sizeof(math_t) * std::min(m, n), stream));
 int R_full_nrows = m, R_full_ncols = n;
 RAFT_CUDA_TRY(
 cudaMemcpyAsync(R_full.data(), M, sizeof(math_t) * m * n, cudaMemcpyDeviceToDevice, stream));
@@ -100,12 +102,12 @@ void qrGetQR(const raft::handle_t& handle,
 int Q_nrows = m, Q_ncols = n;
 
 RAFT_CUSOLVER_TRY(cusolverDnorgqr_bufferSize(
-cusolverH, Q_nrows, Q_ncols, min(Q_ncols, Q_nrows), Q, Q_nrows, tau.data(), &Lwork));
+cusolverH, Q_nrows, Q_ncols, std::min(Q_ncols, Q_nrows), Q, Q_nrows, tau.data(), &Lwork));
 workspace.resize(Lwork, stream);
 RAFT_CUSOLVER_TRY(cusolverDnorgqr(cusolverH,
 Q_nrows,
 Q_ncols,
-min(Q_ncols, Q_nrows),
+std::min(Q_ncols, Q_nrows),
 Q,
 Q_nrows,
 tau.data(),
2 changes: 1 addition & 1 deletion cpp/include/raft/linalg/detail/reduce_rows_by_key.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
8 changes: 5 additions & 3 deletions cpp/include/raft/linalg/detail/rsvd.cuh
@@ -26,6 +26,8 @@
 #include <raft/matrix/matrix.hpp>
 #include <raft/random/rng.hpp>
 
+#include <algorithm>
+
 namespace raft {
 namespace linalg {
 namespace detail {
@@ -386,9 +388,9 @@ void rsvdPerc(const raft::handle_t& handle,
 int max_sweeps,
 cudaStream_t stream)
 {
-int k = max((int)(min(n_rows, n_cols) * PC_perc),
-1); // Number of singular values to be computed
-int p = max((int)(min(n_rows, n_cols) * UpS_perc), 1); // Upsamples
+int k = std::max((int)(std::min(n_rows, n_cols) * PC_perc),
+1); // Number of singular values to be computed
+int p = std::max((int)(std::min(n_rows, n_cols) * UpS_perc), 1); // Upsamples
 rsvdFixedRank(handle,
 M,
 n_rows,
10 changes: 6 additions & 4 deletions cpp/include/raft/matrix/detail/linewise_op.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -20,6 +20,8 @@
 #include <raft/pow2_utils.cuh>
 #include <raft/vectorized.cuh>
 
+#include <algorithm>
+
 namespace raft {
 namespace matrix {
 namespace detail {
@@ -312,7 +314,7 @@ __global__ void __launch_bounds__(BlockSize)
 typedef Linewise<Type, IdxType, VecBytes, BlockSize> L;
 constexpr uint workSize = L::VecElems * BlockSize;
 uint workOffset = workSize;
-__shared__ alignas(sizeof(Type) * L::VecElems)
+__shared__ __align__(sizeof(Type) * L::VecElems)
 Type shm[workSize * ((sizeof...(Vecs)) > 1 ? 2 : 1)];
 const IdxType blockOffset = (arrOffset + BlockSize * L::VecElems * blockIdx.x) % rowLen;
 return L::vectorRows(
@@ -422,7 +424,7 @@ void matrixLinewiseVecCols(Type* out,
 const uint occupy = getOptimalGridSize<BlockSize>();
 // does not make sense to have more blocks than this
 const uint maxBlocks = raft::ceildiv<uint>(uint(alignedLen), bs.x * VecElems);
-const dim3 gs(min(maxBlocks, occupy), 1, 1);
+const dim3 gs(std::min(maxBlocks, occupy), 1, 1);
 // The work arrangement is blocked on the block and warp levels;
 // see more details at Linewise::vectorCols.
 // The value below determines how many scalar elements are processed by on thread in total.
@@ -482,7 +484,7 @@ void matrixLinewiseVecRows(Type* out,
 const uint expected_grid_size = rowLen / raft::gcd(block_work_size, uint(rowLen));
 // Minimum size of the grid to make the device well occupied
 const uint occupy = getOptimalGridSize<BlockSize>();
-const dim3 gs(min(
+const dim3 gs(std::min(
 // does not make sense to have more blocks than this
 raft::ceildiv<uint>(uint(totalLen), block_work_size),
 // increase the grid size to be not less than `occupy` while
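The `alignas` to `__align__` swap appears to be another portability fix: clang is stricter than nvcc about where a C++11 alignment specifier may sit in a declaration that also carries `__shared__`, whereas CUDA's `__align__(n)` macro (a wrapper over `__attribute__((aligned(n)))`) is accepted by both compilers in this position. A reduced sketch of the pattern (the kernel and buffer size are illustrative, not the PR's code):

```cuda
template <typename Type, int VecElems>
__global__ void linewise_kernel()
{
  // Align the shared buffer to the vector width so vectorized loads/stores
  // on it are legal; __align__ is the spelling both nvcc and clang accept.
  __shared__ __align__(sizeof(Type) * VecElems) Type shm[256 * VecElems];
  shm[threadIdx.x] = Type{};
}
```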
6 changes: 3 additions & 3 deletions cpp/include/raft/matrix/detail/matrix.cuh
@@ -220,7 +220,7 @@ template <typename m_t, typename idx_t = int>
 void copyUpperTriangular(m_t* src, m_t* dst, idx_t n_rows, idx_t n_cols, cudaStream_t stream)
 {
 idx_t m = n_rows, n = n_cols;
-idx_t k = min(m, n);
+idx_t k = std::min(m, n);
 dim3 block(64);
 dim3 grid((m * n + block.x - 1) / block.x);
 getUpperTriangular<<<grid, block, 0, stream>>>(src, dst, m, n, k);
@@ -246,7 +246,7 @@ template <typename m_t, typename idx_t = int>
 void initializeDiagonalMatrix(
 m_t* vec, m_t* matrix, idx_t n_rows, idx_t n_cols, cudaStream_t stream)
 {
-idx_t k = min(n_rows, n_cols);
+idx_t k = std::min(n_rows, n_cols);
 dim3 block(64);
 dim3 grid((k + block.x - 1) / block.x);
 copyVectorToMatrixDiagonal<<<grid, block, 0, stream>>>(vec, matrix, n_rows, n_cols, k);
@@ -285,4 +285,4 @@ m_t getL2Norm(const raft::handle_t& handle, m_t* in, idx_t size, cudaStream_t st
 
 } // end namespace detail
 } // end namespace matrix
-} // end namespace raft
\ No newline at end of file
+} // end namespace raft
2 changes: 1 addition & 1 deletion cpp/include/raft/random/detail/make_blobs.cuh
@@ -245,4 +245,4 @@ void make_blobs_caller(DataT* out,
 
 } // end namespace detail
 } // end namespace random
-} // end namespace raft
\ No newline at end of file
+} // end namespace raft
4 changes: 3 additions & 1 deletion cpp/include/raft/sparse/distance/detail/l2_distance.cuh
@@ -31,6 +31,8 @@
 
 #include <nvfunctional>
 
+#include <algorithm>
+
 namespace raft {
 namespace sparse {
 namespace distance {
@@ -411,7 +413,7 @@ class hellinger_expanded_distances_t : public distances_t<value_t> {
 
 void compute(value_t* out_dists)
 {
-rmm::device_uvector<value_idx> coo_rows(max(config_->b_nnz, config_->a_nnz),
+rmm::device_uvector<value_idx> coo_rows(std::max(config_->b_nnz, config_->a_nnz),
 config_->handle.get_stream());
 
 raft::sparse::convert::csr_to_coo(config_->b_indptr,
6 changes: 4 additions & 2 deletions cpp/include/raft/sparse/distance/detail/lp_distance.cuh
@@ -32,6 +32,8 @@
 
 #include <nvfunctional>
 
+#include <algorithm>
+
 namespace raft {
 namespace sparse {
 namespace distance {
@@ -48,7 +50,7 @@ void unexpanded_lp_distances(value_t* out_dists,
 accum_f accum_func,
 write_f write_func)
 {
-rmm::device_uvector<value_idx> coo_rows(max(config_->b_nnz, config_->a_nnz),
+rmm::device_uvector<value_idx> coo_rows(std::max(config_->b_nnz, config_->a_nnz),
 config_->handle.get_stream());
 
 raft::sparse::convert::csr_to_coo(config_->b_indptr,
@@ -283,7 +285,7 @@ class kl_divergence_unexpanded_distances_t : public distances_t<value_t> {
 
 void compute(value_t* out_dists)
 {
-rmm::device_uvector<value_idx> coo_rows(max(config_->b_nnz, config_->a_nnz),
+rmm::device_uvector<value_idx> coo_rows(std::max(config_->b_nnz, config_->a_nnz),
 config_->handle.get_stream());
 
 raft::sparse::convert::csr_to_coo(config_->b_indptr,
4 changes: 3 additions & 1 deletion cpp/include/raft/sparse/selection/detail/knn.cuh
@@ -31,6 +31,8 @@
 #include <raft/sparse/op/slice.hpp>
 #include <raft/spatial/knn/knn.hpp>
 
+#include <algorithm>
+
 namespace raft {
 namespace sparse {
 namespace selection {
@@ -354,7 +356,7 @@ class sparse_knn_t {
 
 // in the case where the number of idx rows in the batch is < k, we
 // want to adjust k.
-value_idx n_neighbors = min(k, batch_cols);
+value_idx n_neighbors = std::min(static_cast<value_idx>(k), batch_cols);
 
 bool ascending = true;
 if (metric == raft::distance::DistanceType::InnerProduct) ascending = false;
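The `static_cast` here is needed because `std::min` deduces a single template parameter from both arguments, while the old unqualified call could fall back to CUDA's mixed-type global `min` overloads under nvcc. A small self-contained sketch (`value_idx` is assumed to be a 64-bit index type for illustration):

```cpp
#include <algorithm>
#include <cstdint>

using value_idx = std::int64_t;  // assumption, not from the PR

value_idx clamp_neighbors(int k, value_idx batch_cols)
{
  // std::min(k, batch_cols) would not compile: int vs value_idx yields no
  // unique template argument, so the narrower operand is cast explicitly.
  return std::min(static_cast<value_idx>(k), batch_cols);
}
```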