make raft sources compilable with clang (#424)
This makes RAFT sources compilable with clang.
It also fixes some fragile code that relied on nvcc quirks: `static const` members that should be `static constexpr`, and an unescaped `%laneid` in inline PTX that should be `%%laneid`.

RAFT as a whole is still not compilable with clang, though, due to its dependencies:
1. cub has this issue before 1.14: NVIDIA/cub#335
2. libcudacxx has issues with atomics, which should be fixed in >= 1.7.0-ea (wasn't able to verify this yet)
3. libcudacxx has issues with variadic CUDA functions, which is apparently fixed by passing `-Xclang -fcuda-allow-variadic-functions` to clang (wasn't able to verify this yet)
4. cooperative_groups from CUDA does not work with clang 11.0 / 11.1, only with clang >= 13

EDIT: this is necessary to close #84

Authors:
  - Matt Joux (https://github.com/MatthiasKohl)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)
  - Artem M. Chirkin (https://github.com/achirkin)

URL: #424
MatthiasKohl authored Feb 16, 2022
1 parent ad3af3e commit b2a88c2
Showing 23 changed files with 824 additions and 72 deletions.
41 changes: 21 additions & 20 deletions cpp/include/raft/cluster/detail/kmeans.cuh
@@ -15,6 +15,7 @@
  */
 #pragma once
 
+#include <algorithm>
 #include <cmath>
 #include <cstdio>
 #include <ctime>
@@ -28,6 +29,7 @@
 #include <thrust/sequence.h>
 #include <thrust/sort.h>
 
+#include <raft/cuda_utils.cuh>
 #include <raft/cudart_utils.h>
 #include <raft/device_atomics.cuh>
 #include <raft/handle.hpp>
@@ -404,8 +406,8 @@ static int chooseNewCentroid(handle_t const& handle,
 //}
 
 RAFT_CHECK_CUDA(stream);
-obsIndex = max(obsIndex, 0);
-obsIndex = min(obsIndex, n - 1);
+obsIndex = std::max(obsIndex, static_cast<index_type_t>(0));
+obsIndex = std::min(obsIndex, n - 1);
 
 // Record new centroid position
 RAFT_CUDA_TRY(cudaMemcpyAsync(centroid,
@@ -467,7 +469,7 @@ static int initializeCentroids(handle_t const& handle,
 auto stream = handle.get_stream();
 auto thrust_exec_policy = handle.get_thrust_policy();
 
-constexpr index_type_t grid_lower_bound{65535};
+constexpr unsigned grid_lower_bound{65535};
 
 // -------------------------------------------------------
 // Implementation
@@ -477,12 +479,12 @@
 dim3 blockDim_warp{WARP_SIZE, 1, BSIZE_DIV_WSIZE};
 
 // CUDA grid dimensions
-dim3 gridDim_warp{min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound),
+dim3 gridDim_warp{std::min(ceildiv<unsigned>(d, WARP_SIZE), grid_lower_bound),
 1,
-min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound)};
+std::min(ceildiv<unsigned>(n, BSIZE_DIV_WSIZE), grid_lower_bound)};
 
 // CUDA grid dimensions
-dim3 gridDim_block{min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, grid_lower_bound), 1, 1};
+dim3 gridDim_block{std::min(ceildiv<unsigned>(n, BLOCK_SIZE), grid_lower_bound), 1, 1};
 
 // Assign observation vectors to code 0
 RAFT_CUDA_TRY(cudaMemsetAsync(codes, 0, n * sizeof(index_type_t), stream));
@@ -574,10 +576,10 @@ static int assignCentroids(handle_t const& handle,
 dim3 blockDim{WARP_SIZE, 1, BLOCK_SIZE / WARP_SIZE};
 
 dim3 gridDim;
-constexpr index_type_t grid_lower_bound{65535};
-gridDim.x = min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound);
-gridDim.y = min(k, grid_lower_bound);
-gridDim.z = min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound);
+constexpr unsigned grid_lower_bound{65535};
+gridDim.x = std::min(ceildiv<unsigned>(d, WARP_SIZE), grid_lower_bound);
+gridDim.y = std::min(static_cast<unsigned>(k), grid_lower_bound);
+gridDim.z = std::min(ceildiv<unsigned>(n, BSIZE_DIV_WSIZE), grid_lower_bound);
 
 computeDistances<<<gridDim, blockDim, 0, stream>>>(n, d, k, obs, centroids, dists);
 RAFT_CHECK_CUDA(stream);
@@ -587,7 +589,7 @@
 blockDim.x = BLOCK_SIZE;
 blockDim.y = 1;
 blockDim.z = 1;
-gridDim.x = min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, grid_lower_bound);
+gridDim.x = std::min(ceildiv<unsigned>(n, BLOCK_SIZE), grid_lower_bound);
 gridDim.y = 1;
 gridDim.z = 1;
 minDistances<<<gridDim, blockDim, 0, stream>>>(n, k, dists, codes, clusterSizes);
@@ -644,7 +646,7 @@ static int updateCentroids(handle_t const& handle,
 const value_type_t one = 1;
 const value_type_t zero = 0;
 
-constexpr index_type_t grid_lower_bound{65535};
+constexpr unsigned grid_lower_bound{65535};
 
 auto stream = handle.get_stream();
 auto cublas_h = handle.get_cublas_handle();
@@ -717,8 +719,8 @@ static int updateCentroids(handle_t const& handle,
 dim3 blockDim{WARP_SIZE, BLOCK_SIZE / WARP_SIZE, 1};
 
 // CUDA grid dimensions
-dim3 gridDim{min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound),
-min((k + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound),
+dim3 gridDim{std::min(ceildiv<unsigned>(d, WARP_SIZE), grid_lower_bound),
+std::min(ceildiv<unsigned>(k, BSIZE_DIV_WSIZE), grid_lower_bound),
 1};
 
 divideCentroids<<<gridDim, blockDim, 0, stream>>>(d, k, clusterSizes, centroids);
@@ -791,7 +793,7 @@ int kmeans(handle_t const& handle,
 // Current iteration
 index_type_t iter;
 
-constexpr index_type_t grid_lower_bound{65535};
+constexpr unsigned grid_lower_bound{65535};
 
 // Residual sum of squares at previous iteration
 value_type_t residualPrev = 0;
@@ -818,10 +820,9 @@
 
 dim3 blockDim{WARP_SIZE, 1, BLOCK_SIZE / WARP_SIZE};
 
-dim3 gridDim{
-min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound),
-1,
-min((n + BLOCK_SIZE / WARP_SIZE - 1) / (BLOCK_SIZE / WARP_SIZE), grid_lower_bound)};
+dim3 gridDim{std::min(ceildiv<unsigned>(d, WARP_SIZE), grid_lower_bound),
+1,
+std::min(ceildiv<unsigned>(n, BLOCK_SIZE / WARP_SIZE), grid_lower_bound)};
 
 CUDA_TRY(cudaMemsetAsync(work, 0, n * k * sizeof(value_type_t), stream));
 computeDistances<<<gridDim, blockDim, 0, stream>>>(n, d, 1, obs, centroids, work);
@@ -958,7 +959,7 @@ int kmeans(handle_t const& handle,
 // Allocate memory
 raft::spectral::matrix::vector_t<index_type_t> clusterSizes(handle, k);
 raft::spectral::matrix::vector_t<value_type_t> centroids(handle, d * k);
-raft::spectral::matrix::vector_t<value_type_t> work(handle, n * max(k, d));
+raft::spectral::matrix::vector_t<value_type_t> work(handle, n * std::max(k, d));
 raft::spectral::matrix::vector_t<index_type_t> work_int(handle, 2 * d * n);
 
 // Perform k-means
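Most of the changes above follow one pattern: unqualified `min`/`max` calls, which nvcc happens to resolve in host code via its CUDA headers, are replaced by `std::min`/`std::max` (hence the new `<algorithm>` includes), and hand-written `(x + y - 1) / y` rounding is replaced by `raft::ceildiv`. A minimal sketch of the pattern, with a stand-in `ceildiv` since only its call sites appear in this diff:

```cpp
#include <algorithm>

// Stand-in for raft::ceildiv (assumed semantics: round-up integer division).
template <typename I>
constexpr I ceildiv(I a, I b)
{
  return (a + b - 1) / b;
}

unsigned grid_size(unsigned n, unsigned block_size)
{
  // std::min deduces a single type from both arguments, so the bound must be
  // `unsigned` (matching dim3's fields) rather than a signed index type,
  // which is why grid_lower_bound changed from index_type_t to unsigned.
  constexpr unsigned grid_lower_bound{65535};
  return std::min(ceildiv(n, block_size), grid_lower_bound);
}
```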
8 changes: 4 additions & 4 deletions cpp/include/raft/cuda_utils.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2022, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -109,7 +109,7 @@ static const int WarpSize = 32;
 DI int laneId()
 {
 int id;
-asm("mov.s32 %0, %laneid;" : "=r"(id));
+asm("mov.s32 %0, %%laneid;" : "=r"(id));
 return id;
 }
 
@@ -228,13 +228,13 @@ DI T myAtomicMax(T* address, T val);
 
 DI float myAtomicMin(float* address, float val)
 {
-myAtomicReduce(address, val, fminf);
+myAtomicReduce<float(float, float)>(address, val, fminf);
 return *address;
 }
 
 DI float myAtomicMax(float* address, float val)
 {
-myAtomicReduce(address, val, fmaxf);
+myAtomicReduce<float(float, float)>(address, val, fmaxf);
 return *address;
 }
 
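Two of these fixes deserve a note. In inline PTX the asm string is a template where `%0`, `%1`, ... name operands, so a literal special register such as `%laneid` must be escaped as `%%laneid`; nvcc tolerated the unescaped form, clang does not. A minimal device-side sketch (assuming RAFT's `DI` macro expands to roughly `__device__ inline`):

```cuda
__device__ inline int lane_id()
{
  int id;
  // %%laneid emits a literal %laneid in the generated PTX; a single % would
  // be parsed as the start of an operand reference.
  asm("mov.s32 %0, %%laneid;" : "=r"(id));
  return id;
}
```

The `myAtomicReduce<float(float, float)>` change looks like the same class of fix: under clang, `fminf`/`fmaxf` can name both host and device overloads, so spelling out the function type removes the ambiguity when the template argument is deduced.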
4 changes: 3 additions & 1 deletion cpp/include/raft/label/detail/classlabels.cuh
@@ -24,6 +24,8 @@
 #include <rmm/device_scalar.hpp>
 #include <rmm/device_uvector.hpp>
 
+#include <algorithm>
+
 namespace raft {
 namespace label {
 namespace detail {
@@ -56,7 +58,7 @@ int getUniquelabels(rmm::device_uvector<value_t>& unique, value_t* y, size_t n,
 NULL, bytes, y, workspace.data(), n, 0, sizeof(value_t) * 8, stream);
 cub::DeviceSelect::Unique(
 NULL, bytes2, workspace.data(), workspace.data(), d_num_selected.data(), n, stream);
-bytes = max(bytes, bytes2);
+bytes = std::max(bytes, bytes2);
 rmm::device_uvector<char> cub_storage(bytes, stream);
 
 // Select Unique classes
2 changes: 1 addition & 1 deletion cpp/include/raft/linalg/detail/contractions.cuh
@@ -76,7 +76,7 @@ struct Contractions_NT {
 /** block of Y data loaded from global mem after `ldgXY()` */
 DataT ldgDataY[P::LdgPerThY][P::Veclen];
 
-static const DataT Zero = (DataT)0;
+static constexpr DataT Zero = (DataT)0;
 
 public:
 /**
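This is the `static const` quirk called out in the commit message: C++ only allows in-class initializers on `static const` members of integral or enum type, so for a floating-point `DataT` the old declaration is ill-formed. nvcc accepted it anyway; clang rejects it, and `static constexpr` is the conforming spelling. A minimal illustration (the struct name here is made up):

```cpp
template <typename DataT>
struct Tile {
  // static const DataT Zero = (DataT)0;  // ill-formed when DataT is float/double
  static constexpr DataT Zero = (DataT)0;  // OK for any literal type
};

static_assert(Tile<float>::Zero == 0.0f, "constexpr member is a compile-time constant");
```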
12 changes: 7 additions & 5 deletions cpp/include/raft/linalg/detail/qr.cuh
@@ -22,6 +22,8 @@
 #include <rmm/device_scalar.hpp>
 #include <rmm/device_uvector.hpp>
 
+#include <algorithm>
+
 namespace raft {
 namespace linalg {
 namespace detail {
@@ -37,7 +39,7 @@ void qrGetQ(const raft::handle_t& handle,
 cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle();
 
 int m = n_rows, n = n_cols;
-int k = min(m, n);
+int k = std::min(m, n);
 RAFT_CUDA_TRY(cudaMemcpyAsync(Q, M, sizeof(math_t) * m * n, cudaMemcpyDeviceToDevice, stream));
 
 rmm::device_uvector<math_t> tau(k, stream);
@@ -70,8 +72,8 @@ void qrGetQR(const raft::handle_t& handle,
 
 int m = n_rows, n = n_cols;
 rmm::device_uvector<math_t> R_full(m * n, stream);
-rmm::device_uvector<math_t> tau(min(m, n), stream);
-RAFT_CUDA_TRY(cudaMemsetAsync(tau.data(), 0, sizeof(math_t) * min(m, n), stream));
+rmm::device_uvector<math_t> tau(std::min(m, n), stream);
+RAFT_CUDA_TRY(cudaMemsetAsync(tau.data(), 0, sizeof(math_t) * std::min(m, n), stream));
 int R_full_nrows = m, R_full_ncols = n;
 RAFT_CUDA_TRY(
 cudaMemcpyAsync(R_full.data(), M, sizeof(math_t) * m * n, cudaMemcpyDeviceToDevice, stream));
@@ -100,12 +102,12 @@ void qrGetQR(const raft::handle_t& handle,
 int Q_nrows = m, Q_ncols = n;
 
 RAFT_CUSOLVER_TRY(cusolverDnorgqr_bufferSize(
-cusolverH, Q_nrows, Q_ncols, min(Q_ncols, Q_nrows), Q, Q_nrows, tau.data(), &Lwork));
+cusolverH, Q_nrows, Q_ncols, std::min(Q_ncols, Q_nrows), Q, Q_nrows, tau.data(), &Lwork));
 workspace.resize(Lwork, stream);
 RAFT_CUSOLVER_TRY(cusolverDnorgqr(cusolverH,
 Q_nrows,
 Q_ncols,
-min(Q_ncols, Q_nrows),
+std::min(Q_ncols, Q_nrows),
 Q,
 Q_nrows,
 tau.data(),
2 changes: 1 addition & 1 deletion cpp/include/raft/linalg/detail/reduce_rows_by_key.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
8 changes: 5 additions & 3 deletions cpp/include/raft/linalg/detail/rsvd.cuh
@@ -26,6 +26,8 @@
 #include <raft/matrix/matrix.hpp>
 #include <raft/random/rng.hpp>
 
+#include <algorithm>
+
 namespace raft {
 namespace linalg {
 namespace detail {
@@ -386,9 +388,9 @@ void rsvdPerc(const raft::handle_t& handle,
 int max_sweeps,
 cudaStream_t stream)
 {
-int k = max((int)(min(n_rows, n_cols) * PC_perc),
-1); // Number of singular values to be computed
-int p = max((int)(min(n_rows, n_cols) * UpS_perc), 1); // Upsamples
+int k = std::max((int)(std::min(n_rows, n_cols) * PC_perc),
+1); // Number of singular values to be computed
+int p = std::max((int)(std::min(n_rows, n_cols) * UpS_perc), 1); // Upsamples
 rsvdFixedRank(handle,
 M,
 n_rows,
10 changes: 6 additions & 4 deletions cpp/include/raft/matrix/detail/linewise_op.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -20,6 +20,8 @@
 #include <raft/pow2_utils.cuh>
 #include <raft/vectorized.cuh>
 
+#include <algorithm>
+
 namespace raft {
 namespace matrix {
 namespace detail {
@@ -312,7 +314,7 @@ __global__ void __launch_bounds__(BlockSize)
 typedef Linewise<Type, IdxType, VecBytes, BlockSize> L;
 constexpr uint workSize = L::VecElems * BlockSize;
 uint workOffset = workSize;
-__shared__ alignas(sizeof(Type) * L::VecElems)
+__shared__ __align__(sizeof(Type) * L::VecElems)
 Type shm[workSize * ((sizeof...(Vecs)) > 1 ? 2 : 1)];
 const IdxType blockOffset = (arrOffset + BlockSize * L::VecElems * blockIdx.x) % rowLen;
 return L::vectorRows(
@@ -422,7 +424,7 @@ void matrixLinewiseVecCols(Type* out,
 const uint occupy = getOptimalGridSize<BlockSize>();
 // does not make sense to have more blocks than this
 const uint maxBlocks = raft::ceildiv<uint>(uint(alignedLen), bs.x * VecElems);
-const dim3 gs(min(maxBlocks, occupy), 1, 1);
+const dim3 gs(std::min(maxBlocks, occupy), 1, 1);
 // The work arrangement is blocked on the block and warp levels;
 // see more details at Linewise::vectorCols.
 // The value below determines how many scalar elements are processed by on thread in total.
@@ -482,7 +484,7 @@ void matrixLinewiseVecRows(Type* out,
 const uint expected_grid_size = rowLen / raft::gcd(block_work_size, uint(rowLen));
 // Minimum size of the grid to make the device well occupied
 const uint occupy = getOptimalGridSize<BlockSize>();
-const dim3 gs(min(
+const dim3 gs(std::min(
 // does not make sense to have more blocks than this
 raft::ceildiv<uint>(uint(totalLen), block_work_size),
 // increase the grid size to be not less than `occupy` while
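The `alignas` to `__align__` swap appears to be another portability fix: clang is stricter than nvcc about where a C++11 alignment specifier may sit in a declaration that also carries `__shared__`, whereas CUDA's `__align__(n)` macro (a wrapper over `__attribute__((aligned(n)))`) is accepted by both compilers in this position. A reduced sketch of the pattern (the kernel and buffer size are illustrative, not the PR's code):

```cuda
template <typename Type, int VecElems>
__global__ void linewise_kernel()
{
  // Align the shared buffer to the vector width so vectorized loads/stores
  // on it are legal; __align__ is the spelling both nvcc and clang accept.
  __shared__ __align__(sizeof(Type) * VecElems) Type shm[256 * VecElems];
  shm[threadIdx.x] = Type{};
}
```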
6 changes: 3 additions & 3 deletions cpp/include/raft/matrix/detail/matrix.cuh
@@ -220,7 +220,7 @@ template <typename m_t, typename idx_t = int>
 void copyUpperTriangular(m_t* src, m_t* dst, idx_t n_rows, idx_t n_cols, cudaStream_t stream)
 {
 idx_t m = n_rows, n = n_cols;
-idx_t k = min(m, n);
+idx_t k = std::min(m, n);
 dim3 block(64);
 dim3 grid((m * n + block.x - 1) / block.x);
 getUpperTriangular<<<grid, block, 0, stream>>>(src, dst, m, n, k);
@@ -246,7 +246,7 @@ template <typename m_t, typename idx_t = int>
 void initializeDiagonalMatrix(
 m_t* vec, m_t* matrix, idx_t n_rows, idx_t n_cols, cudaStream_t stream)
 {
-idx_t k = min(n_rows, n_cols);
+idx_t k = std::min(n_rows, n_cols);
 dim3 block(64);
 dim3 grid((k + block.x - 1) / block.x);
 copyVectorToMatrixDiagonal<<<grid, block, 0, stream>>>(vec, matrix, n_rows, n_cols, k);
@@ -285,4 +285,4 @@ m_t getL2Norm(const raft::handle_t& handle, m_t* in, idx_t size, cudaStream_t st
 
 } // end namespace detail
 } // end namespace matrix
-} // end namespace raft
\ No newline at end of file
+} // end namespace raft
2 changes: 1 addition & 1 deletion cpp/include/raft/random/detail/make_blobs.cuh
@@ -245,4 +245,4 @@ void make_blobs_caller(DataT* out,
 
 } // end namespace detail
 } // end namespace random
-} // end namespace raft
\ No newline at end of file
+} // end namespace raft
4 changes: 3 additions & 1 deletion cpp/include/raft/sparse/distance/detail/l2_distance.cuh
@@ -31,6 +31,8 @@
 
 #include <nvfunctional>
 
+#include <algorithm>
+
 namespace raft {
 namespace sparse {
 namespace distance {
@@ -411,7 +413,7 @@ class hellinger_expanded_distances_t : public distances_t<value_t> {
 
 void compute(value_t* out_dists)
 {
-rmm::device_uvector<value_idx> coo_rows(max(config_->b_nnz, config_->a_nnz),
+rmm::device_uvector<value_idx> coo_rows(std::max(config_->b_nnz, config_->a_nnz),
 config_->handle.get_stream());
 
 raft::sparse::convert::csr_to_coo(config_->b_indptr,
6 changes: 4 additions & 2 deletions cpp/include/raft/sparse/distance/detail/lp_distance.cuh
@@ -32,6 +32,8 @@
 
 #include <nvfunctional>
 
+#include <algorithm>
+
 namespace raft {
 namespace sparse {
 namespace distance {
@@ -48,7 +50,7 @@ void unexpanded_lp_distances(value_t* out_dists,
 accum_f accum_func,
 write_f write_func)
 {
-rmm::device_uvector<value_idx> coo_rows(max(config_->b_nnz, config_->a_nnz),
+rmm::device_uvector<value_idx> coo_rows(std::max(config_->b_nnz, config_->a_nnz),
 config_->handle.get_stream());
 
 raft::sparse::convert::csr_to_coo(config_->b_indptr,
@@ -283,7 +285,7 @@ class kl_divergence_unexpanded_distances_t : public distances_t<value_t> {
 
 void compute(value_t* out_dists)
 {
-rmm::device_uvector<value_idx> coo_rows(max(config_->b_nnz, config_->a_nnz),
+rmm::device_uvector<value_idx> coo_rows(std::max(config_->b_nnz, config_->a_nnz),
 config_->handle.get_stream());
 
 raft::sparse::convert::csr_to_coo(config_->b_indptr,
4 changes: 3 additions & 1 deletion cpp/include/raft/sparse/selection/detail/knn.cuh
@@ -31,6 +31,8 @@
 #include <raft/sparse/op/slice.hpp>
 #include <raft/spatial/knn/knn.hpp>
 
+#include <algorithm>
+
 namespace raft {
 namespace sparse {
 namespace selection {
@@ -354,7 +356,7 @@ class sparse_knn_t {
 
 // in the case where the number of idx rows in the batch is < k, we
 // want to adjust k.
-value_idx n_neighbors = min(k, batch_cols);
+value_idx n_neighbors = std::min(static_cast<value_idx>(k), batch_cols);
 
 bool ascending = true;
 if (metric == raft::distance::DistanceType::InnerProduct) ascending = false;
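The `static_cast` here is needed because `std::min` deduces a single template parameter from both arguments, while the old unqualified call could fall back to CUDA's mixed-type global `min` overloads under nvcc. A small self-contained sketch (`value_idx` is assumed to be a 64-bit index type for illustration):

```cpp
#include <algorithm>
#include <cstdint>

using value_idx = std::int64_t;  // assumption, not from the PR

value_idx clamp_neighbors(int k, value_idx batch_cols)
{
  // std::min(k, batch_cols) would not compile: int vs value_idx yields no
  // unique template argument, so the narrower operand is cast explicitly.
  return std::min(static_cast<value_idx>(k), batch_cols);
}
```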