Removing some relaxed constexpr requirements
cliffburdick committed May 31, 2024
1 parent 9d2ad17 commit 8b099d7
Showing 22 changed files with 85 additions and 89 deletions.
4 changes: 3 additions & 1 deletion include/matx/core/allocator.h
@@ -43,6 +43,8 @@

#include "matx/core/error.h"
#include "matx/core/nvtx.h"
+#include <cuda/std/__algorithm>
+#include <cuda/std/__algorithm>

#pragma once

@@ -203,7 +205,7 @@ struct MemTracker {
[[maybe_unused]] std::unique_lock lck(memory_mtx);
matxMemoryStats.currentBytesAllocated += bytes;
matxMemoryStats.totalBytesAllocated += bytes;
-matxMemoryStats.maxBytesAllocated = std::max(
+matxMemoryStats.maxBytesAllocated = cuda::std::max(
matxMemoryStats.maxBytesAllocated, matxMemoryStats.currentBytesAllocated);
allocationMap[*ptr] = {bytes, space, stream};
}
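The allocator change above is representative of most of this commit: std::max is only usable from device-side code when nvcc is given --expt-relaxed-constexpr, whereas cuda::std::max from libcu++ is declared for both host and device. A minimal sketch of the pattern, not taken from MatX (the helper name is ours), assuming the same libcu++ header the commit includes:

```cuda
#include <cuda/std/__algorithm>  // provides cuda::std::max / cuda::std::min, as included by this commit

// Hypothetical host/device helper in the spirit of the allocator change:
// track a running peak without std::max, which would need
// --expt-relaxed-constexpr to be callable from device code.
__host__ __device__ inline size_t track_peak(size_t current, size_t peak)
{
  return cuda::std::max(peak, current);
}
```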
20 changes: 0 additions & 20 deletions include/matx/core/half_complex.h
@@ -147,26 +147,6 @@ template <typename T> struct alignas(sizeof(T) * 2) matxHalfComplex {
return {x, y};
}

-/**
-* @brief std::complex<float> cast operator
-*
-* @return std::complex<float> value
-*/
-__MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ operator std::complex<float>()
-{
-return {x, y};
-}
-
-/**
-* @brief std::complex<double> cast operator
-*
-* @return std::complex<double> value
-*/
-__MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ operator std::complex<double>()
-{
-return {x, y};
-}
-
/**
* @brief Copy assignment operator
*
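Dropping the implicit std::complex<float> and std::complex<double> conversions removes a host-only standard-library type from a __MATX_HOST__ __MATX_DEVICE__ interface. Host code that still needs a std::complex value can convert explicitly; the helper below is a hedged sketch (the function name is ours, and only the x/y members come from matxHalfComplex):

```cuda
#include <complex>

// Hypothetical host-side conversion: build a std::complex<float> from the
// real (x) and imaginary (y) parts of a half-precision complex value.
template <typename HalfComplex>
std::complex<float> to_std_complex(const HalfComplex &v)
{
  return { static_cast<float>(v.x), static_cast<float>(v.y) };
}
```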
16 changes: 9 additions & 7 deletions include/matx/core/pybind.h
@@ -437,12 +437,14 @@ class MatXPybind {
}

template <typename TensorType,
-typename CT = matx_convert_complex_type<typename TensorType::scalar_type>>
+typename CT = matx_convert_cuda_complex_type<typename TensorType::scalar_type>>
std::optional<TestFailResult<CT>>
CompareOutput(const TensorType &ten,
const std::string fname, double thresh, bool debug = false)
{
-using ntype = matx_convert_complex_type<typename TensorType::scalar_type>;
+using raw_type = typename TensorType::scalar_type;
+using ntype = matx_convert_complex_type<raw_type>;
+using ctype = matx_convert_cuda_complex_type<raw_type>;
auto resobj = res_dict[fname.c_str()];
auto ften = pybind11::array_t<ntype>(resobj);
constexpr int RANK = TensorType::Rank();
@@ -453,7 +455,7 @@
auto file_val = ften.at();
auto ten_val = ConvertComplex(ten());
if (!CompareVals(ten_val, file_val, thresh, fname, debug)) {
-return TestFailResult<ntype>{Index2Str(0), "0", ten_val, file_val,
+return TestFailResult<ctype>{Index2Str(0), "0", ten_val, file_val,
thresh};
}
}
@@ -468,7 +470,7 @@
auto file_val = ften.at(s1, s2, s3, s4);
auto ten_val = ConvertComplex(ten(s1, s2, s3, s4));
if (!CompareVals(ten_val, file_val, thresh, fname, debug)) {
-return TestFailResult<ntype>{Index2Str(s1, s2, s3, s4),
+return TestFailResult<ctype>{Index2Str(s1, s2, s3, s4),
fname, ten_val, file_val,
thresh};
}
@@ -478,7 +480,7 @@
auto file_val = ften.at(s1, s2, s3);
auto ten_val = ConvertComplex(ten(s1, s2, s3));
if (!CompareVals(ten_val, file_val, thresh, fname, debug)) {
-return TestFailResult<ntype>{Index2Str(s1, s2, s3), fname,
+return TestFailResult<ctype>{Index2Str(s1, s2, s3), fname,
ten_val, file_val, thresh};
}
}
@@ -488,7 +490,7 @@
auto file_val = ften.at(s1, s2);
auto ten_val = ConvertComplex(ten(s1, s2));
if (!CompareVals(ten_val, file_val, thresh, fname, debug)) {
-return TestFailResult<ntype>{Index2Str(s1, s2), fname, ten_val,
+return TestFailResult<ctype>{Index2Str(s1, s2), fname, ten_val,
file_val, thresh};
}
}
@@ -498,7 +500,7 @@
auto file_val = ften.at(s1);
auto ten_val = ConvertComplex(ten(s1));
if (!CompareVals(ten_val, file_val, thresh, fname, debug)) {
-return TestFailResult<ntype>{Index2Str(s1), fname, ten_val,
+return TestFailResult<ctype>{Index2Str(s1), fname, ten_val,
file_val, thresh};
}
}
4 changes: 2 additions & 2 deletions include/matx/core/tensor_desc.h
@@ -373,15 +373,15 @@ class static_tensor_desc_t {
* @param dim Dimension to retrieve
* @return Size of dimension
*/
-static constexpr auto Size(int dim) { return shape_[dim]; }
+static constexpr __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ auto Size(int dim) { return shape_[dim]; }

/**
* @brief Get stride of dimension
*
* @param dim Dimension to retrieve
* @return Stride of dimension
*/
-static constexpr auto Stride(int dim) { return stride_[dim]; }
+static constexpr __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ auto Stride(int dim) { return stride_[dim]; }

/**
* @brief Return strides contaienr of descriptor
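Adding __MATX_HOST__ __MATX_DEVICE__ to the constexpr accessors is what keeps Size() and Stride() callable from kernels once relaxed constexpr is no longer assumed; under nvcc, constexpr by itself implies only a host function. A standalone sketch of the annotation pattern (the function and kernel are ours, not MatX's):

```cuda
// Without an execution-space qualifier, a constexpr function is host-only
// under nvcc unless --expt-relaxed-constexpr is passed. Adding the qualifiers
// makes it callable from device code with no extra flags.
__host__ __device__ constexpr int clamp_dim(int d, int rank)
{
  return d < rank ? d : rank - 1;
}

__global__ void use_clamp(int *out, int d)
{
  out[0] = clamp_dim(d, 4);  // compiles without relaxed constexpr
}
```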
2 changes: 1 addition & 1 deletion include/matx/core/tensor_utils.h
@@ -99,7 +99,7 @@ namespace matx

for (int i = 1; i < op.Rank(); i++)
{
-maxSize = std::max(op.Size(i), maxSize);
+maxSize = cuda::std::max(op.Size(i), maxSize);
}

return maxSize;
11 changes: 11 additions & 0 deletions include/matx/core/type_utils.h
@@ -800,11 +800,22 @@ struct complex_type_of
typename C::value_type>>> {
};

+template <class C>
+struct cuda_complex_type_of
+: identity<cuda::std::complex<std::conditional_t<is_complex_half_v<C>, float,
+typename C::value_type>>> {
+};
+
template <class C>
using matx_convert_complex_type =
typename std::conditional_t<!is_complex_v<C>, identity<C>,
complex_type_of<C>>::type;

+template <class C>
+using matx_convert_cuda_complex_type =
+typename std::conditional_t<!is_complex_v<C>, identity<C>,
+cuda_complex_type_of<C>>::type;
+

template <class T, class = void> struct value_type {
using type = T;
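The new trait mirrors matx_convert_complex_type but resolves to cuda::std::complex instead of std::complex, which is what the pybind.h change uses for TestFailResult. A hedged illustration of what it should yield, assuming these aliases sit in namespace matx alongside the existing traits:

```cuda
#include <complex>
#include <cuda/std/complex>
#include <type_traits>
#include "matx/core/type_utils.h"

// Illustration only: exercises both branches of the conditional above.
static_assert(std::is_same_v<
    matx::matx_convert_cuda_complex_type<float>, float>);      // non-complex types pass through
static_assert(std::is_same_v<
    matx::matx_convert_cuda_complex_type<std::complex<float>>,
    cuda::std::complex<float>>);                                // std::complex maps to cuda::std::complex
```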
16 changes: 8 additions & 8 deletions include/matx/kernels/channelize_poly.cuh
@@ -80,7 +80,7 @@ __global__ void ChannelizePoly1D(OutType output, InType input, FilterType filter

constexpr index_t ELEMS_PER_BLOCK = CHANNELIZE_POLY1D_ELEMS_PER_THREAD * THREADS;
const index_t first_out_elem = elem_block * CHANNELIZE_POLY1D_ELEMS_PER_THREAD * THREADS;
-const index_t last_out_elem = std::min(
+const index_t last_out_elem = cuda::std::min(
output_len_per_channel - 1, first_out_elem + ELEMS_PER_BLOCK - 1);

if (filter_phase_len <= SMEM_MAX_FILTER_TAPS) {
@@ -103,7 +103,7 @@ __global__ void ChannelizePoly1D(OutType output, InType input, FilterType filter

if (filter_phase_len <= SMEM_MAX_FILTER_TAPS) {
for (index_t t = first_out_elem+tid; t <= last_out_elem; t += THREADS) {
-const index_t first_ind = std::max(static_cast<index_t>(0), t - filter_phase_len + 1);
+const index_t first_ind = cuda::std::max(static_cast<index_t>(0), t - filter_phase_len + 1);
output_t accum {};
const filter_t *h = smem_filter;
// index_t in MatX should be signed (32 or 64 bit), so j-- below will not underflow
@@ -134,7 +134,7 @@ __global__ void ChannelizePoly1D(OutType output, InType input, FilterType filter
}
} else {
for (index_t t = first_out_elem+tid; t <= last_out_elem; t += THREADS) {
-index_t first_ind = std::max(static_cast<index_t>(0), t - filter_phase_len + 1);
+index_t first_ind = cuda::std::max(static_cast<index_t>(0), t - filter_phase_len + 1);
// If we use the last filter tap for this phase (which is the first index because
// the filter is flipped), then it may be a padded zero. If so, increment first_ind
// by 1 to avoid using the zero. This prevents a bounds-check in the inner loop.
@@ -227,7 +227,7 @@ __global__ void ChannelizePoly1D_Smem(OutType output, InType input, FilterType f
const uint32_t smem_input_height = filter_phase_len + by - 1;

const index_t start_elem = blockIdx.x * elems_per_channel_per_cta;
-const index_t last_elem = std::min(output_len_per_channel-1, (blockIdx.x+1) * elems_per_channel_per_cta - 1);
+const index_t last_elem = cuda::std::min(output_len_per_channel-1, (blockIdx.x+1) * elems_per_channel_per_cta - 1);
auto indims = BlockToIdx(input, blockIdx.z, 1);
auto outdims = BlockToIdx(output, blockIdx.z, 2);
outdims[ChannelRank] = chan;
@@ -256,7 +256,7 @@ __global__ void ChannelizePoly1D_Smem(OutType output, InType input, FilterType f
__syncthreads();

// Load next elems_per_channel_per_cta elements for each channel
-const index_t next_last_elem = std::min(next_start_elem + by - 1, last_elem);
+const index_t next_last_elem = cuda::std::min(next_start_elem + by - 1, last_elem);
const uint32_t out_samples_this_iter = static_cast<uint32_t>(next_last_elem - next_start_elem + 1);
if (ty < out_samples_this_iter) {
indims[InRank-1] = (next_start_elem + ty) * num_channels + chan;
@@ -286,7 +286,7 @@ __global__ void ChannelizePoly1D_Smem(OutType output, InType input, FilterType f
if (outdims[OutElemRank] <= last_elem) {
const filter_t *h = h_start;
output_t accum { 0 };
-const int first_end = std::min(cached_input_ind_tail + filter_phase_len - 1, smem_input_height - 1);
+const int first_end = cuda::std::min(cached_input_ind_tail + filter_phase_len - 1, smem_input_height - 1);
// The footprint of samples involved in the convolution may wrap from the end
// to the beginning of smem_input. The prologue below handles the samples from
// the current tail to the end of smem_input and the epilogue starts back at the
@@ -342,7 +342,7 @@ __global__ void ChannelizePoly1D_FusedChan(OutType output, InType input, FilterT

constexpr index_t ELEMS_PER_BLOCK = CHANNELIZE_POLY1D_ELEMS_PER_THREAD * THREADS;
const index_t first_out_elem = elem_block * CHANNELIZE_POLY1D_ELEMS_PER_THREAD * THREADS;
-const index_t last_out_elem = std::min(
+const index_t last_out_elem = cuda::std::min(
output_len_per_channel - 1, first_out_elem + ELEMS_PER_BLOCK - 1);

// Pre-compute the DFT complex exponentials and store in shared memory
@@ -371,7 +371,7 @@ __global__ void ChannelizePoly1D_FusedChan(OutType output, InType input, FilterT
for (int i = 0; i < NUM_CHAN; i++) {
accum[i] = static_cast<output_t>(0);
}
-index_t first_ind = std::max(static_cast<index_t>(0), t - filter_phase_len + 1);
+index_t first_ind = cuda::std::max(static_cast<index_t>(0), t - filter_phase_len + 1);
indims[InRank-1] = t * NUM_CHAN + NUM_CHAN - 1;
index_t j_start = t;
index_t h_ind { 0 };
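Inside these __global__ kernels the swap is required rather than stylistic: without --expt-relaxed-constexpr, calls to std::min and std::max do not compile in device code at all. A tiny sketch of the clamping idiom used for last_out_elem above (the kernel and names are ours):

```cuda
#include <cuda/std/__algorithm>

// Hypothetical kernel: clamp the last element a block handles to the end of
// the signal, the same way the channelizer kernels bound last_out_elem.
__global__ void clamp_block_range(long long len, long long per_block, long long *last)
{
  const long long first = static_cast<long long>(blockIdx.x) * per_block;
  last[blockIdx.x] = cuda::std::min(len - 1, first + per_block - 1);
}
```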
2 changes: 1 addition & 1 deletion include/matx/kernels/filter.cuh
@@ -23,7 +23,7 @@

#define COMPLEX_TYPE cuComplex

-// std::max/min isn't working on template value parameters
+// cuda::std::max/min isn't working on template value parameters
#define MAX(a, b) ((a) < (b) ? (b) : (a))
#define MIN(a, b) ((a) < (b) ? (a) : (b))

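The comment edit only keeps the note accurate after the namespace switch; the MAX/MIN macros stay because the places they are used need plain integral constant expressions built from non-type template parameters, where the function form has been reported not to work. A hedged sketch of that kind of use (the kernel is ours, not filter.cuh's):

```cuda
#define MAX(a, b) ((a) < (b) ? (b) : (a))

// Hypothetical use: a shared-memory extent must be an integral constant
// expression, and the macro keeps it one when built from template value
// parameters. Launch with blockDim.x <= MAX(A, B).
template <int A, int B>
__global__ void scratch_kernel(float *out)
{
  __shared__ float scratch[MAX(A, B)];
  scratch[threadIdx.x] = static_cast<float>(threadIdx.x);
  __syncthreads();
  if (threadIdx.x == 0) {
    out[blockIdx.x] = scratch[MAX(A, B) - 1];
  }
}
```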
26 changes: 13 additions & 13 deletions include/matx/kernels/resample_poly.cuh
@@ -189,7 +189,7 @@ __global__ void ResamplePoly1D_PhaseBlock(OutType output, InType input, FilterTy
const index_t max_input_ind = input_len - 1;

const index_t start_ind = phase_ind + up * (tid + elem_block * elems_per_thread * THREADS);
-const index_t last_ind = std::min(output_len - 1, start_ind + elems_per_thread * THREADS * up);
+const index_t last_ind = cuda::std::min(output_len - 1, start_ind + elems_per_thread * THREADS * up);
for (index_t out_ind = start_ind; out_ind <= last_ind; out_ind += THREADS * up) {
// out_ind is the index in the output array and up_ind = out_ind * down is the
// corresponding index in the upsampled array
@@ -203,9 +203,9 @@ __global__ void ResamplePoly1D_PhaseBlock(OutType output, InType input, FilterTy
// of valid samples before input_ind. In the case that the filter is not
// long enough to include input_ind, last_filter_ind is left_filter_ind - up
// and thus left_h_ind and prologue are both -1.
-const index_t prologue = std::min(input_ind, left_h_ind);
+const index_t prologue = cuda::std::min(input_ind, left_h_ind);
// epilogue is the number of valid samples after input_ind.
-const index_t epilogue = std::min(max_input_ind - input_ind, max_h_epilogue);
+const index_t epilogue = cuda::std::min(max_input_ind - input_ind, max_h_epilogue);
// n is the number of valid samples. If input_ind is not valid because it
// precedes the reach of the filter, then prologue = -1 and n is just the
// epilogue.
@@ -302,12 +302,12 @@ __global__ void ResamplePoly1D_ElemBlock(OutType output, InType input, FilterTyp
// whether or not the filter has been loaded to shared memory.
const index_t filter_central_tap = (filter_len-1)/2;
const index_t start_ind = elem_block * elems_per_thread * THREADS + tid;
-const index_t last_ind = std::min(output_len - 1, start_ind + (elems_per_thread-1) * THREADS);
+const index_t last_ind = cuda::std::min(output_len - 1, start_ind + (elems_per_thread-1) * THREADS);
if (load_filter_to_smem) {
for (index_t out_ind = start_ind; out_ind <= last_ind; out_ind += THREADS) {
const index_t up_ind = out_ind * down;
-const index_t up_start = std::max(static_cast<index_t>(0), up_ind - filter_len_half);
-const index_t up_end = std::min(max_input_ind * up, up_ind + filter_len_half);
+const index_t up_start = cuda::std::max(static_cast<index_t>(0), up_ind - filter_len_half);
+const index_t up_end = cuda::std::min(max_input_ind * up, up_ind + filter_len_half);
const index_t x_start = (up_start + up - 1) / up;
index_t x_end = up_end / up;
// Since the filter is in shared memory, we can narrow the index type to 32 bits
@@ -333,8 +333,8 @@ __global__ void ResamplePoly1D_ElemBlock(OutType output, InType input, FilterTyp
} else {
for (index_t out_ind = start_ind; out_ind <= last_ind; out_ind += THREADS) {
const index_t up_ind = out_ind * down;
-const index_t up_start = std::max(static_cast<index_t>(0), up_ind - filter_len_half);
-const index_t up_end = std::min(max_input_ind * up, up_ind + filter_len_half);
+const index_t up_start = cuda::std::max(static_cast<index_t>(0), up_ind - filter_len_half);
+const index_t up_end = cuda::std::min(max_input_ind * up, up_ind + filter_len_half);
const index_t x_start = (up_start + up - 1) / up;
index_t x_end = up_end / up;
index_t h_ind = filter_central_tap + (up_ind - up*x_start);
@@ -409,12 +409,12 @@ __global__ void ResamplePoly1D_WarpCentric(OutType output, InType input, FilterT
const index_t filter_len_half = filter_len/2;
const index_t filter_central_tap = (filter_len-1)/2;
const index_t start_ind = elem_block * elems_per_warp * NUM_WARPS;
-const index_t last_ind = std::min(output_len - 1, start_ind + elems_per_warp * NUM_WARPS - 1);
+const index_t last_ind = cuda::std::min(output_len - 1, start_ind + elems_per_warp * NUM_WARPS - 1);
if (load_filter_to_smem) {
for (index_t out_ind = start_ind+warp_id; out_ind <= last_ind; out_ind += NUM_WARPS) {
const index_t up_ind = out_ind * down;
-const index_t up_start = std::max(static_cast<index_t>(0), up_ind - filter_len_half);
-const index_t up_end = std::min(max_input_ind * up, up_ind + filter_len_half);
+const index_t up_start = cuda::std::max(static_cast<index_t>(0), up_ind - filter_len_half);
+const index_t up_end = cuda::std::min(max_input_ind * up, up_ind + filter_len_half);
const index_t x_start = (up_start + up - 1) / up;
index_t x_end = up_end / up;
// Since the filter is in shared memory, we can narrow the index type to 32 bits
@@ -449,8 +449,8 @@ __global__ void ResamplePoly1D_WarpCentric(OutType output, InType input, FilterT
} else {
for (index_t out_ind = start_ind+warp_id; out_ind <= last_ind; out_ind += NUM_WARPS) {
const index_t up_ind = out_ind * down;
-const index_t up_start = std::max(static_cast<index_t>(0), up_ind - filter_len_half);
-const index_t up_end = std::min(max_input_ind * up, up_ind + filter_len_half);
+const index_t up_start = cuda::std::max(static_cast<index_t>(0), up_ind - filter_len_half);
+const index_t up_end = cuda::std::min(max_input_ind * up, up_ind + filter_len_half);
const index_t x_start = (up_start + up - 1) / up;
index_t x_end = up_end / up;
index_t h_ind = filter_central_tap + (up_ind - up*x_start);
20 changes: 10 additions & 10 deletions include/matx/operators/conv.h
@@ -46,7 +46,7 @@ namespace matx
private:
using out_t = std::conditional_t<is_complex_v<typename OpA::scalar_type>,
typename OpA::scalar_type, typename OpB::scalar_type>;
-constexpr static int max_rank = std::max(OpA::Rank(), OpB::Rank());
+constexpr static int max_rank = cuda::std::max(OpA::Rank(), OpB::Rank());
OpA a_;
OpB b_;
matxConvCorrMode_t mode_;
@@ -82,8 +82,8 @@ namespace matx
for (int r = 0; r < Rank(); r++) {
const int axis = perm[r];
if (axis == Rank() - 1) {
-max_axis = std::max(a_.Size(r), b_.Size(r));
-min_axis = std::min(a_.Size(r), b_.Size(r));
+max_axis = cuda::std::max(a_.Size(r), b_.Size(r));
+min_axis = cuda::std::min(a_.Size(r), b_.Size(r));

if (mode_ == MATX_C_MODE_FULL) {
out_dims_[axis] = a_.Size(r) + b_.Size(r) - 1;
@@ -112,8 +112,8 @@ namespace matx
}
}

-max_axis = std::max(a_.Size(OpA::Rank()-1), b_.Size(OpB::Rank()-1));
-min_axis = std::min(a_.Size(OpA::Rank()-1), b_.Size(OpB::Rank()-1));
+max_axis = cuda::std::max(a_.Size(OpA::Rank()-1), b_.Size(OpB::Rank()-1));
+min_axis = cuda::std::min(a_.Size(OpA::Rank()-1), b_.Size(OpB::Rank()-1));

if (mode_ == MATX_C_MODE_FULL) {
out_dims_[max_rank-1] = max_axis + min_axis - 1;
@@ -231,7 +231,7 @@ namespace detail {
private:
using out_t = std::conditional_t<is_complex_v<typename OpA::scalar_type>,
typename OpA::scalar_type, typename OpB::scalar_type>;
-constexpr static int max_rank = std::max(OpA::Rank(), OpB::Rank());
+constexpr static int max_rank = cuda::std::max(OpA::Rank(), OpB::Rank());
OpA a_;
OpB b_;
matxConvCorrMode_t mode_;
@@ -257,8 +257,8 @@ namespace detail {
for (int r = 0; r < Rank(); r++) {
const int axis = perm[r];
if (axis >= Rank() - 2) {
-const auto max_axis = std::max(a_.Size(r), b_.Size(r));
-const auto min_axis = std::min(a_.Size(r), b_.Size(r));
+const auto max_axis = cuda::std::max(a_.Size(r), b_.Size(r));
+const auto min_axis = cuda::std::min(a_.Size(r), b_.Size(r));
if (mode_ == MATX_C_MODE_FULL) {
out_dims_[axis] = a_.Size(r) + b_.Size(r) - 1;
}
@@ -287,8 +287,8 @@ namespace detail {
}

for (int r = max_rank - 2; r < max_rank; r++) {
-const auto max_axis = std::max(a_.Size(r), b_.Size(r));
-const auto min_axis = std::min(a_.Size(r), b_.Size(r));
+const auto max_axis = cuda::std::max(a_.Size(r), b_.Size(r));
+const auto min_axis = cuda::std::min(a_.Size(r), b_.Size(r));
if (mode_ == MATX_C_MODE_FULL) {
out_dims_[r] = max_axis + min_axis - 1;
}
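cuda::std::max is also constexpr, so compile-time uses such as max_rank above keep working without any relaxed-constexpr flag. A minimal sketch (the rank functions are made-up stand-ins for OpA::Rank() and OpB::Rank()):

```cuda
#include <cuda/std/__algorithm>

constexpr int RankA() { return 2; }  // hypothetical stand-in for OpA::Rank()
constexpr int RankB() { return 3; }  // hypothetical stand-in for OpB::Rank()

struct conv_like_op {
  // Mirrors conv.h: the maximum operator rank is computed at compile time.
  constexpr static int max_rank = cuda::std::max(RankA(), RankB());
};

static_assert(conv_like_op::max_rank == 3, "usable in constant expressions");
```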