Skip to content

Commit

Permalink
Minimize usage of cub::Traits
Browse files Browse the repository at this point in the history
* Replace all uses of cub::Traits with numeric_limits, except for radix sort key twiddling
* Drop obsolete specializations of cub::NumericTraits
* Fix radix sort custom type example mentioning non-existent cub::RadixTraits
* Replace cub::BaseTraits and cub::Traits with aliases so users can no longer specialize them
* Deprecate cub::Traits::Max|Lowest
* Extend documentation of trait classes

Fixes: NVIDIA#920
  • Loading branch information
bernhardmgruber committed Feb 20, 2025
1 parent 5da16cd commit b9f1b42
Show file tree
Hide file tree
Showing 11 changed files with 77 additions and 148 deletions.
7 changes: 0 additions & 7 deletions c2h/include/c2h/bfloat16.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -266,13 +266,6 @@ public:
};
_LIBCUDACXX_END_NAMESPACE_STD

_CCCL_SUPPRESS_DEPRECATED_PUSH
template <>
struct CUB_NS_QUALIFIER::NumericTraits<bfloat16_t>
: CUB_NS_QUALIFIER::BaseTraits<FLOATING_POINT, unsigned short, bfloat16_t>
{};
_CCCL_SUPPRESS_DEPRECATED_POP

#ifdef __GNUC__
# pragma GCC diagnostic pop
#endif
6 changes: 0 additions & 6 deletions c2h/include/c2h/half.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -361,12 +361,6 @@ public:
};
_LIBCUDACXX_END_NAMESPACE_STD

_CCCL_SUPPRESS_DEPRECATED_PUSH
template <>
struct CUB_NS_QUALIFIER::NumericTraits<half_t> : CUB_NS_QUALIFIER::BaseTraits<FLOATING_POINT, unsigned short, half_t>
{};
_CCCL_SUPPRESS_DEPRECATED_POP

#ifdef __GNUC__
# pragma GCC diagnostic pop
#endif
19 changes: 1 addition & 18 deletions c2h/include/c2h/test_util_vec.h
Original file line number Diff line number Diff line change
Expand Up @@ -289,7 +289,7 @@ C2H_VEC_OVERLOAD(ulonglong, unsigned long long)
C2H_VEC_OVERLOAD(float, float)
C2H_VEC_OVERLOAD(double, double)

// Specialize cub::NumericTraits and cuda::std::numeric_limits for vector types.
// Specialize cuda::std::numeric_limits for vector types.

# define REPEAT_TO_LIST_1(a) a
# define REPEAT_TO_LIST_2(a) a, a
Expand All @@ -298,23 +298,6 @@ C2H_VEC_OVERLOAD(double, double)
# define REPEAT_TO_LIST(N, a) _CCCL_PP_CAT(REPEAT_TO_LIST_, N)(a)

# define C2H_VEC_TRAITS_OVERLOAD_IMPL(T, BaseT, N) \
CUB_NAMESPACE_BEGIN \
template <> \
struct NumericTraits<T> \
{ \
static __host__ __device__ T Max() \
{ \
T retval = {REPEAT_TO_LIST(N, NumericTraits<BaseT>::Max())}; \
return retval; \
} \
static __host__ __device__ T Lowest() \
{ \
T retval = {REPEAT_TO_LIST(N, NumericTraits<BaseT>::Lowest())}; \
return retval; \
} \
}; \
CUB_NAMESPACE_END \
\
_LIBCUDACXX_BEGIN_NAMESPACE_STD \
template <> \
class numeric_limits<T> \
Expand Down
5 changes: 4 additions & 1 deletion cub/benchmarks/bench/reduce/arg_extrema.cu
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include <cub/device/device_reduce.cuh>
#include <cub/device/dispatch/dispatch_streaming_reduce.cuh>

#include <cuda/std/limits>
#include <cuda/std/type_traits>

#include <nvbench_helper.cuh>
Expand Down Expand Up @@ -57,7 +58,9 @@ struct policy_hub_t
// Type used for the final result
using output_tuple_t = cub::KeyValuePair<global_offset_t, T>;

auto const init = ::cuda::std::is_same_v<OpT, cub::ArgMin> ? cub::Traits<T>::Max() : cub::Traits<T>::Lowest();
auto const init = ::cuda::std::is_same_v<OpT, cub::ArgMin>
? ::cuda::std::numeric_limits<T>::max()
: ::cuda::std::numeric_limits<T>::lowest();

#if !TUNE_BASE
using policy_t = policy_hub_t<output_tuple_t, per_partition_offset_t>;
Expand Down
23 changes: 10 additions & 13 deletions cub/cub/device/device_reduce.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@

#include <thrust/iterator/tabulate_output_iterator.h>

#include <cuda/std/limits>

#include <iterator>

CUB_NAMESPACE_BEGIN
Expand Down Expand Up @@ -334,7 +336,7 @@ struct DeviceReduce
//! @rst
//! Computes a device-wide minimum using the less-than (``<``) operator.
//!
//! - Uses ``std::numeric_limits<T>::max()`` as the initial value of the reduction.
//! - Uses ``::cuda::std::numeric_limits<T>::max()`` as the initial value of the reduction.
//! - Does not support ``<`` operators that are non-commutative.
//! - Provides "run-to-run" determinism for pseudo-associative reduction
//! (e.g., addition of floating point types) on the same GPU device.
Expand Down Expand Up @@ -433,8 +435,7 @@ struct DeviceReduce
d_out,
static_cast<OffsetT>(num_items),
::cuda::minimum<>{},
// TODO(bgruber): replace with ::cuda::std::numeric_limits<T>::max() (breaking change)
Traits<InitT>::Max(),
::cuda::std::numeric_limits<InitT>::max(),
stream);
}

Expand Down Expand Up @@ -583,7 +584,7 @@ struct DeviceReduce
//! (assuming the value type of ``d_in`` is ``T``)
//!
//! - The minimum is written to ``d_out.value`` and its offset in the input array is written to ``d_out.key``.
//! - The ``{1, std::numeric_limits<T>::max()}`` tuple is produced for zero-length inputs
//! - The ``{1, ::cuda::std::numeric_limits<T>::max()}`` tuple is produced for zero-length inputs
//!
//! - Does not support ``<`` operators that are non-commutative.
//! - Provides "run-to-run" determinism for pseudo-associative reduction
Expand Down Expand Up @@ -690,8 +691,7 @@ struct DeviceReduce
ArgIndexInputIteratorT d_indexed_in(d_in);

// Initial value
// TODO Address https://github.com/NVIDIA/cub/issues/651
InitT initial_value{AccumT(1, Traits<InputValueT>::Max())};
InitT initial_value{AccumT(1, ::cuda::std::numeric_limits<InputValueT>::max())};

return DispatchReduce<ArgIndexInputIteratorT, OutputIteratorT, OffsetT, cub::ArgMin, InitT, AccumT>::Dispatch(
d_temp_storage, temp_storage_bytes, d_indexed_in, d_out, num_items, cub::ArgMin(), initial_value, stream);
Expand All @@ -700,7 +700,7 @@ struct DeviceReduce
//! @rst
//! Computes a device-wide maximum using the greater-than (``>``) operator.
//!
//! - Uses ``std::numeric_limits<T>::lowest()`` as the initial value of the reduction.
//! - Uses ``::cuda::std::numeric_limits<T>::lowest()`` as the initial value of the reduction.
//! - Does not support ``>`` operators that are non-commutative.
//! - Provides "run-to-run" determinism for pseudo-associative reduction
//! (e.g., addition of floating point types) on the same GPU device.
Expand Down Expand Up @@ -796,8 +796,7 @@ struct DeviceReduce
d_out,
static_cast<OffsetT>(num_items),
::cuda::maximum<>{},
// TODO(bgruber): replace with ::cuda::std::numeric_limits<T>::lowest() (breaking change)
Traits<InitT>::Lowest(),
::cuda::std::numeric_limits<InitT>::lowest(),
stream);
}

Expand Down Expand Up @@ -948,7 +947,7 @@ struct DeviceReduce
//!
//! - The maximum is written to ``d_out.value`` and its offset in the input
//! array is written to ``d_out.key``.
//! - The ``{1, std::numeric_limits<T>::lowest()}`` tuple is produced for zero-length inputs
//! - The ``{1, ::cuda::std::numeric_limits<T>::lowest()}`` tuple is produced for zero-length inputs
//!
//! - Does not support ``>`` operators that are non-commutative.
//! - Provides "run-to-run" determinism for pseudo-associative reduction
Expand Down Expand Up @@ -1057,9 +1056,7 @@ struct DeviceReduce
ArgIndexInputIteratorT d_indexed_in(d_in);

// Initial value
// TODO Address https://github.com/NVIDIA/cub/issues/651
// TODO(bgruber): replace with ::cuda::std::numeric_limits<T>::lowest() (breaking change)
InitT initial_value{AccumT(1, Traits<InputValueT>::Lowest())};
InitT initial_value{AccumT(1, ::cuda::std::numeric_limits<InputValueT>::lowest())};

return DispatchReduce<ArgIndexInputIteratorT, OutputIteratorT, OffsetT, cub::ArgMax, InitT, AccumT>::Dispatch(
d_temp_storage, temp_storage_bytes, d_indexed_in, d_out, num_items, cub::ArgMax(), initial_value, stream);
Expand Down
21 changes: 9 additions & 12 deletions cub/cub/device/device_segmented_reduce.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@
#include <cub/iterator/arg_index_input_iterator.cuh>
#include <cub/util_type.cuh>

#include <cuda/std/limits>
#include <cuda/std/type_traits>

#include <iterator>
Expand Down Expand Up @@ -392,7 +393,7 @@ public:
//! @rst
//! Computes a device-wide segmented minimum using the less-than (``<``) operator.
//!
//! - Uses ``std::numeric_limits<T>::max()`` as the initial value of the reduction for each segment.
//! - Uses ``::cuda::std::numeric_limits<T>::max()`` as the initial value of the reduction for each segment.
//! - When input a contiguous sequence of segments, a single sequence
//! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased for both
//! the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where the latter is
Expand Down Expand Up @@ -508,8 +509,7 @@ public:
d_begin_offsets,
d_end_offsets,
::cuda::minimum<>{},
// TODO(bgruber): replace with ::cuda::std::numeric_limits<T>::max() (breaking change)
Traits<InputT>::Max(),
::cuda::std::numeric_limits<InputT>::max(),
stream);
}

Expand All @@ -522,7 +522,7 @@ public:
//!
//! - The minimum of the *i*\ :sup:`th` segment is written to
//! ``d_out[i].value`` and its offset in that segment is written to ``d_out[i].key``.
//! - The ``{1, std::numeric_limits<T>::max()}`` tuple is produced for zero-length inputs
//! - The ``{1, ::cuda::std::numeric_limits<T>::max()}`` tuple is produced for zero-length inputs
//!
//! - When input a contiguous sequence of segments, a single sequence
//! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased for both
Expand Down Expand Up @@ -636,8 +636,7 @@ public:
ArgIndexInputIteratorT d_indexed_in(d_in);

// Initial value
// TODO Address https://github.com/NVIDIA/cub/issues/651
InitT initial_value{AccumT(1, Traits<InputValueT>::Max())};
InitT initial_value{AccumT(1, ::cuda::std::numeric_limits<InputValueT>::max())};

using integral_offset_check = ::cuda::std::is_integral<OffsetT>;
static_assert(integral_offset_check::value, "Offset iterator value type should be integral.");
Expand Down Expand Up @@ -666,7 +665,7 @@ public:
//! @rst
//! Computes a device-wide segmented maximum using the greater-than (``>``) operator.
//!
//! - Uses ``std::numeric_limits<T>::lowest()`` as the initial value of the reduction.
//! - Uses ``::cuda::std::numeric_limits<T>::lowest()`` as the initial value of the reduction.
//! - When input a contiguous sequence of segments, a single sequence
//! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased
//! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where
Expand Down Expand Up @@ -771,8 +770,7 @@ public:
d_begin_offsets,
d_end_offsets,
::cuda::maximum<>{},
// TODO(bgruber): replace with ::cuda::std::numeric_limits<T>::lowest() (breaking change)
Traits<InputT>::Lowest(),
::cuda::std::numeric_limits<InputT>::lowest(),
stream);
}

Expand All @@ -785,7 +783,7 @@ public:
//!
//! - The maximum of the *i*\ :sup:`th` segment is written to
//! ``d_out[i].value`` and its offset in that segment is written to ``d_out[i].key``.
//! - The ``{1, std::numeric_limits<T>::lowest()}`` tuple is produced for zero-length inputs
//! - The ``{1, ::cuda::std::numeric_limits<T>::lowest()}`` tuple is produced for zero-length inputs
//!
//! - When input a contiguous sequence of segments, a single sequence
//! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased
Expand Down Expand Up @@ -902,8 +900,7 @@ public:
ArgIndexInputIteratorT d_indexed_in(d_in);

// Initial value
// TODO Address https://github.com/NVIDIA/cub/issues/651
InitT initial_value{AccumT(1, Traits<InputValueT>::Lowest())};
InitT initial_value{AccumT(1, ::cuda::std::numeric_limits<InputValueT>::lowest())};

using integral_offset_check = ::cuda::std::is_integral<OffsetT>;
static_assert(integral_offset_check::value, "Offset iterator value type should be integral.");
Expand Down
Loading

0 comments on commit b9f1b42

Please sign in to comment.