Skip to content

Commit

Permalink
Minimize usage of cub::Traits
Browse files Browse the repository at this point in the history
* Replace all uses of cub::Traits with numeric_limits, except for radix sort key twiddling
* Drop obsolete specializations of cub::NumericTraits
* Fix radix sort custom type example mentioning non-existent cub::RadixTraits
* Replace cub::BaseTraits and cub::Traits with aliases so users can no longer specialize them
* Deprecate cub::Traits::Max|Lowest
* Extend documentation of trait classes

Fixes: NVIDIA#920
  • Loading branch information
bernhardmgruber committed Feb 20, 2025
1 parent 5da16cd commit b9f1b42
Show file tree
Hide file tree
Showing 11 changed files with 77 additions and 148 deletions.
7 changes: 0 additions & 7 deletions c2h/include/c2h/bfloat16.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -266,13 +266,6 @@ public:
};
_LIBCUDACXX_END_NAMESPACE_STD

_CCCL_SUPPRESS_DEPRECATED_PUSH
template <>
struct CUB_NS_QUALIFIER::NumericTraits<bfloat16_t>
: CUB_NS_QUALIFIER::BaseTraits<FLOATING_POINT, unsigned short, bfloat16_t>
{};
_CCCL_SUPPRESS_DEPRECATED_POP

#ifdef __GNUC__
# pragma GCC diagnostic pop
#endif
6 changes: 0 additions & 6 deletions c2h/include/c2h/half.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -361,12 +361,6 @@ public:
};
_LIBCUDACXX_END_NAMESPACE_STD

_CCCL_SUPPRESS_DEPRECATED_PUSH
template <>
struct CUB_NS_QUALIFIER::NumericTraits<half_t> : CUB_NS_QUALIFIER::BaseTraits<FLOATING_POINT, unsigned short, half_t>
{};
_CCCL_SUPPRESS_DEPRECATED_POP

#ifdef __GNUC__
# pragma GCC diagnostic pop
#endif
19 changes: 1 addition & 18 deletions c2h/include/c2h/test_util_vec.h
Original file line number Diff line number Diff line change
Expand Up @@ -289,7 +289,7 @@ C2H_VEC_OVERLOAD(ulonglong, unsigned long long)
C2H_VEC_OVERLOAD(float, float)
C2H_VEC_OVERLOAD(double, double)

// Specialize cub::NumericTraits and cuda::std::numeric_limits for vector types.
// Specialize cuda::std::numeric_limits for vector types.

# define REPEAT_TO_LIST_1(a) a
# define REPEAT_TO_LIST_2(a) a, a
Expand All @@ -298,23 +298,6 @@ C2H_VEC_OVERLOAD(double, double)
# define REPEAT_TO_LIST(N, a) _CCCL_PP_CAT(REPEAT_TO_LIST_, N)(a)

# define C2H_VEC_TRAITS_OVERLOAD_IMPL(T, BaseT, N) \
CUB_NAMESPACE_BEGIN \
template <> \
struct NumericTraits<T> \
{ \
static __host__ __device__ T Max() \
{ \
T retval = {REPEAT_TO_LIST(N, NumericTraits<BaseT>::Max())}; \
return retval; \
} \
static __host__ __device__ T Lowest() \
{ \
T retval = {REPEAT_TO_LIST(N, NumericTraits<BaseT>::Lowest())}; \
return retval; \
} \
}; \
CUB_NAMESPACE_END \
\
_LIBCUDACXX_BEGIN_NAMESPACE_STD \
template <> \
class numeric_limits<T> \
Expand Down
5 changes: 4 additions & 1 deletion cub/benchmarks/bench/reduce/arg_extrema.cu
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include <cub/device/device_reduce.cuh>
#include <cub/device/dispatch/dispatch_streaming_reduce.cuh>

#include <cuda/std/limits>
#include <cuda/std/type_traits>

#include <nvbench_helper.cuh>
Expand Down Expand Up @@ -57,7 +58,9 @@ struct policy_hub_t
// Type used for the final result
using output_tuple_t = cub::KeyValuePair<global_offset_t, T>;

auto const init = ::cuda::std::is_same_v<OpT, cub::ArgMin> ? cub::Traits<T>::Max() : cub::Traits<T>::Lowest();
auto const init = ::cuda::std::is_same_v<OpT, cub::ArgMin>
? ::cuda::std::numeric_limits<T>::max()
: ::cuda::std::numeric_limits<T>::lowest();

#if !TUNE_BASE
using policy_t = policy_hub_t<output_tuple_t, per_partition_offset_t>;
Expand Down
23 changes: 10 additions & 13 deletions cub/cub/device/device_reduce.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@

#include <thrust/iterator/tabulate_output_iterator.h>

#include <cuda/std/limits>

#include <iterator>

CUB_NAMESPACE_BEGIN
Expand Down Expand Up @@ -334,7 +336,7 @@ struct DeviceReduce
//! @rst
//! Computes a device-wide minimum using the less-than (``<``) operator.
//!
//! - Uses ``std::numeric_limits<T>::max()`` as the initial value of the reduction.
//! - Uses ``::cuda::std::numeric_limits<T>::max()`` as the initial value of the reduction.
//! - Does not support ``<`` operators that are non-commutative.
//! - Provides "run-to-run" determinism for pseudo-associative reduction
//! (e.g., addition of floating point types) on the same GPU device.
Expand Down Expand Up @@ -433,8 +435,7 @@ struct DeviceReduce
d_out,
static_cast<OffsetT>(num_items),
::cuda::minimum<>{},
// TODO(bgruber): replace with ::cuda::std::numeric_limits<T>::max() (breaking change)
Traits<InitT>::Max(),
::cuda::std::numeric_limits<InitT>::max(),
stream);
}

Expand Down Expand Up @@ -583,7 +584,7 @@ struct DeviceReduce
//! (assuming the value type of ``d_in`` is ``T``)
//!
//! - The minimum is written to ``d_out.value`` and its offset in the input array is written to ``d_out.key``.
//! - The ``{1, std::numeric_limits<T>::max()}`` tuple is produced for zero-length inputs
//! - The ``{1, ::cuda::std::numeric_limits<T>::max()}`` tuple is produced for zero-length inputs
//!
//! - Does not support ``<`` operators that are non-commutative.
//! - Provides "run-to-run" determinism for pseudo-associative reduction
Expand Down Expand Up @@ -690,8 +691,7 @@ struct DeviceReduce
ArgIndexInputIteratorT d_indexed_in(d_in);

// Initial value
// TODO Address https://github.com/NVIDIA/cub/issues/651
InitT initial_value{AccumT(1, Traits<InputValueT>::Max())};
InitT initial_value{AccumT(1, ::cuda::std::numeric_limits<InputValueT>::max())};

return DispatchReduce<ArgIndexInputIteratorT, OutputIteratorT, OffsetT, cub::ArgMin, InitT, AccumT>::Dispatch(
d_temp_storage, temp_storage_bytes, d_indexed_in, d_out, num_items, cub::ArgMin(), initial_value, stream);
Expand All @@ -700,7 +700,7 @@ struct DeviceReduce
//! @rst
//! Computes a device-wide maximum using the greater-than (``>``) operator.
//!
//! - Uses ``std::numeric_limits<T>::lowest()`` as the initial value of the reduction.
//! - Uses ``::cuda::std::numeric_limits<T>::lowest()`` as the initial value of the reduction.
//! - Does not support ``>`` operators that are non-commutative.
//! - Provides "run-to-run" determinism for pseudo-associative reduction
//! (e.g., addition of floating point types) on the same GPU device.
Expand Down Expand Up @@ -796,8 +796,7 @@ struct DeviceReduce
d_out,
static_cast<OffsetT>(num_items),
::cuda::maximum<>{},
// TODO(bgruber): replace with ::cuda::std::numeric_limits<T>::lowest() (breaking change)
Traits<InitT>::Lowest(),
::cuda::std::numeric_limits<InitT>::lowest(),
stream);
}

Expand Down Expand Up @@ -948,7 +947,7 @@ struct DeviceReduce
//!
//! - The maximum is written to ``d_out.value`` and its offset in the input
//! array is written to ``d_out.key``.
//! - The ``{1, std::numeric_limits<T>::lowest()}`` tuple is produced for zero-length inputs
//! - The ``{1, ::cuda::std::numeric_limits<T>::lowest()}`` tuple is produced for zero-length inputs
//!
//! - Does not support ``>`` operators that are non-commutative.
//! - Provides "run-to-run" determinism for pseudo-associative reduction
Expand Down Expand Up @@ -1057,9 +1056,7 @@ struct DeviceReduce
ArgIndexInputIteratorT d_indexed_in(d_in);

// Initial value
// TODO Address https://github.com/NVIDIA/cub/issues/651
// TODO(bgruber): replace with ::cuda::std::numeric_limits<T>::lowest() (breaking change)
InitT initial_value{AccumT(1, Traits<InputValueT>::Lowest())};
InitT initial_value{AccumT(1, ::cuda::std::numeric_limits<InputValueT>::lowest())};

return DispatchReduce<ArgIndexInputIteratorT, OutputIteratorT, OffsetT, cub::ArgMax, InitT, AccumT>::Dispatch(
d_temp_storage, temp_storage_bytes, d_indexed_in, d_out, num_items, cub::ArgMax(), initial_value, stream);
Expand Down
21 changes: 9 additions & 12 deletions cub/cub/device/device_segmented_reduce.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@
#include <cub/iterator/arg_index_input_iterator.cuh>
#include <cub/util_type.cuh>

#include <cuda/std/limits>
#include <cuda/std/type_traits>

#include <iterator>
Expand Down Expand Up @@ -392,7 +393,7 @@ public:
//! @rst
//! Computes a device-wide segmented minimum using the less-than (``<``) operator.
//!
//! - Uses ``std::numeric_limits<T>::max()`` as the initial value of the reduction for each segment.
//! - Uses ``::cuda::std::numeric_limits<T>::max()`` as the initial value of the reduction for each segment.
//! - When input a contiguous sequence of segments, a single sequence
//! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased for both
//! the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where the latter is
Expand Down Expand Up @@ -508,8 +509,7 @@ public:
d_begin_offsets,
d_end_offsets,
::cuda::minimum<>{},
// TODO(bgruber): replace with ::cuda::std::numeric_limits<T>::max() (breaking change)
Traits<InputT>::Max(),
::cuda::std::numeric_limits<InputT>::max(),
stream);
}

Expand All @@ -522,7 +522,7 @@ public:
//!
//! - The minimum of the *i*\ :sup:`th` segment is written to
//! ``d_out[i].value`` and its offset in that segment is written to ``d_out[i].key``.
//! - The ``{1, std::numeric_limits<T>::max()}`` tuple is produced for zero-length inputs
//! - The ``{1, ::cuda::std::numeric_limits<T>::max()}`` tuple is produced for zero-length inputs
//!
//! - When input a contiguous sequence of segments, a single sequence
//! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased for both
Expand Down Expand Up @@ -636,8 +636,7 @@ public:
ArgIndexInputIteratorT d_indexed_in(d_in);

// Initial value
// TODO Address https://github.com/NVIDIA/cub/issues/651
InitT initial_value{AccumT(1, Traits<InputValueT>::Max())};
InitT initial_value{AccumT(1, ::cuda::std::numeric_limits<InputValueT>::max())};

using integral_offset_check = ::cuda::std::is_integral<OffsetT>;
static_assert(integral_offset_check::value, "Offset iterator value type should be integral.");
Expand Down Expand Up @@ -666,7 +665,7 @@ public:
//! @rst
//! Computes a device-wide segmented maximum using the greater-than (``>``) operator.
//!
//! - Uses ``std::numeric_limits<T>::lowest()`` as the initial value of the reduction.
//! - Uses ``::cuda::std::numeric_limits<T>::lowest()`` as the initial value of the reduction.
//! - When input a contiguous sequence of segments, a single sequence
//! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased
//! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where
Expand Down Expand Up @@ -771,8 +770,7 @@ public:
d_begin_offsets,
d_end_offsets,
::cuda::maximum<>{},
// TODO(bgruber): replace with ::cuda::std::numeric_limits<T>::lowest() (breaking change)
Traits<InputT>::Lowest(),
::cuda::std::numeric_limits<InputT>::lowest(),
stream);
}

Expand All @@ -785,7 +783,7 @@ public:
//!
//! - The maximum of the *i*\ :sup:`th` segment is written to
//! ``d_out[i].value`` and its offset in that segment is written to ``d_out[i].key``.
//! - The ``{1, std::numeric_limits<T>::lowest()}`` tuple is produced for zero-length inputs
//! - The ``{1, ::cuda::std::numeric_limits<T>::lowest()}`` tuple is produced for zero-length inputs
//!
//! - When input a contiguous sequence of segments, a single sequence
//! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased
Expand Down Expand Up @@ -902,8 +900,7 @@ public:
ArgIndexInputIteratorT d_indexed_in(d_in);

// Initial value
// TODO Address https://github.com/NVIDIA/cub/issues/651
InitT initial_value{AccumT(1, Traits<InputValueT>::Lowest())};
InitT initial_value{AccumT(1, ::cuda::std::numeric_limits<InputValueT>::lowest())};

using integral_offset_check = ::cuda::std::is_integral<OffsetT>;
static_assert(integral_offset_check::value, "Offset iterator value type should be integral.");
Expand Down
Loading

0 comments on commit b9f1b42

Please sign in to comment.