From cedaee6a979240a1ee58a3d92e2c5cdfab163329 Mon Sep 17 00:00:00 2001
From: Georgy Evtushenko
Date: Tue, 31 May 2022 15:38:55 +0400
Subject: [PATCH 1/4] Reformat reduce docs

---
 cub/device/device_reduce.cuh | 1652 ++++++++++++++++++++--------------
 1 file changed, 978 insertions(+), 674 deletions(-)

diff --git a/cub/device/device_reduce.cuh b/cub/device/device_reduce.cuh
index 9f70a111a4..98335521da 100644
--- a/cub/device/device_reduce.cuh
+++ b/cub/device/device_reduce.cuh
@@ -1,7 +1,6 @@
-
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill. All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -14,10 +13,10 @@
 *     names of its contributors may be used to endorse or promote products
 *     derived from this software without specific prior written permission.
 *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
@@ -28,706 +27,1011 @@
 ******************************************************************************/

 /**
- * \file
- * cub::DeviceReduce provides device-wide, parallel operations for computing a reduction across a sequence of data items residing within device-accessible memory.
+ * @file cub::DeviceReduce provides device-wide, parallel operations for
+ *       computing a reduction across a sequence of data items residing within
+ *       device-accessible memory.
 */

 #pragma once

-#include <stdio.h>
 #include <iterator>
 #include <limits>

-#include "../iterator/arg_index_input_iterator.cuh"
-#include "dispatch/dispatch_reduce.cuh"
-#include "dispatch/dispatch_reduce_by_key.cuh"
-#include "../config.cuh"
+#include <cub/config.cuh>
+#include <cub/device/dispatch/dispatch_reduce.cuh>
+#include <cub/device/dispatch/dispatch_reduce_by_key.cuh>
+#include <cub/iterator/arg_index_input_iterator.cuh>

 CUB_NAMESPACE_BEGIN

 /**
- * \brief DeviceReduce provides device-wide, parallel operations for computing a reduction across a sequence of data items residing within device-accessible memory. ![](reduce_logo.png)
- * \ingroup SingleModule
+ * @brief DeviceReduce provides device-wide, parallel operations for computing
+ *        a reduction across a sequence of data items residing within
+ *        device-accessible memory. ![](reduce_logo.png)
+ * @ingroup SingleModule
 *
- * \par Overview
- * A reduction (or fold)
- * uses a binary combining operator to compute a single aggregate from a sequence of input elements.
+ * @par Overview
+ * A *reduction*
+ * (or *fold*) uses a binary combining operator to compute a single aggregate
+ * from a sequence of input elements.
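+ *
+ * @par
+ * As a point of reference only, a device-wide reduction computes the same
+ * result as the following sequential sketch, where `op` and `init` stand in
+ * for the binary combining operator and initial value (the parallel
+ * evaluation order differs):
+ * @code
+ * T result = init;
+ * for (int i = 0; i < num_items; ++i)
+ * {
+ *     result = op(result, d_in[i]);
+ * }
+ * // result holds the reduction aggregate
+ * @endcode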
* - * \par Usage Considerations - * \cdp_class{DeviceReduce} + * @par Usage Considerations + * @cdp_class{DeviceReduce} * - * \par Performance - * \linear_performance{reduction, reduce-by-key, and run-length encode} + * @par Performance + * @linear_performance{reduction, reduce-by-key, and run-length encode} * - * \par + * @par * The following chart illustrates DeviceReduce::Sum * performance across different CUDA architectures for \p int32 keys. * - * \image html reduce_int32.png + * @image html reduce_int32.png * - * \par + * @par * The following chart illustrates DeviceReduce::ReduceByKey (summation) - * performance across different CUDA architectures for \p fp32 - * values. Segments are identified by \p int32 keys, and have lengths uniformly sampled from [1,1000]. + * performance across different CUDA architectures for `fp32` values. Segments + * are identified by `int32` keys, and have lengths uniformly sampled + * from `[1, 1000]`. * - * \image html reduce_by_key_fp32_len_500.png + * @image html reduce_by_key_fp32_len_500.png * - * \par - * \plots_below + * @par + * @plots_below * */ struct DeviceReduce { - /** - * \brief Computes a device-wide reduction using the specified binary \p reduction_op functor and initial value \p init. - * - * \par - * - Does not support binary reduction operators that are non-commutative. - * - Provides "run-to-run" determinism for pseudo-associative reduction - * (e.g., addition of floating point types) on the same GPU device. - * However, results for pseudo-associative reduction may be inconsistent - * from one device to a another device of a different compute-capability - * because CUB can employ different tile-sizing for different architectures. - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates a user-defined min-reduction of a device vector of \p int data elements. - * \par - * \code - * #include // or equivalently - * - * // CustomMin functor - * struct CustomMin - * { - * template - * __device__ __forceinline__ - * T operator()(const T &a, const T &b) const { - * return (b < a) ? b : a; - * } - * }; - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_items; // e.g., 7 - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_out; // e.g., [-] - * CustomMin min_op; - * int init; // e.g., INT_MAX - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, min_op, init); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run reduction - * cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, min_op, init); - * - * // d_out <-- [0] - * - * \endcode - * - * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator - * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate \iterator - * \tparam ReductionOpT [inferred] Binary reduction functor type having member T operator()(const T &a, const T &b) - * \tparam T [inferred] Data element type that is convertible to the \p value type of \p InputIteratorT - */ - template < - typename InputIteratorT, - typename OutputIteratorT, - typename ReductionOpT, - typename T> - CUB_RUNTIME_FUNCTION - static cudaError_t Reduce( - void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. 
When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output aggregate - int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) - ReductionOpT reduction_op, ///< [in] Binary reduction functor - T init, ///< [in] Initial value of the reduction - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. - { - // Signed integer type for global offsets - typedef int OffsetT; - - return DispatchReduce::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - num_items, - reduction_op, - init, - stream, - debug_synchronous); - } - - - /** - * \brief Computes a device-wide sum using the addition (\p +) operator. - * - * \par - * - Uses \p 0 as the initial value of the reduction. - * - Does not support \p + operators that are non-commutative.. - * - Provides "run-to-run" determinism for pseudo-associative reduction - * (e.g., addition of floating point types) on the same GPU device. - * However, results for pseudo-associative reduction may be inconsistent - * from one device to a another device of a different compute-capability - * because CUB can employ different tile-sizing for different architectures. - * - \devicestorage - * - * \par Performance - * The following charts illustrate saturated sum-reduction performance across different - * CUDA architectures for \p int32 and \p int64 items, respectively. - * - * \image html reduce_int32.png - * \image html reduce_int64.png - * - * \par Snippet - * The code snippet below illustrates the sum-reduction of a device vector of \p int data elements. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_items; // e.g., 7 - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_out; // e.g., [-] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sum-reduction - * cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); - * - * // d_out <-- [38] - * - * \endcode - * - * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator - * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate \iterator - */ - template < - typename InputIteratorT, - typename OutputIteratorT> - CUB_RUNTIME_FUNCTION - static cudaError_t Sum( - void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
- size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output aggregate - int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. - { - // Signed integer type for global offsets - using OffsetT = int; - - // The output value type - using OutputT = - cub::detail::non_void_value_t>; - - return DispatchReduce::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - num_items, - cub::Sum(), - OutputT(), // zero-initialize - stream, - debug_synchronous); - } - - - /** - * \brief Computes a device-wide minimum using the less-than ('<') operator. - * - * \par - * - Uses std::numeric_limits::max() as the initial value of the reduction. - * - Does not support \p < operators that are non-commutative. - * - Provides "run-to-run" determinism for pseudo-associative reduction - * (e.g., addition of floating point types) on the same GPU device. - * However, results for pseudo-associative reduction may be inconsistent - * from one device to a another device of a different compute-capability - * because CUB can employ different tile-sizing for different architectures. - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the min-reduction of a device vector of \p int data elements. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_items; // e.g., 7 - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_out; // e.g., [-] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run min-reduction - * cub::DeviceReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); - * - * // d_out <-- [0] - * - * \endcode - * - * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator - * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate \iterator - */ - template < - typename InputIteratorT, - typename OutputIteratorT> - CUB_RUNTIME_FUNCTION - static cudaError_t Min( - void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output aggregate - int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. 
- bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. - { - // Signed integer type for global offsets - using OffsetT = int; - - // The input value type - using InputT = cub::detail::value_t; - - return DispatchReduce::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - num_items, - cub::Min(), - Traits::Max(), // replace with std::numeric_limits::max() when C++11 support is more prevalent - stream, - debug_synchronous); - } - - - /** - * \brief Finds the first device-wide minimum using the less-than ('<') operator, also returning the index of that item. - * - * \par - * - The output value type of \p d_out is cub::KeyValuePair (assuming the value type of \p d_in is \p T) - * - The minimum is written to d_out.value and its offset in the input array is written to d_out.key. - * - The {1, std::numeric_limits::max()} tuple is produced for zero-length inputs - * - Does not support \p < operators that are non-commutative. - * - Provides "run-to-run" determinism for pseudo-associative reduction - * (e.g., addition of floating point types) on the same GPU device. - * However, results for pseudo-associative reduction may be inconsistent - * from one device to a another device of a different compute-capability - * because CUB can employ different tile-sizing for different architectures. - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the argmin-reduction of a device vector of \p int data elements. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_items; // e.g., 7 - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * KeyValuePair *d_out; // e.g., [{-,-}] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_argmin, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run argmin-reduction - * cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_argmin, num_items); - * - * // d_out <-- [{5, 0}] - * - * \endcode - * - * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items (of some type \p T) \iterator - * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate (having value type cub::KeyValuePair) \iterator - */ - template < - typename InputIteratorT, - typename OutputIteratorT> - CUB_RUNTIME_FUNCTION - static cudaError_t ArgMin( - void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output aggregate - int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. 
Also causes launch configurations to be printed to the console. Default is \p false. - { - // Signed integer type for global offsets - using OffsetT = int; - - // The input type - using InputValueT = cub::detail::value_t; - - // The output tuple type - using OutputTupleT = - cub::detail::non_void_value_t>; - - // The output value type - using OutputValueT = typename OutputTupleT::Value; - - // Wrapped input iterator to produce index-value tuples - using ArgIndexInputIteratorT = - ArgIndexInputIterator; - - ArgIndexInputIteratorT d_indexed_in(d_in); - - // Initial value - OutputTupleT initial_value(1, Traits::Max()); // replace with std::numeric_limits::max() when C++11 support is more prevalent - - return DispatchReduce::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_indexed_in, - d_out, - num_items, - cub::ArgMin(), - initial_value, - stream, - debug_synchronous); - } - - - /** - * \brief Computes a device-wide maximum using the greater-than ('>') operator. - * - * \par - * - Uses std::numeric_limits::lowest() as the initial value of the reduction. - * - Does not support \p > operators that are non-commutative. - * - Provides "run-to-run" determinism for pseudo-associative reduction - * (e.g., addition of floating point types) on the same GPU device. - * However, results for pseudo-associative reduction may be inconsistent - * from one device to a another device of a different compute-capability - * because CUB can employ different tile-sizing for different architectures. - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the max-reduction of a device vector of \p int data elements. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_items; // e.g., 7 - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_out; // e.g., [-] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_max, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run max-reduction - * cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_max, num_items); - * - * // d_out <-- [9] - * - * \endcode - * - * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator - * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate \iterator - */ - template < - typename InputIteratorT, - typename OutputIteratorT> - CUB_RUNTIME_FUNCTION - static cudaError_t Max( - void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output aggregate - int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
- { - // Signed integer type for global offsets - using OffsetT = int; - - // The input value type - using InputT = cub::detail::value_t; - - return DispatchReduce::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - num_items, - cub::Max(), - Traits::Lowest(), // replace with std::numeric_limits::lowest() when C++11 support is more prevalent - stream, - debug_synchronous); - } - - - /** - * \brief Finds the first device-wide maximum using the greater-than ('>') operator, also returning the index of that item - * - * \par - * - The output value type of \p d_out is cub::KeyValuePair (assuming the value type of \p d_in is \p T) - * - The maximum is written to d_out.value and its offset in the input array is written to d_out.key. - * - The {1, std::numeric_limits::lowest()} tuple is produced for zero-length inputs - * - Does not support \p > operators that are non-commutative. - * - Provides "run-to-run" determinism for pseudo-associative reduction - * (e.g., addition of floating point types) on the same GPU device. - * However, results for pseudo-associative reduction may be inconsistent - * from one device to a another device of a different compute-capability - * because CUB can employ different tile-sizing for different architectures. - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the argmax-reduction of a device vector of \p int data elements. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_items; // e.g., 7 - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * KeyValuePair *d_out; // e.g., [{-,-}] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_argmax, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run argmax-reduction - * cub::DeviceReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_argmax, num_items); - * - * // d_out <-- [{6, 9}] - * - * \endcode - * - * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items (of some type \p T) \iterator - * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate (having value type cub::KeyValuePair) \iterator - */ - template < - typename InputIteratorT, - typename OutputIteratorT> - CUB_RUNTIME_FUNCTION - static cudaError_t ArgMax( - void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output aggregate - int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
- { - // Signed integer type for global offsets - using OffsetT = int; - - // The input type - using InputValueT = cub::detail::value_t; - - // The output tuple type - using OutputTupleT = - cub::detail::non_void_value_t>; - - // The output value type - using OutputValueT = typename OutputTupleT::Value; - - // Wrapped input iterator to produce index-value tuples - using ArgIndexInputIteratorT = - ArgIndexInputIterator; - - ArgIndexInputIteratorT d_indexed_in(d_in); - - // Initial value - OutputTupleT initial_value(1, Traits::Lowest()); // replace with std::numeric_limits::lowest() when C++11 support is more prevalent - - return DispatchReduce::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_indexed_in, - d_out, - num_items, - cub::ArgMax(), - initial_value, - stream, - debug_synchronous); - } - - - /** - * \brief Reduces segments of values, where segments are demarcated by corresponding runs of identical keys. - * - * \par - * This operation computes segmented reductions within \p d_values_in using - * the specified binary \p reduction_op functor. The segments are identified by - * "runs" of corresponding keys in \p d_keys_in, where runs are maximal ranges of - * consecutive, identical keys. For the ith run encountered, - * the first key of the run and the corresponding value aggregate of that run are - * written to d_unique_out[i] and d_aggregates_out[i], - * respectively. The total number of runs encountered is written to \p d_num_runs_out. - * - * \par - * - The == equality operator is used to determine whether keys are equivalent - * - Provides "run-to-run" determinism for pseudo-associative reduction - * (e.g., addition of floating point types) on the same GPU device. - * However, results for pseudo-associative reduction may be inconsistent - * from one device to a another device of a different compute-capability - * because CUB can employ different tile-sizing for different architectures. - * - \devicestorage - * - * \par Performance - * The following chart illustrates reduction-by-key (sum) performance across - * different CUDA architectures for \p fp32 and \p fp64 values, respectively. Segments - * are identified by \p int32 keys, and have lengths uniformly sampled from [1,1000]. - * - * \image html reduce_by_key_fp32_len_500.png - * \image html reduce_by_key_fp64_len_500.png - * - * \par - * The following charts are similar, but with segment lengths uniformly sampled from [1,10]: - * - * \image html reduce_by_key_fp32_len_5.png - * \image html reduce_by_key_fp64_len_5.png - * - * \par Snippet - * The code snippet below illustrates the segmented reduction of \p int values grouped - * by runs of associated \p int keys. - * \par - * \code - * #include // or equivalently - * - * // CustomMin functor - * struct CustomMin - * { - * template - * CUB_RUNTIME_FUNCTION __forceinline__ - * T operator()(const T &a, const T &b) const { - * return (b < a) ? b : a; - * } - * }; - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_items; // e.g., 8 - * int *d_keys_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8] - * int *d_values_in; // e.g., [0, 7, 1, 6, 2, 5, 3, 4] - * int *d_unique_out; // e.g., [-, -, -, -, -, -, -, -] - * int *d_aggregates_out; // e.g., [-, -, -, -, -, -, -, -] - * int *d_num_runs_out; // e.g., [-] - * CustomMin reduction_op; - * ... 
- * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_unique_out, d_values_in, d_aggregates_out, d_num_runs_out, reduction_op, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run reduce-by-key - * cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_unique_out, d_values_in, d_aggregates_out, d_num_runs_out, reduction_op, num_items); - * - * // d_unique_out <-- [0, 2, 9, 5, 8] - * // d_aggregates_out <-- [0, 1, 6, 2, 4] - * // d_num_runs_out <-- [5] - * - * \endcode - * - * \tparam KeysInputIteratorT [inferred] Random-access input iterator type for reading input keys \iterator - * \tparam UniqueOutputIteratorT [inferred] Random-access output iterator type for writing unique output keys \iterator - * \tparam ValuesInputIteratorT [inferred] Random-access input iterator type for reading input values \iterator - * \tparam AggregatesOutputIterator [inferred] Random-access output iterator type for writing output value aggregates \iterator - * \tparam NumRunsOutputIteratorT [inferred] Output iterator type for recording the number of runs encountered \iterator - * \tparam ReductionOpT [inferred] Binary reduction functor type having member T operator()(const T &a, const T &b) - */ - template < - typename KeysInputIteratorT, - typename UniqueOutputIteratorT, - typename ValuesInputIteratorT, - typename AggregatesOutputIteratorT, - typename NumRunsOutputIteratorT, - typename ReductionOpT> - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t ReduceByKey( - void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - KeysInputIteratorT d_keys_in, ///< [in] Pointer to the input sequence of keys - UniqueOutputIteratorT d_unique_out, ///< [out] Pointer to the output sequence of unique keys (one key per run) - ValuesInputIteratorT d_values_in, ///< [in] Pointer to the input sequence of corresponding values - AggregatesOutputIteratorT d_aggregates_out, ///< [out] Pointer to the output sequence of value aggregates (one aggregate per run) - NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to total number of runs encountered (i.e., the length of d_unique_out) - ReductionOpT reduction_op, ///< [in] Binary reduction functor - int num_items, ///< [in] Total number of associated key+value pairs (i.e., the length of \p d_in_keys and \p d_in_values) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 
- {
- // Signed integer type for global offsets
- typedef int OffsetT;
-
- // FlagT iterator type (not used)
-
- // Selection op (not used)
-
- // Default == operator
- typedef Equality EqualityOp;
-
- return DispatchReduceByKey<KeysInputIteratorT, UniqueOutputIteratorT, ValuesInputIteratorT, AggregatesOutputIteratorT, NumRunsOutputIteratorT, EqualityOp, ReductionOpT, OffsetT>::Dispatch(
- d_temp_storage,
- temp_storage_bytes,
- d_keys_in,
- d_unique_out,
- d_values_in,
- d_aggregates_out,
- d_num_runs_out,
- EqualityOp(),
- reduction_op,
- num_items,
- stream,
- debug_synchronous);
- }
+ /**
+ * @brief Computes a device-wide reduction using the specified binary
+ * `reduction_op` functor and initial value `init`.
+ *
+ * @par
+ * - Does not support binary reduction operators that are non-commutative.
+ * - Provides "run-to-run" determinism for pseudo-associative reduction
+ * (e.g., addition of floating point types) on the same GPU device.
+ * However, results for pseudo-associative reduction may be inconsistent
+ * from one device to a another device of a different compute-capability
+ * because CUB can employ different tile-sizing for different architectures.
+ * - @devicestorage
+ *
+ * @par Snippet
+ * The code snippet below illustrates a user-defined min-reduction of a
+ * device vector of `int` data elements.
+ * @par
+ * @code
+ * #include <cub/cub.cuh>
+ * // or equivalently <cub/device/device_reduce.cuh>
+ *
+ * // CustomMin functor
+ * struct CustomMin
+ * {
+ *     template <typename T>
+ *     __device__ __forceinline__
+ *     T operator()(const T &a, const T &b) const {
+ *         return (b < a) ? b : a;
+ *     }
+ * };
+ *
+ * // Declare, allocate, and initialize device-accessible pointers for
+ * // input and output
+ * int num_items; // e.g., 7
+ * int *d_in;     // e.g., [8, 6, 7, 5, 3, 0, 9]
+ * int *d_out;    // e.g., [-]
+ * CustomMin min_op;
+ * int init;      // e.g., INT_MAX
+ * ...
+ *
+ * // Determine temporary device storage requirements
+ * void *d_temp_storage = NULL;
+ * size_t temp_storage_bytes = 0;
+ * cub::DeviceReduce::Reduce(
+ *   d_temp_storage, temp_storage_bytes,
+ *   d_in, d_out, num_items, min_op, init);
+ *
+ * // Allocate temporary storage
+ * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+ *
+ * // Run reduction
+ * cub::DeviceReduce::Reduce(
+ *   d_temp_storage, temp_storage_bytes,
+ *   d_in, d_out, num_items, min_op, init);
+ *
+ * // d_out <-- [0]
+ * @endcode
+ *
+ * @tparam InputIteratorT
+ * **[inferred]** Random-access input iterator type for reading input
+ * items \iterator
+ *
+ * @tparam OutputIteratorT
+ * **[inferred]** Output iterator type for recording the reduced
+ * aggregate \iterator
+ *
+ * @tparam ReductionOpT
+ * **[inferred]** Binary reduction functor type having member
+ * `T operator()(const T &a, const T &b)`
+ *
+ * @tparam T
+ * **[inferred]** Data element type that is convertible to the `value` type
+ * of `InputIteratorT`
+ *
+ * @param[in] d_temp_storage
+ * Device-accessible allocation of temporary storage. When `nullptr`, the
+ * required allocation size is written to `temp_storage_bytes` and no work
+ * is done.
+ *
+ * @param[in,out] temp_storage_bytes
+ * Reference to size in bytes of `d_temp_storage` allocation
+ *
+ * @param[in] d_in
+ * Pointer to the input sequence of data items
+ *
+ * @param[out] d_out
+ * Pointer to the output aggregate
+ *
+ * @param[in] num_items
+ * Total number of input items (i.e., length of `d_in`)
+ *
+ * @param[in] reduction_op
+ * Binary reduction functor
+ *
+ * @param[in] init
+ * Initial value of the reduction
+ *
+ * @param[in] stream
+ * **[optional]** CUDA stream to launch kernels within.
+ * Default is stream0.
+ *
+ * @param[in] debug_synchronous
+ * **[optional]** Whether or not to synchronize the stream after every
+ * kernel launch to check for errors. Also causes launch configurations to
+ * be printed to the console. Default is `false`.
+ */
+ template <typename InputIteratorT, typename OutputIteratorT, typename ReductionOpT, typename T>
+ CUB_RUNTIME_FUNCTION static cudaError_t Reduce(void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_in,
+ OutputIteratorT d_out,
+ int num_items,
+ ReductionOpT reduction_op,
+ T init,
+ cudaStream_t stream = 0,
+ bool debug_synchronous = false)
+ {
+ // Signed integer type for global offsets
+ using OffsetT = int;
+
+ return DispatchReduce<InputIteratorT, OutputIteratorT, OffsetT, ReductionOpT>::Dispatch(d_temp_storage,
+ temp_storage_bytes,
+ d_in,
+ d_out,
+ num_items,
+ reduction_op,
+ init,
+ stream,
+ debug_synchronous);
+ }
+
+ /**
+ * @brief Computes a device-wide sum using the addition (`+`) operator.
+ *
+ * @par
+ * - Uses `0` as the initial value of the reduction.
+ * - Does not support `+` operators that are non-commutative.
+ * - Provides "run-to-run" determinism for pseudo-associative reduction
+ * (e.g., addition of floating point types) on the same GPU device.
+ * However, results for pseudo-associative reduction may be inconsistent
+ * from one device to a another device of a different compute-capability
+ * because CUB can employ different tile-sizing for different architectures.
+ * - @devicestorage
+ *
+ * @par Performance
+ * The following charts illustrate saturated sum-reduction performance across
+ * different CUDA architectures for `int32` and `int64` items, respectively.
+ *
+ * @image html reduce_int32.png
+ * @image html reduce_int64.png
+ *
+ * @par Snippet
+ * The code snippet below illustrates the sum-reduction of a device vector
+ * of `int` data elements.
+ * @par
+ * @code
+ * #include <cub/cub.cuh>
+ * // or equivalently <cub/device/device_reduce.cuh>
+ *
+ * // Declare, allocate, and initialize device-accessible pointers
+ * // for input and output
+ * int num_items; // e.g., 7
+ * int *d_in;     // e.g., [8, 6, 7, 5, 3, 0, 9]
+ * int *d_out;    // e.g., [-]
+ * ...
+ *
+ * // Determine temporary device storage requirements
+ * void *d_temp_storage = NULL;
+ * size_t temp_storage_bytes = 0;
+ * cub::DeviceReduce::Sum(
+ *   d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+ *
+ * // Allocate temporary storage
+ * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+ *
+ * // Run sum-reduction
+ * cub::DeviceReduce::Sum(
+ *   d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+ *
+ * // d_out <-- [38]
+ * @endcode
+ *
+ * @tparam InputIteratorT
+ * **[inferred]** Random-access input iterator type for reading input
+ * items \iterator
+ *
+ * @tparam OutputIteratorT
+ * **[inferred]** Output iterator type for recording the reduced
+ * aggregate \iterator
+ *
+ * @param[in] d_temp_storage
+ * Device-accessible allocation of temporary storage. When `nullptr`, the
+ * required allocation size is written to `temp_storage_bytes` and no work
+ * is done.
+ *
+ * @param[in,out] temp_storage_bytes
+ * Reference to size in bytes of `d_temp_storage` allocation
+ *
+ * @param[in] d_in
+ * Pointer to the input sequence of data items
+ *
+ * @param[out] d_out
+ * Pointer to the output aggregate
+ *
+ * @param[in] num_items
+ * Total number of input items (i.e., length of `d_in`)
+ *
+ * @param[in] stream
+ * **[optional]** CUDA stream to launch kernels within.
+ * Default is stream0.
+ *
+ * @param[in] debug_synchronous
+ * **[optional]** Whether or not to synchronize the stream after every
+ * kernel launch to check for errors. Also causes launch configurations to
+ * be printed to the console. Default is `false`.
+ *
+ */
+ template <typename InputIteratorT, typename OutputIteratorT>
+ CUB_RUNTIME_FUNCTION static cudaError_t Sum(void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_in,
+ OutputIteratorT d_out,
+ int num_items,
+ cudaStream_t stream = 0,
+ bool debug_synchronous = false)
+ {
+ // Signed integer type for global offsets
+ using OffsetT = int;
+
+ // The output value type
+ using OutputT =
+ cub::detail::non_void_value_t<OutputIteratorT, cub::detail::value_t<InputIteratorT>>;
+
+ return DispatchReduce<InputIteratorT, OutputIteratorT, OffsetT, cub::Sum>::
+ Dispatch(d_temp_storage,
+ temp_storage_bytes,
+ d_in,
+ d_out,
+ num_items,
+ cub::Sum(),
+ OutputT(), // zero-initialize
+ stream,
+ debug_synchronous);
+ }
+
+ /**
+ * @brief Computes a device-wide minimum using the less-than ('<') operator.
+ *
+ * @par
+ * - Uses `std::numeric_limits<T>::max()` as the initial value of the reduction.
+ * - Does not support `<` operators that are non-commutative.
+ * - Provides "run-to-run" determinism for pseudo-associative reduction
+ * (e.g., addition of floating point types) on the same GPU device.
+ * However, results for pseudo-associative reduction may be inconsistent
+ * from one device to a another device of a different compute-capability
+ * because CUB can employ different tile-sizing for different architectures.
+ * - @devicestorage
+ *
+ * @par Snippet
+ * The code snippet below illustrates the min-reduction of a device vector of
+ * `int` data elements.
+ * @par
+ * @code
+ * #include <cub/cub.cuh>
+ * // or equivalently <cub/device/device_reduce.cuh>
+ *
+ * // Declare, allocate, and initialize device-accessible pointers
+ * // for input and output
+ * int num_items; // e.g., 7
+ * int *d_in;     // e.g., [8, 6, 7, 5, 3, 0, 9]
+ * int *d_out;    // e.g., [-]
+ * ...
+ *
+ * // Determine temporary device storage requirements
+ * void *d_temp_storage = NULL;
+ * size_t temp_storage_bytes = 0;
+ * cub::DeviceReduce::Min(
+ *   d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+ *
+ * // Allocate temporary storage
+ * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+ *
+ * // Run min-reduction
+ * cub::DeviceReduce::Min(
+ *   d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+ *
+ * // d_out <-- [0]
+ * @endcode
+ *
+ * @tparam InputIteratorT
+ * **[inferred]** Random-access input iterator type for reading input
+ * items \iterator
+ *
+ * @tparam OutputIteratorT
+ * **[inferred]** Output iterator type for recording the reduced
+ * aggregate \iterator
+ *
+ * @param[in] d_temp_storage
+ * Device-accessible allocation of temporary storage. When `nullptr`, the
+ * required allocation size is written to `temp_storage_bytes` and no work
+ * is done.
+ *
+ * @param[in,out] temp_storage_bytes
+ * Reference to size in bytes of `d_temp_storage` allocation
+ *
+ * @param[in] d_in
+ * Pointer to the input sequence of data items
+ *
+ * @param[out] d_out
+ * Pointer to the output aggregate
+ *
+ * @param[in] num_items
+ * Total number of input items (i.e., length of `d_in`)
+ *
+ * @param[in] stream
+ * **[optional]** CUDA stream to launch kernels within.
+ * Default is stream0.
+ *
+ * @param[in] debug_synchronous
+ * **[optional]** Whether or not to synchronize the stream after every
+ * kernel launch to check for errors. Also causes launch configurations to
+ * be printed to the console. Default is `false`.
+ *
+ */
+ template <typename InputIteratorT, typename OutputIteratorT>
+ CUB_RUNTIME_FUNCTION static cudaError_t Min(void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_in,
+ OutputIteratorT d_out,
+ int num_items,
+ cudaStream_t stream = 0,
+ bool debug_synchronous = false)
+ {
+ // Signed integer type for global offsets
+ using OffsetT = int;
+
+ // The input value type
+ using InputT = cub::detail::value_t<InputIteratorT>;
+
+ return DispatchReduce<InputIteratorT, OutputIteratorT, OffsetT, cub::Min>::
+ Dispatch(d_temp_storage,
+ temp_storage_bytes,
+ d_in,
+ d_out,
+ num_items,
+ cub::Min(),
+ Traits<InputT>::Max(), // replace with
+                        // std::numeric_limits<T>::max() when
+                        // C++11 support is more prevalent
+ stream,
+ debug_synchronous);
+ }
+
+ /**
+ * @brief Finds the first device-wide minimum using the less-than ('<')
+ * operator, also returning the index of that item.
+ *
+ * @par
+ * - The output value type of `d_out` is cub::KeyValuePair `<int, T>`
+ * (assuming the value type of `d_in` is `T`)
+ * - The minimum is written to `d_out.value` and its offset in the input
+ * array is written to `d_out.key`.
+ * - The `{1, std::numeric_limits<T>::max()}` tuple is produced for
+ * zero-length inputs
+ * - Does not support `<` operators that are non-commutative.
+ * - Provides "run-to-run" determinism for pseudo-associative reduction
+ * (e.g., addition of floating point types) on the same GPU device.
+ * However, results for pseudo-associative reduction may be inconsistent
+ * from one device to a another device of a different compute-capability
+ * because CUB can employ different tile-sizing for different architectures.
+ * - @devicestorage
+ *
+ * @par Snippet
+ * The code snippet below illustrates the argmin-reduction of a device vector
+ * of `int` data elements.
+ * @par
+ * @code
+ * #include <cub/cub.cuh>
+ * // or equivalently <cub/device/device_reduce.cuh>
+ *
+ * // Declare, allocate, and initialize device-accessible pointers
+ * // for input and output
+ * int num_items; // e.g., 7
+ * int *d_in;     // e.g., [8, 6, 7, 5, 3, 0, 9]
+ * KeyValuePair<int, int> *d_out; // e.g., [{-,-}]
+ * ...
+ *
+ * // Determine temporary device storage requirements
+ * void *d_temp_storage = NULL;
+ * size_t temp_storage_bytes = 0;
+ * cub::DeviceReduce::ArgMin(
+ *   d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+ *
+ * // Allocate temporary storage
+ * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+ *
+ * // Run argmin-reduction
+ * cub::DeviceReduce::ArgMin(
+ *   d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+ *
+ * // d_out <-- [{5, 0}]
+ *
+ * @endcode
+ *
+ * @tparam InputIteratorT
+ * **[inferred]** Random-access input iterator type for reading input items
+ * (of some type `T`) \iterator
+ *
+ * @tparam OutputIteratorT
+ * **[inferred]** Output iterator type for recording the reduced aggregate
+ * (having value type `cub::KeyValuePair<int, T>`) \iterator
+ *
+ * @param[in] d_temp_storage
+ * Device-accessible allocation of temporary storage. When `nullptr`, the
+ * required allocation size is written to `temp_storage_bytes` and no work
+ * is done.
+ *
+ * @param[in,out] temp_storage_bytes
+ * Reference to size in bytes of `d_temp_storage` allocation
+ *
+ * @param[in] d_in
+ * Pointer to the input sequence of data items
+ *
+ * @param[out] d_out
+ * Pointer to the output aggregate
+ *
+ * @param[in] num_items
+ * Total number of input items (i.e., length of `d_in`)
+ *
+ * @param[in] stream
+ * **[optional]** CUDA stream to launch kernels within.
+ * Default is stream0.
+ *
+ * @param[in] debug_synchronous
+ * **[optional]** Whether or not to synchronize the stream after every
+ * kernel launch to check for errors. Also causes launch configurations to
+ * be printed to the console. Default is `false`.
+ */
+ template <typename InputIteratorT, typename OutputIteratorT>
+ CUB_RUNTIME_FUNCTION static cudaError_t ArgMin(void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_in,
+ OutputIteratorT d_out,
+ int num_items,
+ cudaStream_t stream = 0,
+ bool debug_synchronous = false)
+ {
+ // Signed integer type for global offsets
+ using OffsetT = int;
+
+ // The input type
+ using InputValueT = cub::detail::value_t<InputIteratorT>;
+
+ // The output tuple type
+ using OutputTupleT =
+ cub::detail::non_void_value_t<OutputIteratorT, KeyValuePair<OffsetT, InputValueT>>;
+
+ // The output value type
+ using OutputValueT = typename OutputTupleT::Value;
+
+ // Wrapped input iterator to produce index-value tuples
+ using ArgIndexInputIteratorT =
+ ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT>;
+
+ ArgIndexInputIteratorT d_indexed_in(d_in);
+
+ // Initial value
+ OutputTupleT initial_value(1, Traits<InputValueT>::Max()); // replace with
+                                                            // std::numeric_limits<T>::max()
+                                                            // when C++11
+                                                            // support is
+                                                            // more prevalent
+
+ return DispatchReduce<ArgIndexInputIteratorT, OutputIteratorT, OffsetT, cub::ArgMin>::Dispatch(d_temp_storage,
+ temp_storage_bytes,
+ d_indexed_in,
+ d_out,
+ num_items,
+ cub::ArgMin(),
+ initial_value,
+ stream,
+ debug_synchronous);
+ }
+ /**
+ * @brief Computes a device-wide maximum using the greater-than ('>') operator.
+ *
+ * @par
+ * - Uses `std::numeric_limits<T>::lowest()` as the initial value of the
+ * reduction.
+ * - Does not support `>` operators that are non-commutative.
+ * - Provides "run-to-run" determinism for pseudo-associative reduction
+ * (e.g., addition of floating point types) on the same GPU device.
+ * However, results for pseudo-associative reduction may be inconsistent
+ * from one device to a another device of a different compute-capability
+ * because CUB can employ different tile-sizing for different architectures.
+ * - @devicestorage
+ *
+ * @par Snippet
+ * The code snippet below illustrates the max-reduction of a device vector of
+ * `int` data elements.
+ * @par
+ * @code
+ * #include <cub/cub.cuh>
+ * // or equivalently <cub/device/device_reduce.cuh>
+ *
+ * // Declare, allocate, and initialize device-accessible pointers
+ * // for input and output
+ * int num_items; // e.g., 7
+ * int *d_in;     // e.g., [8, 6, 7, 5, 3, 0, 9]
+ * int *d_out;    // e.g., [-]
+ * ...
+ *
+ * // Determine temporary device storage requirements
+ * void *d_temp_storage = NULL;
+ * size_t temp_storage_bytes = 0;
+ * cub::DeviceReduce::Max(
+ *   d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+ *
+ * // Allocate temporary storage
+ * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+ *
+ * // Run max-reduction
+ * cub::DeviceReduce::Max(
+ *   d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+ *
+ * // d_out <-- [9]
+ * @endcode
+ *
+ * @tparam InputIteratorT
+ * **[inferred]** Random-access input iterator type for reading input
+ * items \iterator
+ *
+ * @tparam OutputIteratorT
+ * **[inferred]** Output iterator type for recording the reduced
+ * aggregate \iterator
+ *
+ * @param[in] d_temp_storage
+ * Device-accessible allocation of temporary storage. When `nullptr`, the
+ * required allocation size is written to `temp_storage_bytes` and no work
+ * is done.
+ *
+ * @param[in,out] temp_storage_bytes
+ * Reference to size in bytes of `d_temp_storage` allocation
+ *
+ * @param[in] d_in
+ * Pointer to the input sequence of data items
+ *
+ * @param[out] d_out
+ * Pointer to the output aggregate
+ *
+ * @param[in] num_items
+ * Total number of input items (i.e., length of `d_in`)
+ *
+ * @param[in] stream
+ * **[optional]** CUDA stream to launch kernels within.
+ * Default is stream0.
+ *
+ * @param[in] debug_synchronous
+ * **[optional]** Whether or not to synchronize the stream after every
+ * kernel launch to check for errors. Also causes launch configurations to
+ * be printed to the console. Default is `false`.
+ *
+ */
+ template <typename InputIteratorT, typename OutputIteratorT>
+ CUB_RUNTIME_FUNCTION static cudaError_t Max(void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_in,
+ OutputIteratorT d_out,
+ int num_items,
+ cudaStream_t stream = 0,
+ bool debug_synchronous = false)
+ {
+ // Signed integer type for global offsets
+ using OffsetT = int;
+
+ // The input value type
+ using InputT = cub::detail::value_t<InputIteratorT>;
+
+ return DispatchReduce<InputIteratorT, OutputIteratorT, OffsetT, cub::Max>::
+ Dispatch(d_temp_storage,
+ temp_storage_bytes,
+ d_in,
+ d_out,
+ num_items,
+ cub::Max(),
+ Traits<InputT>::Lowest(), // replace with
+                           // std::numeric_limits<T>::lowest()
+                           // when C++11 support is more prevalent
+ stream,
+ debug_synchronous);
+ }
+
+ /**
+ * @brief Finds the first device-wide maximum using the greater-than ('>')
+ * operator, also returning the index of that item
+ *
+ * @par
+ * - The output value type of `d_out` is cub::KeyValuePair `<int, T>`
+ * (assuming the value type of `d_in` is `T`)
+ * - The maximum is written to `d_out.value` and its offset in the input
+ * array is written to `d_out.key`.
+ * - The `{1, std::numeric_limits<T>::lowest()}` tuple is produced for
+ * zero-length inputs
+ * - Does not support `>` operators that are non-commutative.
+ * - Provides "run-to-run" determinism for pseudo-associative reduction
+ * (e.g., addition of floating point types) on the same GPU device.
+ * However, results for pseudo-associative reduction may be inconsistent
+ * from one device to a another device of a different compute-capability
+ * because CUB can employ different tile-sizing for different architectures.
+ * - @devicestorage
+ *
+ * @par Snippet
+ * The code snippet below illustrates the argmax-reduction of a device vector
+ * of `int` data elements.
+ * @par
+ * @code
+ * #include <cub/cub.cuh>
+ * // or equivalently <cub/device/device_reduce.cuh>
+ *
+ * // Declare, allocate, and initialize device-accessible pointers
+ * // for input and output
+ * int num_items; // e.g., 7
+ * int *d_in;     // e.g., [8, 6, 7, 5, 3, 0, 9]
+ * KeyValuePair<int, int> *d_out; // e.g., [{-,-}]
+ * ...
+ *
+ * // Determine temporary device storage requirements
+ * void *d_temp_storage = NULL;
+ * size_t temp_storage_bytes = 0;
+ * cub::DeviceReduce::ArgMax(
+ *   d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+ *
+ * // Allocate temporary storage
+ * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+ *
+ * // Run argmax-reduction
+ * cub::DeviceReduce::ArgMax(
+ *   d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+ *
+ * // d_out <-- [{6, 9}]
+ *
+ * @endcode
+ *
+ * @tparam InputIteratorT
+ * **[inferred]** Random-access input iterator type for reading input items
+ * (of some type `T`) \iterator
+ *
+ * @tparam OutputIteratorT
+ * **[inferred]** Output iterator type for recording the reduced aggregate
+ * (having value type `cub::KeyValuePair<int, T>`) \iterator
+ *
+ * @param[in] d_temp_storage
+ * Device-accessible allocation of temporary storage. When `nullptr`, the
+ * required allocation size is written to `temp_storage_bytes` and no work
+ * is done.
+ *
+ * @param[in,out] temp_storage_bytes
+ * Reference to size in bytes of `d_temp_storage` allocation
+ *
+ * @param[in] d_in
+ * Pointer to the input sequence of data items
+ *
+ * @param[out] d_out
+ * Pointer to the output aggregate
+ *
+ * @param[in] num_items
+ * Total number of input items (i.e., length of `d_in`)
+ *
+ * @param[in] stream
+ * **[optional]** CUDA stream to launch kernels within.
+ * Default is stream0.
+ *
+ * @param[in] debug_synchronous
+ * **[optional]** Whether or not to synchronize the stream after every
+ * kernel launch to check for errors. Also causes launch configurations to
+ * be printed to the console. Default is `false`.
+ */
+ template <typename InputIteratorT, typename OutputIteratorT>
+ CUB_RUNTIME_FUNCTION static cudaError_t ArgMax(void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_in,
+ OutputIteratorT d_out,
+ int num_items,
+ cudaStream_t stream = 0,
+ bool debug_synchronous = false)
+ {
+ // Signed integer type for global offsets
+ using OffsetT = int;
+
+ // The input type
+ using InputValueT = cub::detail::value_t<InputIteratorT>;
+
+ // The output tuple type
+ using OutputTupleT =
+ cub::detail::non_void_value_t<OutputIteratorT, KeyValuePair<OffsetT, InputValueT>>;
+
+ // The output value type
+ using OutputValueT = typename OutputTupleT::Value;
+
+ // Wrapped input iterator to produce index-value tuples
+ using ArgIndexInputIteratorT =
+ ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT>;
+
+ ArgIndexInputIteratorT d_indexed_in(d_in);
+
+ // Initial value
+ OutputTupleT initial_value(1, Traits<InputValueT>::Lowest()); // replace
+                                                               // with
+                                                               // std::numeric_limits<T>::lowest()
+                                                               // when C++11
+                                                               // support is
+                                                               // more
+                                                               // prevalent
+
+ return DispatchReduce<ArgIndexInputIteratorT, OutputIteratorT, OffsetT, cub::ArgMax>::Dispatch(d_temp_storage,
+ temp_storage_bytes,
+ d_indexed_in,
+ d_out,
+ num_items,
+ cub::ArgMax(),
+ initial_value,
+ stream,
+ debug_synchronous);
+ }
+
+ /**
+ * @brief Reduces segments of values, where segments are demarcated by
+ * corresponding runs of identical keys.
+ *
+ * @par
+ * This operation computes segmented reductions within `d_values_in` using
+ * the specified binary `reduction_op` functor. The segments are identified
+ * by "runs" of corresponding keys in `d_keys_in`, where runs are maximal
+ * ranges of consecutive, identical keys. For the *i*th run
+ * encountered, the first key of the run and the corresponding value
+ * aggregate of that run are written to `d_unique_out[i]` and
+ * `d_aggregates_out[i]`, respectively. The total number of runs encountered
+ * is written to `d_num_runs_out`.
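+ *
+ * @par
+ * As an illustrative sketch only (not the library's implementation), the
+ * result is equivalent to the following sequential loop, where `op` stands
+ * in for the `reduction_op` functor:
+ * @code
+ * int num_runs = 0;
+ * for (int i = 0; i < num_items; ++i)
+ * {
+ *     if (i == 0 || !(d_keys_in[i] == d_keys_in[i - 1]))
+ *     {
+ *         // First item of a new run: record its key and seed the aggregate
+ *         d_unique_out[num_runs] = d_keys_in[i];
+ *         d_aggregates_out[num_runs] = d_values_in[i];
+ *         ++num_runs;
+ *     }
+ *     else
+ *     {
+ *         // Same run: fold the value into the current aggregate
+ *         d_aggregates_out[num_runs - 1] =
+ *             op(d_aggregates_out[num_runs - 1], d_values_in[i]);
+ *     }
+ * }
+ * *d_num_runs_out = num_runs;
+ * @endcode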
+ *
+ * @par
+ * - The `==` equality operator is used to determine whether keys are
+ * equivalent
+ * - Provides "run-to-run" determinism for pseudo-associative reduction
+ * (e.g., addition of floating point types) on the same GPU device.
+ * However, results for pseudo-associative reduction may be inconsistent
+ * from one device to a another device of a different compute-capability
+ * because CUB can employ different tile-sizing for different architectures.
+ * - @devicestorage
+ *
+ * @par Performance
+ * The following chart illustrates reduction-by-key (sum) performance across
+ * different CUDA architectures for `fp32` and `fp64` values, respectively.
+ * Segments are identified by `int32` keys, and have lengths uniformly
+ * sampled from `[1, 1000]`.
+ *
+ * @image html reduce_by_key_fp32_len_500.png
+ * @image html reduce_by_key_fp64_len_500.png
+ *
+ * @par
+ * The following charts are similar, but with segment lengths uniformly
+ * sampled from [1,10]:
+ *
+ * @image html reduce_by_key_fp32_len_5.png
+ * @image html reduce_by_key_fp64_len_5.png
+ *
+ * @par Snippet
+ * The code snippet below illustrates the segmented reduction of `int` values
+ * grouped by runs of associated `int` keys.
+ * @par
+ * @code
+ * #include <cub/cub.cuh>
+ * // or equivalently <cub/device/device_reduce.cuh>
+ *
+ * // CustomMin functor
+ * struct CustomMin
+ * {
+ *     template <typename T>
+ *     CUB_RUNTIME_FUNCTION __forceinline__
+ *     T operator()(const T &a, const T &b) const {
+ *         return (b < a) ? b : a;
+ *     }
+ * };
+ *
+ * // Declare, allocate, and initialize device-accessible pointers
+ * // for input and output
+ * int num_items;          // e.g., 8
+ * int *d_keys_in;         // e.g., [0, 2, 2, 9, 5, 5, 5, 8]
+ * int *d_values_in;       // e.g., [0, 7, 1, 6, 2, 5, 3, 4]
+ * int *d_unique_out;      // e.g., [-, -, -, -, -, -, -, -]
+ * int *d_aggregates_out;  // e.g., [-, -, -, -, -, -, -, -]
+ * int *d_num_runs_out;    // e.g., [-]
+ * CustomMin reduction_op;
+ * ...
+ *
+ * // Determine temporary device storage requirements
+ * void *d_temp_storage = NULL;
+ * size_t temp_storage_bytes = 0;
+ * cub::DeviceReduce::ReduceByKey(
+ *   d_temp_storage, temp_storage_bytes,
+ *   d_keys_in, d_unique_out, d_values_in,
+ *   d_aggregates_out, d_num_runs_out, reduction_op, num_items);
+ *
+ * // Allocate temporary storage
+ * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+ *
+ * // Run reduce-by-key
+ * cub::DeviceReduce::ReduceByKey(
+ *   d_temp_storage, temp_storage_bytes,
+ *   d_keys_in, d_unique_out, d_values_in,
+ *   d_aggregates_out, d_num_runs_out, reduction_op, num_items);
+ *
+ * // d_unique_out      <-- [0, 2, 9, 5, 8]
+ * // d_aggregates_out  <-- [0, 1, 6, 2, 4]
+ * // d_num_runs_out    <-- [5]
+ * @endcode
+ *
+ * @tparam KeysInputIteratorT
+ * **[inferred]** Random-access input iterator type for reading input
+ * keys \iterator
+ *
+ * @tparam UniqueOutputIteratorT
+ * **[inferred]** Random-access output iterator type for writing unique
+ * output keys \iterator
+ *
+ * @tparam ValuesInputIteratorT
+ * **[inferred]** Random-access input iterator type for reading input
+ * values \iterator
+ *
+ * @tparam AggregatesOutputIteratorT
+ * **[inferred]** Random-access output iterator type for writing output
+ * value aggregates \iterator
+ *
+ * @tparam NumRunsOutputIteratorT
+ * **[inferred]** Output iterator type for recording the number of runs
+ * encountered \iterator
+ *
+ * @tparam ReductionOpT
+ * **[inferred]** Binary reduction functor type having member
+ * `T operator()(const T &a, const T &b)`
+ *
+ * @param[in] d_temp_storage
+ * Device-accessible allocation of temporary storage. When `nullptr`, the
+ * required allocation size is written to `temp_storage_bytes` and no work
+ * is done.
+ *
+ * @param[in,out] temp_storage_bytes
+ *   Reference to size in bytes of `d_temp_storage` allocation
+ *
+ * @param[in] d_keys_in
+ *   Pointer to the input sequence of keys
+ *
+ * @param[out] d_unique_out
+ *   Pointer to the output sequence of unique keys (one key per run)
+ *
+ * @param[in] d_values_in
+ *   Pointer to the input sequence of corresponding values
+ *
+ * @param[out] d_aggregates_out
+ *   Pointer to the output sequence of value aggregates
+ *   (one aggregate per run)
+ *
+ * @param[out] d_num_runs_out
+ *   Pointer to total number of runs encountered
+ *   (i.e., the length of `d_unique_out`)
+ *
+ * @param[in] reduction_op
+ *   Binary reduction functor
+ *
+ * @param[in] num_items
+ *   Total number of associated key+value pairs
+ *   (i.e., the length of `d_keys_in` and `d_values_in`)
+ *
+ * @param[in] stream
+ *   **[optional]** CUDA stream to launch kernels within.
+ *   Default is stream0.
+ *
+ * @param[in] debug_synchronous
+ *   **[optional]** Whether or not to synchronize the stream after every
+ *   kernel launch to check for errors. May cause significant slowdown.
+ *   Default is `false`.
+ */
+ template <typename KeysInputIteratorT,
+           typename UniqueOutputIteratorT,
+           typename ValuesInputIteratorT,
+           typename AggregatesOutputIteratorT,
+           typename NumRunsOutputIteratorT,
+           typename ReductionOpT>
+ CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t
+ ReduceByKey(void *d_temp_storage,
+             size_t &temp_storage_bytes,
+             KeysInputIteratorT d_keys_in,
+             UniqueOutputIteratorT d_unique_out,
+             ValuesInputIteratorT d_values_in,
+             AggregatesOutputIteratorT d_aggregates_out,
+             NumRunsOutputIteratorT d_num_runs_out,
+             ReductionOpT reduction_op,
+             int num_items,
+             cudaStream_t stream = 0,
+             bool debug_synchronous = false)
+ {
+   // Signed integer type for global offsets
+   using OffsetT = int;
+
+   // FlagT iterator type (not used)
+
+   // Selection op (not used)
+
+   // Default == operator
+   typedef Equality EqualityOp;
+
+   return DispatchReduceByKey<KeysInputIteratorT,
+                              UniqueOutputIteratorT,
+                              ValuesInputIteratorT,
+                              AggregatesOutputIteratorT,
+                              NumRunsOutputIteratorT,
+                              EqualityOp,
+                              ReductionOpT,
+                              OffsetT>::Dispatch(d_temp_storage,
+                                                 temp_storage_bytes,
+                                                 d_keys_in,
+                                                 d_unique_out,
+                                                 d_values_in,
+                                                 d_aggregates_out,
+                                                 d_num_runs_out,
+                                                 EqualityOp(),
+                                                 reduction_op,
+                                                 num_items,
+                                                 stream,
+                                                 debug_synchronous);
+ }
 };
 
 /**
- * \example example_device_reduce.cu
+ * @example example_device_reduce.cu
  */
 
 CUB_NAMESPACE_END
-

From fd10729453ac4ddf4eee0a626e8f3dd6df04cadc Mon Sep 17 00:00:00 2001
From: Georgy Evtushenko
Date: Tue, 31 May 2022 16:00:29 +0400
Subject: [PATCH 2/4] Document in-place guarantees for reduce

---
 cub/device/device_reduce.cuh | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/cub/device/device_reduce.cuh b/cub/device/device_reduce.cuh
index 98335521da..9378604af1 100644
--- a/cub/device/device_reduce.cuh
+++ b/cub/device/device_reduce.cuh
@@ -93,6 +93,7 @@ struct DeviceReduce
  *   However, results for pseudo-associative reduction may be inconsistent
  *   from one device to another device of a different compute-capability
  *   because CUB can employ different tile-sizing for different architectures.
+ * - The range `[d_in, d_in + num_items)` shall not overlap `d_out`.
  * - @devicestorage
  *
  * @par Snippet
@@ -230,6 +231,7 @@ struct DeviceReduce
  *   However, results for pseudo-associative reduction may be inconsistent
  *   from one device to another device of a different compute-capability
  *   because CUB can employ different tile-sizing for different architectures.
+ * - The range `[d_in, d_in + num_items)` shall not overlap `d_out`.
  * - @devicestorage
  *
  * @par Performance
@@ -345,6 +347,7 @@ struct DeviceReduce
  *   However, results for pseudo-associative reduction may be inconsistent
  *   from one device to another device of a different compute-capability
  *   because CUB can employ different tile-sizing for different architectures.
+ * - The range `[d_in, d_in + num_items)` shall not overlap `d_out`.
  * - @devicestorage
  *
  * @par Snippet
@@ -459,6 +462,7 @@ struct DeviceReduce
  *   However, results for pseudo-associative reduction may be inconsistent
  *   from one device to another device of a different compute-capability
  *   because CUB can employ different tile-sizing for different architectures.
+ * - The range `[d_in, d_in + num_items)` shall not overlap `d_out`.
  * - @devicestorage
  *
  * @par Snippet
@@ -588,6 +592,7 @@ struct DeviceReduce
  *   However, results for pseudo-associative reduction may be inconsistent
  *   from one device to another device of a different compute-capability
  *   because CUB can employ different tile-sizing for different architectures.
+ * - The range `[d_in, d_in + num_items)` shall not overlap `d_out`.
  * - @devicestorage
  *
  * @par Snippet
@@ -702,6 +707,7 @@ struct DeviceReduce
  *   However, results for pseudo-associative reduction may be inconsistent
  *   from one device to another device of a different compute-capability
  *   because CUB can employ different tile-sizing for different architectures.
+ * - The range `[d_in, d_in + num_items)` shall not overlap `d_out`.
  * - @devicestorage
  *
  * @par Snippet
@@ -844,6 +850,12 @@ struct DeviceReduce
  *   However, results for pseudo-associative reduction may be inconsistent
  *   from one device to another device of a different compute-capability
  *   because CUB can employ different tile-sizing for different architectures.
+ * - Let `out` be any of the ranges
+ *   `[d_unique_out, d_unique_out + *d_num_runs_out)`,
+ *   `[d_aggregates_out, d_aggregates_out + *d_num_runs_out)`, or
+ *   `d_num_runs_out`. The ranges represented by `out` shall not overlap
+ *   `[d_keys_in, d_keys_in + num_items)`,
+ *   `[d_values_in, d_values_in + num_items)`, nor each other in any way.
  * - @devicestorage
  *
  * @par Performance

From ecf6e7632b93535040ff9da989e1a84a30e43c06 Mon Sep 17 00:00:00 2001
From: Georgy Evtushenko
Date: Tue, 31 May 2022 17:41:56 +0400
Subject: [PATCH 3/4] Reformat segmented reduce docs

---
 cub/device/device_segmented_reduce.cuh | 1559 +++++++++++++++---------
 1 file changed, 970 insertions(+), 589 deletions(-)

diff --git a/cub/device/device_segmented_reduce.cuh b/cub/device/device_segmented_reduce.cuh
index 78a4c4f10c..fd830efea6 100644
--- a/cub/device/device_segmented_reduce.cuh
+++ b/cub/device/device_segmented_reduce.cuh
@@ -1,7 +1,6 @@
-
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill. All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -14,10 +13,10 @@
  *     names of its contributors may be used to endorse or promote products
  *     derived from this software without specific prior written permission.
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
@@ -28,606 +27,988 @@
 ******************************************************************************/
 
 /**
- * \file
- * cub::DeviceSegmentedReduce provides device-wide, parallel operations for computing a batched reduction across multiple sequences of data items residing within device-accessible memory.
+ * @file cub::DeviceSegmentedReduce provides device-wide, parallel operations
+ *       for computing a batched reduction across multiple sequences of data
+ *       items residing within device-accessible memory.
  */
 
 #pragma once
 
-#include <stdio.h>
 #include <iterator>
 
-#include "../iterator/arg_index_input_iterator.cuh"
-#include "dispatch/dispatch_reduce.cuh"
-#include "dispatch/dispatch_reduce_by_key.cuh"
-#include "../config.cuh"
-#include "../util_type.cuh"
+#include <cub/config.cuh>
+#include <cub/device/dispatch/dispatch_reduce.cuh>
+#include <cub/device/dispatch/dispatch_reduce_by_key.cuh>
+#include <cub/iterator/arg_index_input_iterator.cuh>
+#include <cub/util_type.cuh>
 
 CUB_NAMESPACE_BEGIN
 
 /**
- * \brief DeviceSegmentedReduce provides device-wide, parallel operations for computing a reduction across multiple sequences of data items residing within device-accessible memory. ![](reduce_logo.png)
- * \ingroup SegmentedModule
+ * @brief DeviceSegmentedReduce provides device-wide, parallel operations for
+ *        computing a reduction across multiple sequences of data items
+ *        residing within device-accessible memory. ![](reduce_logo.png)
+ * @ingroup SegmentedModule
  *
- * \par Overview
- * A reduction (or fold)
- * uses a binary combining operator to compute a single aggregate from a sequence of input elements.
+ * @par Overview
+ * A *reduction*
+ * (or *fold*) uses a binary combining operator to compute a single aggregate
+ * from a sequence of input elements.
  *
- * \par Usage Considerations
- * \cdp_class{DeviceSegmentedReduce}
+ * @par Usage Considerations
+ * @cdp_class{DeviceSegmentedReduce}
  *
  */
 struct DeviceSegmentedReduce
 {
- /**
- * \brief Computes a device-wide segmented reduction using the specified binary \p reduction_op functor.
- *
- * \par
- * - Does not support binary reduction operators that are non-commutative.
- * - Provides "run-to-run" determinism for pseudo-associative reduction
- *   (e.g., addition of floating point types) on the same GPU device.
- *   However, results for pseudo-associative reduction may be inconsistent
- *   from one device to a another device of a different compute-capability
- *   because CUB can employ different tile-sizing for different architectures.
- * - When input a contiguous sequence of segments, a single sequence
- *   \p segment_offsets (of length num_segments+1) can be aliased
- *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
- *   the latter is specified as segment_offsets+1).
- * - \devicestorage
- *
- * \par Snippet
- * The code snippet below illustrates a custom min-reduction of a device vector of \p int data elements.
- * \par
- * \code
- * #include <cub/cub.cuh>   // or equivalently <cub/device/device_segmented_reduce.cuh>
- *
- * // CustomMin functor
- * struct CustomMin
- * {
- *     template <typename T>
- *     CUB_RUNTIME_FUNCTION __forceinline__
- *     T operator()(const T &a, const T &b) const {
- *         return (b < a) ?
b : a; - * } - * }; - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_out; // e.g., [-, -, -] - * CustomMin min_op; - * int initial_value; // e.g., INT_MAX - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, - * num_segments, d_offsets, d_offsets + 1, min_op, initial_value); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run reduction - * cub::DeviceSegmentedReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, - * num_segments, d_offsets, d_offsets + 1, min_op, initial_value); - * - * // d_out <-- [6, INT_MAX, 0] - * - * \endcode - * - * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator - * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate \iterator - * \tparam BeginOffsetIteratorT [inferred] Random-access input iterator type for reading segment beginning offsets \iterator - * \tparam EndOffsetIteratorT [inferred] Random-access input iterator type for reading segment ending offsets \iterator - * \tparam ReductionOp [inferred] Binary reduction functor type having member T operator()(const T &a, const T &b) - * \tparam T [inferred] Data element type that is convertible to the \p value type of \p InputIteratorT - */ - template < - typename InputIteratorT, - typename OutputIteratorT, - typename BeginOffsetIteratorT, - typename EndOffsetIteratorT, - typename ReductionOp, - typename T> - CUB_RUNTIME_FUNCTION - static cudaError_t Reduce( - void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output aggregate - int num_segments, ///< [in] The number of segments that comprise the sorting data - BeginOffsetIteratorT d_begin_offsets, ///< [in] Random-access input iterator to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - EndOffsetIteratorT d_end_offsets, ///< [in] Random-access input iterator to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. - ReductionOp reduction_op, ///< [in] Binary reduction functor - T initial_value, ///< [in] Initial value of the reduction for each segment - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
- { - // Signed integer type for global offsets - typedef int OffsetT; - - return DispatchSegmentedReduce::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - num_segments, - d_begin_offsets, - d_end_offsets, - reduction_op, - initial_value, - stream, - debug_synchronous); - } - - - /** - * \brief Computes a device-wide segmented sum using the addition ('+') operator. - * - * \par - * - Uses \p 0 as the initial value of the reduction for each segment. - * - When input a contiguous sequence of segments, a single sequence - * \p segment_offsets (of length num_segments+1) can be aliased - * for both the \p d_begin_offsets and \p d_end_offsets parameters (where - * the latter is specified as segment_offsets+1). - * - Does not support \p + operators that are non-commutative.. - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the sum reduction of a device vector of \p int data elements. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_out; // e.g., [-, -, -] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, - * num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sum-reduction - * cub::DeviceSegmentedReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, - * num_segments, d_offsets, d_offsets + 1); - * - * // d_out <-- [21, 0, 17] - * - * \endcode - * - * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator - * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate \iterator - * \tparam BeginOffsetIteratorT [inferred] Random-access input iterator type for reading segment beginning offsets \iterator - * \tparam EndOffsetIteratorT [inferred] Random-access input iterator type for reading segment ending offsets \iterator - */ - template < - typename InputIteratorT, - typename OutputIteratorT, - typename BeginOffsetIteratorT, - typename EndOffsetIteratorT> - CUB_RUNTIME_FUNCTION - static cudaError_t Sum( - void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output aggregate - int num_segments, ///< [in] The number of segments that comprise the sorting data - BeginOffsetIteratorT d_begin_offsets, ///< [in] Random-access input iterator to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - EndOffsetIteratorT d_end_offsets, ///< [in] Random-access input iterator to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. 
- cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. - { - // Signed integer type for global offsets - typedef int OffsetT; - - // The output value type - using OutputT = - cub::detail::non_void_value_t>; - - return DispatchSegmentedReduce::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - num_segments, - d_begin_offsets, - d_end_offsets, - cub::Sum(), - OutputT(), // zero-initialize - stream, - debug_synchronous); - } - - - /** - * \brief Computes a device-wide segmented minimum using the less-than ('<') operator. - * - * \par - * - Uses std::numeric_limits::max() as the initial value of the reduction for each segment. - * - When input a contiguous sequence of segments, a single sequence - * \p segment_offsets (of length num_segments+1) can be aliased - * for both the \p d_begin_offsets and \p d_end_offsets parameters (where - * the latter is specified as segment_offsets+1). - * - Does not support \p < operators that are non-commutative. - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the min-reduction of a device vector of \p int data elements. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_out; // e.g., [-, -, -] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out, - * num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run min-reduction - * cub::DeviceSegmentedReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out, - * num_segments, d_offsets, d_offsets + 1); - * - * // d_out <-- [6, INT_MAX, 0] - * - * \endcode - * - * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator - * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate \iterator - * \tparam BeginOffsetIteratorT [inferred] Random-access input iterator type for reading segment beginning offsets \iterator - * \tparam EndOffsetIteratorT [inferred] Random-access input iterator type for reading segment ending offsets \iterator - */ - template < - typename InputIteratorT, - typename OutputIteratorT, - typename BeginOffsetIteratorT, - typename EndOffsetIteratorT> - CUB_RUNTIME_FUNCTION - static cudaError_t Min( - void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
- size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output aggregate - int num_segments, ///< [in] The number of segments that comprise the sorting data - BeginOffsetIteratorT d_begin_offsets, ///< [in] Random-access input iterator to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - EndOffsetIteratorT d_end_offsets, ///< [in] Random-access input iterator to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. - { - // Signed integer type for global offsets - using OffsetT = int; - - // The input value type - using InputT = cub::detail::value_t; - - return DispatchSegmentedReduce::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - num_segments, - d_begin_offsets, - d_end_offsets, - cub::Min(), - Traits::Max(), // replace with std::numeric_limits::max() when C++11 support is more prevalent - stream, - debug_synchronous); - } - - - /** - * \brief Finds the first device-wide minimum in each segment using the less-than ('<') operator, also returning the in-segment index of that item. - * - * \par - * - The output value type of \p d_out is cub::KeyValuePair (assuming the value type of \p d_in is \p T) - * - The minimum of the ith segment is written to d_out[i].value and its offset in that segment is written to d_out[i].key. - * - The {1, std::numeric_limits::max()} tuple is produced for zero-length inputs - * - When input a contiguous sequence of segments, a single sequence - * \p segment_offsets (of length num_segments+1) can be aliased - * for both the \p d_begin_offsets and \p d_end_offsets parameters (where - * the latter is specified as segment_offsets+1). - * - Does not support \p < operators that are non-commutative. - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the argmin-reduction of a device vector of \p int data elements. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * KeyValuePair *d_out; // e.g., [{-,-}, {-,-}, {-,-}] - * ... 
- * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_out, - * num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run argmin-reduction - * cub::DeviceSegmentedReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_out, - * num_segments, d_offsets, d_offsets + 1); - * - * // d_out <-- [{1,6}, {1,INT_MAX}, {2,0}] - * - * \endcode - * - * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items (of some type \p T) \iterator - * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate (having value type KeyValuePair) \iterator - * \tparam BeginOffsetIteratorT [inferred] Random-access input iterator type for reading segment beginning offsets \iterator - * \tparam EndOffsetIteratorT [inferred] Random-access input iterator type for reading segment ending offsets \iterator - */ - template < - typename InputIteratorT, - typename OutputIteratorT, - typename BeginOffsetIteratorT, - typename EndOffsetIteratorT> - CUB_RUNTIME_FUNCTION - static cudaError_t ArgMin( - void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output aggregate - int num_segments, ///< [in] The number of segments that comprise the sorting data - BeginOffsetIteratorT d_begin_offsets, ///< [in] Random-access input iterator to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - EndOffsetIteratorT d_end_offsets, ///< [in] Random-access input iterator to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
- { - // Signed integer type for global offsets - using OffsetT = int; - - // The input type - using InputValueT = cub::detail::value_t; - - // The output tuple type - using OutputTupleT = - cub::detail::non_void_value_t>; - - // The output value type - using OutputValueT = typename OutputTupleT::Value; - - // Wrapped input iterator to produce index-value tuples - using ArgIndexInputIteratorT = - ArgIndexInputIterator; - - ArgIndexInputIteratorT d_indexed_in(d_in); - - // Initial value - OutputTupleT initial_value(1, Traits::Max()); // replace with std::numeric_limits::max() when C++11 support is more prevalent - - return DispatchSegmentedReduce::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_indexed_in, - d_out, - num_segments, - d_begin_offsets, - d_end_offsets, - cub::ArgMin(), - initial_value, - stream, - debug_synchronous); - } - - - /** - * \brief Computes a device-wide segmented maximum using the greater-than ('>') operator. - * - * \par - * - Uses std::numeric_limits::lowest() as the initial value of the reduction. - * - When input a contiguous sequence of segments, a single sequence - * \p segment_offsets (of length num_segments+1) can be aliased - * for both the \p d_begin_offsets and \p d_end_offsets parameters (where - * the latter is specified as segment_offsets+1). - * - Does not support \p > operators that are non-commutative. - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the max-reduction of a device vector of \p int data elements. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_out; // e.g., [-, -, -] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_out, - * num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run max-reduction - * cub::DeviceSegmentedReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_out, - * num_segments, d_offsets, d_offsets + 1); - * - * // d_out <-- [8, INT_MIN, 9] - * - * \endcode - * - * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator - * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate \iterator - * \tparam BeginOffsetIteratorT [inferred] Random-access input iterator type for reading segment beginning offsets \iterator - * \tparam EndOffsetIteratorT [inferred] Random-access input iterator type for reading segment ending offsets \iterator - */ - template < - typename InputIteratorT, - typename OutputIteratorT, - typename BeginOffsetIteratorT, - typename EndOffsetIteratorT> - CUB_RUNTIME_FUNCTION - static cudaError_t Max( - void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
- size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output aggregate - int num_segments, ///< [in] The number of segments that comprise the sorting data - BeginOffsetIteratorT d_begin_offsets, ///< [in] Random-access input iterator to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - EndOffsetIteratorT d_end_offsets, ///< [in] Random-access input iterator to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. - { - // Signed integer type for global offsets - using OffsetT = int; - - // The input value type - using InputT = cub::detail::value_t; - - return DispatchSegmentedReduce::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - num_segments, - d_begin_offsets, - d_end_offsets, - cub::Max(), - Traits::Lowest(), // replace with std::numeric_limits::lowest() when C++11 support is more prevalent - stream, - debug_synchronous); - } - - - /** - * \brief Finds the first device-wide maximum in each segment using the greater-than ('>') operator, also returning the in-segment index of that item - * - * \par - * - The output value type of \p d_out is cub::KeyValuePair (assuming the value type of \p d_in is \p T) - * - The maximum of the ith segment is written to d_out[i].value and its offset in that segment is written to d_out[i].key. - * - The {1, std::numeric_limits::lowest()} tuple is produced for zero-length inputs - * - When input a contiguous sequence of segments, a single sequence - * \p segment_offsets (of length num_segments+1) can be aliased - * for both the \p d_begin_offsets and \p d_end_offsets parameters (where - * the latter is specified as segment_offsets+1). - * - Does not support \p > operators that are non-commutative. - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the argmax-reduction of a device vector of \p int data elements. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * KeyValuePair *d_out; // e.g., [{-,-}, {-,-}, {-,-}] - * ... 
- * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_out, - * num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run argmax-reduction - * cub::DeviceSegmentedReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_out, - * num_segments, d_offsets, d_offsets + 1); - * - * // d_out <-- [{0,8}, {1,INT_MIN}, {3,9}] - * - * \endcode - * - * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items (of some type \p T) \iterator - * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate (having value type KeyValuePair) \iterator - * \tparam BeginOffsetIteratorT [inferred] Random-access input iterator type for reading segment beginning offsets \iterator - * \tparam EndOffsetIteratorT [inferred] Random-access input iterator type for reading segment ending offsets \iterator - */ - template < - typename InputIteratorT, - typename OutputIteratorT, - typename BeginOffsetIteratorT, - typename EndOffsetIteratorT> - CUB_RUNTIME_FUNCTION - static cudaError_t ArgMax( - void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output aggregate - int num_segments, ///< [in] The number of segments that comprise the sorting data - BeginOffsetIteratorT d_begin_offsets, ///< [in] Random-access input iterator to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - EndOffsetIteratorT d_end_offsets, ///< [in] Random-access input iterator to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
- {
-   // Signed integer type for global offsets
-   using OffsetT = int;
-
-   // The input type
-   using InputValueT = cub::detail::value_t<InputIteratorT>;
-
-   // The output tuple type
-   using OutputTupleT =
-     cub::detail::non_void_value_t<OutputIteratorT,
-                                   KeyValuePair<OffsetT, InputValueT>>;
-
-   // The output value type
-   using OutputValueT = typename OutputTupleT::Value;
-
-   // Wrapped input iterator to produce index-value tuples
-   using ArgIndexInputIteratorT =
-     ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT>;
-
-   ArgIndexInputIteratorT d_indexed_in(d_in);
-
-   // Initial value
-   OutputTupleT initial_value(1, Traits<InputValueT>::Lowest()); // replace with std::numeric_limits<T>::lowest() when C++11 support is more prevalent
-
-   return DispatchSegmentedReduce<ArgIndexInputIteratorT, OutputIteratorT, BeginOffsetIteratorT, EndOffsetIteratorT, OffsetT, cub::ArgMax>::Dispatch(
-     d_temp_storage,
-     temp_storage_bytes,
-     d_indexed_in,
-     d_out,
-     num_segments,
-     d_begin_offsets,
-     d_end_offsets,
-     cub::ArgMax(),
-     initial_value,
-     stream,
-     debug_synchronous);
- }
-
+ /**
+ * @brief Computes a device-wide segmented reduction using the specified
+ *        binary `reduction_op` functor.
+ *
+ * @par
+ * - Does not support binary reduction operators that are non-commutative.
+ * - Provides "run-to-run" determinism for pseudo-associative reduction
+ *   (e.g., addition of floating point types) on the same GPU device.
+ *   However, results for pseudo-associative reduction may be inconsistent
+ *   from one device to another device of a different compute-capability
+ *   because CUB can employ different tile-sizing for different architectures.
+ * - When the input is a contiguous sequence of segments, a single sequence
+ *   `segment_offsets` (of length `num_segments + 1`) can be aliased
+ *   for both the `d_begin_offsets` and `d_end_offsets` parameters (where
+ *   the latter is specified as `segment_offsets + 1`).
+ * - @devicestorage
+ *
+ * @par Snippet
+ * The code snippet below illustrates a custom min-reduction of a device
+ * vector of `int` data elements.
+ * @par
+ * @code
+ * #include <cub/cub.cuh>
+ * // or equivalently <cub/device/device_segmented_reduce.cuh>
+ *
+ * // CustomMin functor
+ * struct CustomMin
+ * {
+ *     template <typename T>
+ *     CUB_RUNTIME_FUNCTION __forceinline__
+ *     T operator()(const T &a, const T &b) const {
+ *         return (b < a) ? b : a;
+ *     }
+ * };
+ *
+ * // Declare, allocate, and initialize device-accessible pointers
+ * // for input and output
+ * int num_segments;   // e.g., 3
+ * int *d_offsets;     // e.g., [0, 3, 3, 7]
+ * int *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
+ * int *d_out;         // e.g., [-, -, -]
+ * CustomMin min_op;
+ * int initial_value;  // e.g., INT_MAX
+ * ...
+ *
+ * // Determine temporary device storage requirements
+ * void *d_temp_storage = NULL;
+ * size_t temp_storage_bytes = 0;
+ * cub::DeviceSegmentedReduce::Reduce(
+ *   d_temp_storage, temp_storage_bytes, d_in, d_out,
+ *   num_segments, d_offsets, d_offsets + 1, min_op, initial_value);
+ *
+ * // Allocate temporary storage
+ * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+ *
+ * // Run reduction
+ * cub::DeviceSegmentedReduce::Reduce(
+ *   d_temp_storage, temp_storage_bytes, d_in, d_out,
+ *   num_segments, d_offsets, d_offsets + 1, min_op, initial_value);
+ *
+ * // d_out <-- [6, INT_MAX, 0]
+ * @endcode
+ *
+ * @tparam InputIteratorT
+ *   **[inferred]** Random-access input iterator type for reading input
+ *   items \iterator
+ *
+ * @tparam OutputIteratorT
+ *   **[inferred]** Output iterator type for recording the reduced
+ *   aggregate \iterator
+ *
+ * @tparam BeginOffsetIteratorT
+ *   **[inferred]** Random-access input iterator type for reading segment
+ *   beginning offsets \iterator
+ *
+ * @tparam EndOffsetIteratorT
+ *   **[inferred]** Random-access input iterator type for reading segment
+ *   ending offsets \iterator
+ *
+ * @tparam ReductionOp
+ *   **[inferred]** Binary reduction functor type having member
+ *   `T operator()(const T &a, const T &b)`
+ *
+ * @tparam T
+ *   **[inferred]** Data element type that is convertible to the `value` type
+ *   of `InputIteratorT`
+ *
+ * @param[in] d_temp_storage
+ *   Device-accessible allocation of temporary storage. When `nullptr`, the
+ *   required allocation size is written to `temp_storage_bytes` and no
+ *   work is done.
+ *
+ * @param[in,out] temp_storage_bytes
+ *   Reference to size in bytes of `d_temp_storage` allocation
+ *
+ * @param[in] d_in
+ *   Pointer to the input sequence of data items
+ *
+ * @param[out] d_out
+ *   Pointer to the output aggregate
+ *
+ * @param[in] num_segments
+ *   The number of segments that comprise the segmented reduction data
+ *
+ * @param[in] d_begin_offsets
+ *   Random-access input iterator to the sequence of beginning offsets of
+ *   length `num_segments`, such that `d_begin_offsets[i]` is the first
+ *   element of the *i*th data segment in `d_in`
+ *
+ * @param[in] d_end_offsets
+ *   Random-access input iterator to the sequence of ending offsets of length
+ *   `num_segments`, such that `d_end_offsets[i] - 1` is the last element of
+ *   the *i*th data segment in `d_in`. If
+ *   `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th segment is
+ *   considered empty.
+ *
+ * @param[in] reduction_op
+ *   Binary reduction functor
+ *
+ * @param[in] initial_value
+ *   Initial value of the reduction for each segment
+ *
+ * @param[in] stream
+ *   **[optional]** CUDA stream to launch kernels within.
+ *   Default is stream0.
+ *
+ * @param[in] debug_synchronous
+ *   **[optional]** Whether or not to synchronize the stream after every
+ *   kernel launch to check for errors. Also causes launch configurations to
+ *   be printed to the console. Default is `false`.
+ */
+ template <typename InputIteratorT,
+           typename OutputIteratorT,
+           typename BeginOffsetIteratorT,
+           typename EndOffsetIteratorT,
+           typename ReductionOp,
+           typename T>
+ CUB_RUNTIME_FUNCTION static cudaError_t
+ Reduce(void *d_temp_storage,
+        size_t &temp_storage_bytes,
+        InputIteratorT d_in,
+        OutputIteratorT d_out,
+        int num_segments,
+        BeginOffsetIteratorT d_begin_offsets,
+        EndOffsetIteratorT d_end_offsets,
+        ReductionOp reduction_op,
+        T initial_value,
+        cudaStream_t stream = 0,
+        bool debug_synchronous = false)
+ {
+   // Signed integer type for global offsets
+   using OffsetT = int;
+
+   return DispatchSegmentedReduce<InputIteratorT,
+                                  OutputIteratorT,
+                                  BeginOffsetIteratorT,
+                                  EndOffsetIteratorT,
+                                  OffsetT,
+                                  ReductionOp>::Dispatch(d_temp_storage,
+                                                         temp_storage_bytes,
+                                                         d_in,
+                                                         d_out,
+                                                         num_segments,
+                                                         d_begin_offsets,
+                                                         d_end_offsets,
+                                                         reduction_op,
+                                                         initial_value,
+                                                         stream,
+                                                         debug_synchronous);
+ }
+
+ /**
+ * @brief Computes a device-wide segmented sum using the addition (`+`)
+ *        operator.
+ *
+ * @par
+ * - Uses `0` as the initial value of the reduction for each segment.
+ * - When the input is a contiguous sequence of segments, a single sequence
+ *   `segment_offsets` (of length `num_segments + 1`) can be aliased
+ *   for both the `d_begin_offsets` and `d_end_offsets` parameters (where
+ *   the latter is specified as `segment_offsets + 1`).
+ * - Does not support `+` operators that are non-commutative.
+ * - @devicestorage
+ *
+ * @par Snippet
+ * The code snippet below illustrates the sum reduction of a device vector of
+ * `int` data elements.
+ * @par
+ * @code
+ * #include <cub/cub.cuh>
+ * // or equivalently <cub/device/device_segmented_reduce.cuh>
+ *
+ * // Declare, allocate, and initialize device-accessible pointers
+ * // for input and output
+ * int num_segments;   // e.g., 3
+ * int *d_offsets;     // e.g., [0, 3, 3, 7]
+ * int *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
+ * int *d_out;         // e.g., [-, -, -]
+ * ...
+ *
+ * // Determine temporary device storage requirements
+ * void *d_temp_storage = NULL;
+ * size_t temp_storage_bytes = 0;
+ * cub::DeviceSegmentedReduce::Sum(
+ *   d_temp_storage, temp_storage_bytes, d_in, d_out,
+ *   num_segments, d_offsets, d_offsets + 1);
+ *
+ * // Allocate temporary storage
+ * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+ *
+ * // Run sum-reduction
+ * cub::DeviceSegmentedReduce::Sum(
+ *   d_temp_storage, temp_storage_bytes, d_in, d_out,
+ *   num_segments, d_offsets, d_offsets + 1);
+ *
+ * // d_out <-- [21, 0, 17]
+ * @endcode
+ *
+ * @tparam InputIteratorT
+ *   **[inferred]** Random-access input iterator type for reading input
+ *   items \iterator
+ *
+ * @tparam OutputIteratorT
+ *   **[inferred]** Output iterator type for recording the reduced aggregate
+ *   \iterator
+ *
+ * @tparam BeginOffsetIteratorT
+ *   **[inferred]** Random-access input iterator type for reading segment
+ *   beginning offsets \iterator
+ *
+ * @tparam EndOffsetIteratorT
+ *   **[inferred]** Random-access input iterator type for reading segment
+ *   ending offsets \iterator
+ *
+ * @param[in] d_temp_storage
+ *   Device-accessible allocation of temporary storage. When `nullptr`, the
+ *   required allocation size is written to `temp_storage_bytes` and no work
+ *   is done.
+ *
+ * @param[in,out] temp_storage_bytes
+ *   Reference to size in bytes of `d_temp_storage` allocation
+ *
+ * @param[in] d_in
+ *   Pointer to the input sequence of data items
+ *
+ * @param[out] d_out
+ *   Pointer to the output aggregate
+ *
+ * @param[in] num_segments
+ *   The number of segments that comprise the segmented reduction data
+ *
+ * @param[in] d_begin_offsets
+ *   Random-access input iterator to the sequence of beginning offsets of
+ *   length `num_segments`, such that `d_begin_offsets[i]` is the first
+ *   element of the *i*th data segment in `d_in`
+ *
+ * @param[in] d_end_offsets
+ *   Random-access input iterator to the sequence of ending offsets of length
+ *   `num_segments`, such that `d_end_offsets[i] - 1` is the last element of
+ *   the *i*th data segment in `d_in`. If
+ *   `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th segment is
+ *   considered empty.
+ *
+ * @param[in] stream
+ *   **[optional]** CUDA stream to launch kernels within.
+ *   Default is stream0.
+ *
+ * @param[in] debug_synchronous
+ *   **[optional]** Whether or not to synchronize the stream after every
+ *   kernel launch to check for errors. Also causes launch configurations to
+ *   be printed to the console. Default is `false`.
+ */
+ template <typename InputIteratorT,
+           typename OutputIteratorT,
+           typename BeginOffsetIteratorT,
+           typename EndOffsetIteratorT>
+ CUB_RUNTIME_FUNCTION static cudaError_t
+ Sum(void *d_temp_storage,
+     size_t &temp_storage_bytes,
+     InputIteratorT d_in,
+     OutputIteratorT d_out,
+     int num_segments,
+     BeginOffsetIteratorT d_begin_offsets,
+     EndOffsetIteratorT d_end_offsets,
+     cudaStream_t stream = 0,
+     bool debug_synchronous = false)
+ {
+   // Signed integer type for global offsets
+   using OffsetT = int;
+
+   // The output value type
+   using OutputT =
+     cub::detail::non_void_value_t<OutputIteratorT,
+                                   cub::detail::value_t<InputIteratorT>>;
+
+   return DispatchSegmentedReduce<
+     InputIteratorT,
+     OutputIteratorT,
+     BeginOffsetIteratorT,
+     EndOffsetIteratorT,
+     OffsetT,
+     cub::Sum>::Dispatch(d_temp_storage,
+                         temp_storage_bytes,
+                         d_in,
+                         d_out,
+                         num_segments,
+                         d_begin_offsets,
+                         d_end_offsets,
+                         cub::Sum(),
+                         OutputT(), // zero-initialize
+                         stream,
+                         debug_synchronous);
+ }
+
+ /**
+ * @brief Computes a device-wide segmented minimum using the less-than
+ *        (`<`) operator.
+ *
+ * @par
+ * - Uses `std::numeric_limits<T>::max()` as the initial value of the
+ *   reduction for each segment.
+ * - When the input is a contiguous sequence of segments, a single sequence
+ *   `segment_offsets` (of length `num_segments + 1`) can be aliased for both
+ *   the `d_begin_offsets` and `d_end_offsets` parameters (where the latter is
+ *   specified as `segment_offsets + 1`).
+ * - Does not support `<` operators that are non-commutative.
+ * - @devicestorage
+ *
+ * @par Snippet
+ * The code snippet below illustrates the min-reduction of a device vector of
+ * `int` data elements.
+ * @par
+ * @code
+ * #include <cub/cub.cuh>
+ * // or equivalently <cub/device/device_segmented_reduce.cuh>
+ *
+ * // Declare, allocate, and initialize device-accessible pointers
+ * // for input and output
+ * int num_segments;   // e.g., 3
+ * int *d_offsets;     // e.g., [0, 3, 3, 7]
+ * int *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
+ * int *d_out;         // e.g., [-, -, -]
+ * ...
+ *
+ * // Determine temporary device storage requirements
+ * void *d_temp_storage = NULL;
+ * size_t temp_storage_bytes = 0;
+ * cub::DeviceSegmentedReduce::Min(
+ *   d_temp_storage, temp_storage_bytes, d_in, d_out,
+ *   num_segments, d_offsets, d_offsets + 1);
+ *
+ * // Allocate temporary storage
+ * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+ *
+ * // Run min-reduction
+ * cub::DeviceSegmentedReduce::Min(
+ *   d_temp_storage, temp_storage_bytes, d_in, d_out,
+ *   num_segments, d_offsets, d_offsets + 1);
+ *
+ * // d_out <-- [6, INT_MAX, 0]
+ * @endcode
+ *
+ * @tparam InputIteratorT
+ *   **[inferred]** Random-access input iterator type for reading input
+ *   items \iterator
+ *
+ * @tparam OutputIteratorT
+ *   **[inferred]** Output iterator type for recording the reduced
+ *   aggregate \iterator
+ *
+ * @tparam BeginOffsetIteratorT
+ *   **[inferred]** Random-access input iterator type for reading segment
+ *   beginning offsets \iterator
+ *
+ * @tparam EndOffsetIteratorT
+ *   **[inferred]** Random-access input iterator type for reading segment
+ *   ending offsets \iterator
+ *
+ * @param[in] d_temp_storage
+ *   Device-accessible allocation of temporary storage. When `nullptr`, the
+ *   required allocation size is written to `temp_storage_bytes` and no work
+ *   is done.
+ *
+ * @param[in,out] temp_storage_bytes
+ *   Reference to size in bytes of `d_temp_storage` allocation
+ *
+ * @param[in] d_in
+ *   Pointer to the input sequence of data items
+ *
+ * @param[out] d_out
+ *   Pointer to the output aggregate
+ *
+ * @param[in] num_segments
+ *   The number of segments that comprise the segmented reduction data
+ *
+ * @param[in] d_begin_offsets
+ *   Random-access input iterator to the sequence of beginning offsets of
+ *   length `num_segments`, such that `d_begin_offsets[i]` is the first
+ *   element of the *i*th data segment in `d_in`
+ *
+ * @param[in] d_end_offsets
+ *   Random-access input iterator to the sequence of ending offsets of length
+ *   `num_segments`, such that `d_end_offsets[i] - 1` is the last element of
+ *   the *i*th data segment in `d_in`. If
+ *   `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th segment is
+ *   considered empty.
+ *
+ * @param[in] stream
+ *   **[optional]** CUDA stream to launch kernels within.
+ *   Default is stream0.
+ *
+ * @param[in] debug_synchronous
+ *   **[optional]** Whether or not to synchronize the stream after every
+ *   kernel launch to check for errors. Also causes launch configurations to
+ *   be printed to the console. Default is `false`.
+ */
+ template <typename InputIteratorT,
+           typename OutputIteratorT,
+           typename BeginOffsetIteratorT,
+           typename EndOffsetIteratorT>
+ CUB_RUNTIME_FUNCTION static cudaError_t
+ Min(void *d_temp_storage,
+     size_t &temp_storage_bytes,
+     InputIteratorT d_in,
+     OutputIteratorT d_out,
+     int num_segments,
+     BeginOffsetIteratorT d_begin_offsets,
+     EndOffsetIteratorT d_end_offsets,
+     cudaStream_t stream = 0,
+     bool debug_synchronous = false)
+ {
+   // Signed integer type for global offsets
+   using OffsetT = int;
+
+   // The input value type
+   using InputT = cub::detail::value_t<InputIteratorT>;
+
+   return DispatchSegmentedReduce<
+     InputIteratorT,
+     OutputIteratorT,
+     BeginOffsetIteratorT,
+     EndOffsetIteratorT,
+     OffsetT,
+     cub::Min>::Dispatch(d_temp_storage,
+                         temp_storage_bytes,
+                         d_in,
+                         d_out,
+                         num_segments,
+                         d_begin_offsets,
+                         d_end_offsets,
+                         cub::Min(),
+                         // replace with std::numeric_limits<T>::max() when
+                         // C++11 support is more prevalent
+                         Traits<InputT>::Max(),
+                         stream,
+                         debug_synchronous);
+ }
+
+ /**
+ * @brief Finds the first device-wide minimum in each segment using the
+ *        less-than ('<') operator, also returning the in-segment index of
+ *        that item.
+ *
+ * @par
+ * - The output value type of `d_out` is cub::KeyValuePair `<int, T>`
+ *   (assuming the value type of `d_in` is `T`)
+ * - The minimum of the *i*th segment is written to
+ *   `d_out[i].value` and its offset in that segment is written to
+ *   `d_out[i].key`.
+ * - The `{1, std::numeric_limits<T>::max()}` tuple is produced for
+ *   zero-length inputs
+ * - When the input is a contiguous sequence of segments, a single sequence
+ *   `segment_offsets` (of length `num_segments + 1`) can be aliased for both
+ *   the `d_begin_offsets` and `d_end_offsets` parameters (where the latter
+ *   is specified as `segment_offsets + 1`).
+ * - Does not support `<` operators that are non-commutative.
+ * - @devicestorage
+ *
+ * @par Snippet
+ * The code snippet below illustrates the argmin-reduction of a device vector
+ * of `int` data elements.
+ * @par
+ * @code
+ * #include <cub/cub.cuh>
+ * // or equivalently <cub/device/device_segmented_reduce.cuh>
+ *
+ * // Declare, allocate, and initialize device-accessible pointers
+ * // for input and output
+ * int num_segments;   // e.g., 3
+ * int *d_offsets;     // e.g., [0, 3, 3, 7]
+ * int *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
+ * KeyValuePair<int, int> *d_out;  // e.g., [{-,-}, {-,-}, {-,-}]
+ * ...
+ *
+ * // Determine temporary device storage requirements
+ * void *d_temp_storage = NULL;
+ * size_t temp_storage_bytes = 0;
+ * cub::DeviceSegmentedReduce::ArgMin(
+ *   d_temp_storage, temp_storage_bytes, d_in, d_out,
+ *   num_segments, d_offsets, d_offsets + 1);
+ *
+ * // Allocate temporary storage
+ * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+ *
+ * // Run argmin-reduction
+ * cub::DeviceSegmentedReduce::ArgMin(
+ *   d_temp_storage, temp_storage_bytes, d_in, d_out,
+ *   num_segments, d_offsets, d_offsets + 1);
+ *
+ * // d_out <-- [{1,6}, {1,INT_MAX}, {2,0}]
+ * @endcode
+ *
+ * @tparam InputIteratorT
+ *   **[inferred]** Random-access input iterator type for reading input items
+ *   (of some type `T`) \iterator
+ *
+ * @tparam OutputIteratorT
+ *   **[inferred]** Output iterator type for recording the reduced aggregate
+ *   (having value type `KeyValuePair<int, T>`) \iterator
+ *
+ * @tparam BeginOffsetIteratorT
+ *   **[inferred]** Random-access input iterator type for reading segment
+ *   beginning offsets \iterator
+ *
+ * @tparam EndOffsetIteratorT
+ *   **[inferred]** Random-access input iterator type for reading segment
+ *   ending offsets \iterator
+ *
+ * @param[in] d_temp_storage
+ *   Device-accessible allocation of temporary storage. When `nullptr`, the
+ *   required allocation size is written to `temp_storage_bytes` and no work
+ *   is done.
+ *
+ * @param[in,out] temp_storage_bytes
+ *   Reference to size in bytes of `d_temp_storage` allocation
+ *
+ * @param[in] d_in
+ *   Pointer to the input sequence of data items
+ *
+ * @param[out] d_out
+ *   Pointer to the output aggregate
+ *
+ * @param[in] num_segments
+ *   The number of segments that comprise the segmented reduction data
+ *
+ * @param[in] d_begin_offsets
+ *   Random-access input iterator to the sequence of beginning offsets of
+ *   length `num_segments`, such that `d_begin_offsets[i]` is the first
+ *   element of the *i*th data segment in `d_in`
+ *
+ * @param[in] d_end_offsets
+ *   Random-access input iterator to the sequence of ending offsets of length
+ *   `num_segments`, such that `d_end_offsets[i] - 1` is the last element of
+ *   the *i*th data segment in `d_in`. If
+ *   `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th segment is
+ *   considered empty.
+ *
+ * @param[in] stream
+ *   **[optional]** CUDA stream to launch kernels within.
+ *   Default is stream0.
+ *
+ * @param[in] debug_synchronous
+ *   **[optional]** Whether or not to synchronize the stream after every
+ *   kernel launch to check for errors. Also causes launch configurations to
+ *   be printed to the console. Default is `false`.
+ */
+ template <typename InputIteratorT,
+           typename OutputIteratorT,
+           typename BeginOffsetIteratorT,
+           typename EndOffsetIteratorT>
+ CUB_RUNTIME_FUNCTION static cudaError_t
+ ArgMin(void *d_temp_storage,
+        size_t &temp_storage_bytes,
+        InputIteratorT d_in,
+        OutputIteratorT d_out,
+        int num_segments,
+        BeginOffsetIteratorT d_begin_offsets,
+        EndOffsetIteratorT d_end_offsets,
+        cudaStream_t stream = 0,
+        bool debug_synchronous = false)
+ {
+   // Signed integer type for global offsets
+   using OffsetT = int;
+
+   // The input type
+   using InputValueT = cub::detail::value_t<InputIteratorT>;
+
+   // The output tuple type
+   using OutputTupleT =
+     cub::detail::non_void_value_t<OutputIteratorT,
+                                   KeyValuePair<OffsetT, InputValueT>>;
+
+   // The output value type
+   using OutputValueT = typename OutputTupleT::Value;
+
+   // Wrapped input iterator to produce index-value tuples
+   using ArgIndexInputIteratorT =
+     ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT>;
+
+   ArgIndexInputIteratorT d_indexed_in(d_in);
+
+   // Initial value, replace with std::numeric_limits<T>::max() when C++11
+   // support is more prevalent
+   OutputTupleT initial_value(1, Traits<InputValueT>::Max());
+
+   return DispatchSegmentedReduce<ArgIndexInputIteratorT,
+                                  OutputIteratorT,
+                                  BeginOffsetIteratorT,
+                                  EndOffsetIteratorT,
+                                  OffsetT,
+                                  cub::ArgMin>::Dispatch(d_temp_storage,
+                                                         temp_storage_bytes,
+                                                         d_indexed_in,
+                                                         d_out,
+                                                         num_segments,
+                                                         d_begin_offsets,
+                                                         d_end_offsets,
+                                                         cub::ArgMin(),
+                                                         initial_value,
+                                                         stream,
+                                                         debug_synchronous);
+ }
+
+ /**
+ * @brief Computes a device-wide segmented maximum using the greater-than
+ *        (`>`) operator.
+ *
+ * @par
+ * - Uses `std::numeric_limits<T>::lowest()` as the initial value of the
+ *   reduction.
+ * - When the input is a contiguous sequence of segments, a single sequence
+ *   `segment_offsets` (of length `num_segments + 1`) can be aliased
+ *   for both the `d_begin_offsets` and `d_end_offsets` parameters (where
+ *   the latter is specified as `segment_offsets + 1`).
+ * - Does not support `>` operators that are non-commutative.
+ * - @devicestorage
+ *
+ * @par Snippet
+ * The code snippet below illustrates the max-reduction of a device vector
+ * of `int` data elements.
+ * @par
+ * @code
+ * #include <cub/cub.cuh>
+ * // or equivalently <cub/device/device_segmented_reduce.cuh>
+ *
+ * // Declare, allocate, and initialize device-accessible pointers
+ * // for input and output
+ * int num_segments;   // e.g., 3
+ * int *d_offsets;     // e.g., [0, 3, 3, 7]
+ * int *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
+ * int *d_out;         // e.g., [-, -, -]
+ * ...
+ *
+ * // Determine temporary device storage requirements
+ * void *d_temp_storage = NULL;
+ * size_t temp_storage_bytes = 0;
+ * cub::DeviceSegmentedReduce::Max(
+ *   d_temp_storage, temp_storage_bytes, d_in, d_out,
+ *   num_segments, d_offsets, d_offsets + 1);
+ *
+ * // Allocate temporary storage
+ * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+ *
+ * // Run max-reduction
+ * cub::DeviceSegmentedReduce::Max(
+ *   d_temp_storage, temp_storage_bytes, d_in, d_out,
+ *   num_segments, d_offsets, d_offsets + 1);
+ *
+ * // d_out <-- [8, INT_MIN, 9]
+ * @endcode
+ *
+ * @tparam InputIteratorT
+ *   **[inferred]** Random-access input iterator type for reading input
+ *   items \iterator
+ *
+ * @tparam OutputIteratorT
+ *   **[inferred]** Output iterator type for recording the reduced
+ *   aggregate \iterator
+ *
+ * @tparam BeginOffsetIteratorT
+ *   **[inferred]** Random-access input iterator type for reading segment
+ *   beginning offsets \iterator
+ *
+ * @tparam EndOffsetIteratorT
+ *   **[inferred]** Random-access input iterator type for reading segment
+ *   ending offsets \iterator
+ *
+ * @param[in] d_temp_storage
+ *   Device-accessible allocation of temporary storage. When `nullptr`, the
+ *   required allocation size is written to `temp_storage_bytes` and no work
+ *   is done.
+ *
+ * @param[in,out] temp_storage_bytes
+ *   Reference to size in bytes of `d_temp_storage` allocation
+ *
+ * @param[in] d_in
+ *   Pointer to the input sequence of data items
+ *
+ * @param[out] d_out
+ *   Pointer to the output aggregate
+ *
+ * @param[in] num_segments
+ *   The number of segments that comprise the segmented reduction data
+ *
+ * @param[in] d_begin_offsets
+ *   Random-access input iterator to the sequence of beginning offsets of
+ *   length `num_segments`, such that `d_begin_offsets[i]` is the first
+ *   element of the *i*th data segment in `d_in`
+ *
+ * @param[in] d_end_offsets
+ *   Random-access input iterator to the sequence of ending offsets of length
+ *   `num_segments`, such that `d_end_offsets[i] - 1` is the last element of
+ *   the *i*th data segment in `d_in`. If
+ *   `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th segment is
+ *   considered empty.
+ *
+ * @param[in] stream
+ *   **[optional]** CUDA stream to launch kernels within.
+ *   Default is stream0.
+ *
+ * @param[in] debug_synchronous
+ *   **[optional]** Whether or not to synchronize the stream after every
+ *   kernel launch to check for errors. Also causes launch configurations to
+ *   be printed to the console. Default is `false`.
+   */
+  template <typename InputIteratorT,
+            typename OutputIteratorT,
+            typename BeginOffsetIteratorT,
+            typename EndOffsetIteratorT>
+  CUB_RUNTIME_FUNCTION static cudaError_t
+  Max(void *d_temp_storage,
+      size_t &temp_storage_bytes,
+      InputIteratorT d_in,
+      OutputIteratorT d_out,
+      int num_segments,
+      BeginOffsetIteratorT d_begin_offsets,
+      EndOffsetIteratorT d_end_offsets,
+      cudaStream_t stream = 0,
+      bool debug_synchronous = false)
+  {
+    // Signed integer type for global offsets
+    using OffsetT = int;
+
+    // The input value type
+    using InputT = cub::detail::value_t<InputIteratorT>;
+
+    return DispatchSegmentedReduce<
+      InputIteratorT,
+      OutputIteratorT,
+      BeginOffsetIteratorT,
+      EndOffsetIteratorT,
+      OffsetT,
+      cub::Max>::Dispatch(d_temp_storage,
+                          temp_storage_bytes,
+                          d_in,
+                          d_out,
+                          num_segments,
+                          d_begin_offsets,
+                          d_end_offsets,
+                          cub::Max(),
+                          Traits<InputT>::Lowest(), // replace with
+                                                    // std::numeric_limits<T>::lowest()
+                                                    // when C++11 support is
+                                                    // more prevalent
+                          stream,
+                          debug_synchronous);
+  }
+
+  /**
+   * @brief Finds the first device-wide maximum in each segment using the
+   *        greater-than (`>`) operator, also returning the in-segment index of
+   *        that item
+   *
+   * @par
+   * - The output value type of `d_out` is cub::KeyValuePair `<int, T>`
+   *   (assuming the value type of `d_in` is `T`)
+   * - The maximum of the *i*th segment is written to
+   *   `d_out[i].value` and its offset in that segment is written to
+   *   `d_out[i].key`.
+   * - The `{1, std::numeric_limits<T>::lowest()}` tuple is produced for
+   *   zero-length inputs
+   * - When the input is a contiguous sequence of segments, a single sequence
+   *   `segment_offsets` (of length `num_segments + 1`) can be aliased
+   *   for both the `d_begin_offsets` and `d_end_offsets` parameters (where
+   *   the latter is specified as `segment_offsets + 1`).
+   * - Does not support `>` operators that are non-commutative.
+   * - @devicestorage
+   *
+   * @par Snippet
+   * The code snippet below illustrates the argmax-reduction of a device
+   * vector of `int` data elements.
+   * @par
+   * @code
+   * #include <cub/cub.cuh>
+   * // or equivalently <cub/device/device_segmented_reduce.cuh>
+   *
+   * // Declare, allocate, and initialize device-accessible pointers
+   * // for input and output
+   * int num_segments;              // e.g., 3
+   * int *d_offsets;                // e.g., [0, 3, 3, 7]
+   * int *d_in;                     // e.g., [8, 6, 7, 5, 3, 0, 9]
+   * KeyValuePair<int, int> *d_out; // e.g., [{-,-}, {-,-}, {-,-}]
+   * ...
+   *
+   * // Determine temporary device storage requirements
+   * void *d_temp_storage = NULL;
+   * size_t temp_storage_bytes = 0;
+   * cub::DeviceSegmentedReduce::ArgMax(
+   *   d_temp_storage, temp_storage_bytes, d_in, d_out,
+   *   num_segments, d_offsets, d_offsets + 1);
+   *
+   * // Allocate temporary storage
+   * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+   *
+   * // Run argmax-reduction
+   * cub::DeviceSegmentedReduce::ArgMax(
+   *   d_temp_storage, temp_storage_bytes, d_in, d_out,
+   *   num_segments, d_offsets, d_offsets + 1);
+   *
+   * // d_out <-- [{0,8}, {1,INT_MIN}, {3,9}]
+   * @endcode
+   *
+   * @tparam InputIteratorT
+   *   **[inferred]** Random-access input iterator type for reading input items
+   *   (of some type `T`) \iterator
+   *
+   * @tparam OutputIteratorT
+   *   **[inferred]** Output iterator type for recording the reduced aggregate
+   *   (having value type `KeyValuePair<int, T>`) \iterator
+   *
+   * @tparam BeginOffsetIteratorT
+   *   **[inferred]** Random-access input iterator type for reading segment
+   *   beginning offsets \iterator
+   *
+   * @tparam EndOffsetIteratorT
+   *   **[inferred]** Random-access input iterator type for reading segment
+   *   ending offsets \iterator
+   *
+   * @param[in] d_temp_storage
+   *   Device-accessible allocation of temporary storage.
When `nullptr`, the
+   *   required allocation size is written to `temp_storage_bytes` and no work
+   *   is done.
+   *
+   * @param[in,out] temp_storage_bytes
+   *   Reference to size in bytes of `d_temp_storage` allocation
+   *
+   * @param[in] d_in
+   *   Pointer to the input sequence of data items
+   *
+   * @param[out] d_out
+   *   Pointer to the output aggregate
+   *
+   * @param[in] num_segments
+   *   The number of segments that comprise the input data
+   *
+   * @param[in] d_begin_offsets
+   *   Random-access input iterator to the sequence of beginning offsets of
+   *   length `num_segments`, such that `d_begin_offsets[i]` is the first
+   *   element of the *i*th data segment in `d_in`
+   *
+   * @param[in] d_end_offsets
+   *   Random-access input iterator to the sequence of ending offsets of
+   *   length `num_segments`, such that `d_end_offsets[i] - 1` is the last
+   *   element of the *i*th data segment in `d_in`. If
+   *   `d_end_offsets[i] <= d_begin_offsets[i]`, the *i*th segment is
+   *   considered empty.
+   *
+   * @param[in] stream
+   *   **[optional]** CUDA stream to launch kernels within.
+   *   Default is stream<sub>0</sub>.
+   *
+   * @param[in] debug_synchronous
+   *   **[optional]** Whether or not to synchronize the stream after every
+   *   kernel launch to check for errors. Also causes launch configurations to
+   *   be printed to the console. Default is `false`.
+   */
+  template <typename InputIteratorT,
+            typename OutputIteratorT,
+            typename BeginOffsetIteratorT,
+            typename EndOffsetIteratorT>
+  CUB_RUNTIME_FUNCTION static cudaError_t
+  ArgMax(void *d_temp_storage,
+         size_t &temp_storage_bytes,
+         InputIteratorT d_in,
+         OutputIteratorT d_out,
+         int num_segments,
+         BeginOffsetIteratorT d_begin_offsets,
+         EndOffsetIteratorT d_end_offsets,
+         cudaStream_t stream = 0,
+         bool debug_synchronous = false)
+  {
+    // Signed integer type for global offsets
+    using OffsetT = int;
+
+    // The input type
+    using InputValueT = cub::detail::value_t<InputIteratorT>;
+
+    // The output tuple type
+    using OutputTupleT =
+      cub::detail::non_void_value_t<OutputIteratorT,
+                                    KeyValuePair<OffsetT, InputValueT>>;
+
+    // The output value type
+    using OutputValueT = typename OutputTupleT::Value;
+
+    // Wrapped input iterator to produce index-value tuples
+    using ArgIndexInputIteratorT =
+      ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT>;
+
+    ArgIndexInputIteratorT d_indexed_in(d_in);
+
+    // Initial value
+    OutputTupleT initial_value(1, Traits<InputValueT>::Lowest()); // replace
+                                                                  // with
+                                                                  // std::numeric_limits<T>::lowest()
+                                                                  // when C++11
+                                                                  // support is
+                                                                  // more
+                                                                  // prevalent
+
+    return DispatchSegmentedReduce<ArgIndexInputIteratorT,
+                                   OutputIteratorT,
+                                   BeginOffsetIteratorT,
+                                   EndOffsetIteratorT,
+                                   OffsetT,
+                                   cub::ArgMax>::Dispatch(d_temp_storage,
+                                                          temp_storage_bytes,
+                                                          d_indexed_in,
+                                                          d_out,
+                                                          num_segments,
+                                                          d_begin_offsets,
+                                                          d_end_offsets,
+                                                          cub::ArgMax(),
+                                                          initial_value,
+                                                          stream,
+                                                          debug_synchronous);
+  }
 };
 
 CUB_NAMESPACE_END

From 5732130c783784da755d3c88cccce74c4da4d502 Mon Sep 17 00:00:00 2001
From: Georgy Evtushenko
Date: Tue, 31 May 2022 17:58:45 +0400
Subject: [PATCH 4/4] Add in-place guarantees for segmented reduce

---
 cub/device/device_segmented_reduce.cuh | 43 ++++++++++++++++++++------
 1 file changed, 34 insertions(+), 9 deletions(-)

diff --git a/cub/device/device_segmented_reduce.cuh b/cub/device/device_segmented_reduce.cuh
index fd830efea6..3d16f30fb9 100644
--- a/cub/device/device_segmented_reduce.cuh
+++ b/cub/device/device_segmented_reduce.cuh
@@ -77,6 +77,11 @@ struct DeviceSegmentedReduce
    *   `segment_offsets` (of length `num_segments + 1`) can be aliased
    *   for both the `d_begin_offsets` and `d_end_offsets` parameters (where
    *   the latter is specified as `segment_offsets + 1`).
+   * - Let `s` be in `[0, num_segments)`. The range
+   *   `[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])` shall not
+   *   overlap `[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])`,
+   *   `[d_begin_offsets, d_begin_offsets + num_segments)` nor
+   *   `[d_end_offsets, d_end_offsets + num_segments)`.
    * - @devicestorage
    *
    * @par Snippet
@@ -245,6 +250,11 @@ struct DeviceSegmentedReduce
    *   for both the `d_begin_offsets` and `d_end_offsets` parameters (where
    *   the latter is specified as `segment_offsets + 1`).
    * - Does not support `+` operators that are non-commutative.
+   * - Let `s` be in `[0, num_segments)`. The range
+   *   `[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])` shall not
+   *   overlap `[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])`,
+   *   `[d_begin_offsets, d_begin_offsets + num_segments)` nor
+   *   `[d_end_offsets, d_end_offsets + num_segments)`.
    * - @devicestorage
    *
    * @par Snippet
@@ -390,6 +400,11 @@ struct DeviceSegmentedReduce
    *   the `d_begin_offsets` and `d_end_offsets` parameters (where the latter is
    *   specified as `segment_offsets + 1`).
    * - Does not support `<` operators that are non-commutative.
+   * - Let `s` be in `[0, num_segments)`. The range
+   *   `[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])` shall not
+   *   overlap `[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])`,
+   *   `[d_begin_offsets, d_begin_offsets + num_segments)` nor
+   *   `[d_end_offsets, d_end_offsets + num_segments)`.
    * - @devicestorage
    *
    * @par Snippet
@@ -542,6 +557,11 @@ struct DeviceSegmentedReduce
    *   the `d_begin_offsets` and `d_end_offsets` parameters (where the latter
    *   is specified as `segment_offsets + 1`).
    * - Does not support `<` operators that are non-commutative.
+   * - Let `s` be in `[0, num_segments)`. The range
+   *   `[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])` shall not
+   *   overlap `[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])`,
+   *   `[d_begin_offsets, d_begin_offsets + num_segments)` nor
+   *   `[d_end_offsets, d_end_offsets + num_segments)`.
    * - @devicestorage
    *
    * @par Snippet
@@ -705,6 +725,11 @@ struct DeviceSegmentedReduce
    *   for both the `d_begin_offsets` and `d_end_offsets` parameters (where
    *   the latter is specified as `segment_offsets + 1`).
    * - Does not support `>` operators that are non-commutative.
+   * - Let `s` be in `[0, num_segments)`. The range
+   *   `[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])` shall not
+   *   overlap `[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])`,
+   *   `[d_begin_offsets, d_begin_offsets + num_segments)` nor
+   *   `[d_end_offsets, d_end_offsets + num_segments)`.
    * - @devicestorage
    *
    * @par Snippet
@@ -845,7 +870,7 @@ struct DeviceSegmentedReduce
    *        that item
    *
    * @par
-   * - The output value type of `d_out` is cub::KeyValuePair `<int, T>`
+   * - The output value type of `d_out` is `cub::KeyValuePair<int, T>`
    *   (assuming the value type of `d_in` is `T`)
    * - The maximum of the *i*th segment is written to
    *   `d_out[i].value` and its offset in that segment is written to
@@ -857,6 +882,11 @@ struct DeviceSegmentedReduce
    *   for both the `d_begin_offsets` and `d_end_offsets` parameters (where
    *   the latter is specified as `segment_offsets + 1`).
    * - Does not support `>` operators that are non-commutative.
+   * - Let `s` be in `[0, num_segments)`. The range
+   *   `[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])` shall not
+   *   overlap `[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])`,
+   *   `[d_begin_offsets, d_begin_offsets + num_segments)` nor
+   *   `[d_end_offsets, d_end_offsets + num_segments)`.
    * - @devicestorage
    *
    * @par Snippet
@@ -983,14 +1013,9 @@ struct DeviceSegmentedReduce
 
     ArgIndexInputIteratorT d_indexed_in(d_in);
 
-    // Initial value
-    OutputTupleT initial_value(1, Traits<InputValueT>::Lowest()); // replace
-                                                                  // with
-                                                                  // std::numeric_limits<T>::lowest()
-                                                                  // when C++11
-                                                                  // support is
-                                                                  // more
-                                                                  // prevalent
+    // Initial value, replace with std::numeric_limits<T>::lowest() when C++11
+    // support is more prevalent
+    OutputTupleT initial_value(1, Traits<InputValueT>::Lowest());
 
     return DispatchSegmentedReduce
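
As a quick end-to-end check of the interface documented in these patches, the
standalone sketch below (not part of the patch series; the file name and host
values are illustrative) expands the ArgMax doc snippet with the allocation
and copy steps it elides. It uses only the `cub::DeviceSegmentedReduce::ArgMax`
signature and the `KeyValuePair` members (`key`, `value`) shown above.

// argmax_smoke.cu -- illustrative sketch, not part of the patch series
#include <cub/cub.cuh>
#include <cstdio>

int main()
{
  // Mirrors the doc snippet: 3 segments over 7 items, segment 1 empty
  const int num_segments = 3;
  int h_offsets[] = {0, 3, 3, 7};
  int h_in[]      = {8, 6, 7, 5, 3, 0, 9};

  int *d_offsets = nullptr;
  int *d_in      = nullptr;
  cub::KeyValuePair<int, int> *d_out = nullptr;
  cudaMalloc(&d_offsets, sizeof(h_offsets));
  cudaMalloc(&d_in, sizeof(h_in));
  cudaMalloc(&d_out, num_segments * sizeof(*d_out));
  cudaMemcpy(d_offsets, h_offsets, sizeof(h_offsets), cudaMemcpyHostToDevice);
  cudaMemcpy(d_in, h_in, sizeof(h_in), cudaMemcpyHostToDevice);

  // Two-phase idiom: the first call only sizes the temporary storage.
  // Note the aliased begin/end offset sequences (d_offsets / d_offsets + 1),
  // which the documentation above explicitly allows.
  void *d_temp_storage = nullptr;
  size_t temp_storage_bytes = 0;
  cub::DeviceSegmentedReduce::ArgMax(d_temp_storage, temp_storage_bytes,
                                     d_in, d_out, num_segments,
                                     d_offsets, d_offsets + 1);
  cudaMalloc(&d_temp_storage, temp_storage_bytes);
  cub::DeviceSegmentedReduce::ArgMax(d_temp_storage, temp_storage_bytes,
                                     d_in, d_out, num_segments,
                                     d_offsets, d_offsets + 1);

  cub::KeyValuePair<int, int> h_out[num_segments];
  cudaMemcpy(h_out, d_out, sizeof(h_out), cudaMemcpyDeviceToHost);
  for (int s = 0; s < num_segments; ++s)
  {
    // Expected per the docs: {0,8}, {1,INT_MIN}, {3,9}
    std::printf("segment %d: max %d at in-segment offset %d\n",
                s, h_out[s].value, h_out[s].key);
  }

  cudaFree(d_temp_storage);
  cudaFree(d_out);
  cudaFree(d_in);
  cudaFree(d_offsets);
  return 0;
}

Built against the patched headers (e.g., `nvcc argmax_smoke.cu -o argmax_smoke`),
this should print the three pairs from the snippet, with the empty middle
segment yielding the `{1, std::numeric_limits<T>::lowest()}` tuple described
in the ArgMax documentation.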