From cedaee6a979240a1ee58a3d92e2c5cdfab163329 Mon Sep 17 00:00:00 2001
From: Georgy Evtushenko
Date: Tue, 31 May 2022 15:38:55 +0400
Subject: [PATCH 1/4] Reformat reduce docs

---
 cub/device/device_reduce.cuh | 1652 ++++++++++++++++++++--------------
 1 file changed, 978 insertions(+), 674 deletions(-)

diff --git a/cub/device/device_reduce.cuh b/cub/device/device_reduce.cuh
index 9f70a111a4..98335521da 100644
--- a/cub/device/device_reduce.cuh
+++ b/cub/device/device_reduce.cuh
@@ -1,7 +1,6 @@
-
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill. All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -14,10 +13,10 @@
 *     names of its contributors may be used to endorse or promote products
 *     derived from this software without specific prior written permission.
 *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
@@ -28,706 +27,1011 @@
 ******************************************************************************/

 /**
- * \file
- * cub::DeviceReduce provides device-wide, parallel operations for computing a reduction across a sequence of data items residing within device-accessible memory.
+ * @file cub::DeviceReduce provides device-wide, parallel operations for
+ *       computing a reduction across a sequence of data items residing within
+ *       device-accessible memory.
 */

 #pragma once

-#include <stdio.h>
 #include <iterator>
 #include <limits>

-#include "../iterator/arg_index_input_iterator.cuh"
-#include "dispatch/dispatch_reduce.cuh"
-#include "dispatch/dispatch_reduce_by_key.cuh"
-#include "../config.cuh"
+#include <cub/config.cuh>
+#include <cub/device/dispatch/dispatch_reduce.cuh>
+#include <cub/device/dispatch/dispatch_reduce_by_key.cuh>
+#include <cub/iterator/arg_index_input_iterator.cuh>

 CUB_NAMESPACE_BEGIN

 /**
- * \brief DeviceReduce provides device-wide, parallel operations for computing a reduction across a sequence of data items residing within device-accessible memory. ![](reduce_logo.png)
- * \ingroup SingleModule
+ * @brief DeviceReduce provides device-wide, parallel operations for computing
+ *        a reduction across a sequence of data items residing within
+ *        device-accessible memory. ![](reduce_logo.png)
+ * @ingroup SingleModule
 *
- * \par Overview
- * A reduction (or fold)
- * uses a binary combining operator to compute a single aggregate from a sequence of input elements.
+ * @par Overview
+ * A *reduction*
+ * (or *fold*) uses a binary combining operator to compute a single aggregate
+ * from a sequence of input elements.
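+ *
+ * @par
+ * As a point of reference only, a device-wide reduction computes the same
+ * result as the following sequential sketch, where `op` and `init` stand in
+ * for the binary combining operator and initial value (the parallel
+ * evaluation order differs):
+ * @code
+ * T result = init;
+ * for (int i = 0; i < num_items; ++i)
+ * {
+ *     result = op(result, d_in[i]);
+ * }
+ * // result holds the reduction aggregate
+ * @endcode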
* - * \par Usage Considerations - * \cdp_class{DeviceReduce} + * @par Usage Considerations + * @cdp_class{DeviceReduce} * - * \par Performance - * \linear_performance{reduction, reduce-by-key, and run-length encode} + * @par Performance + * @linear_performance{reduction, reduce-by-key, and run-length encode} * - * \par + * @par * The following chart illustrates DeviceReduce::Sum * performance across different CUDA architectures for \p int32 keys. * - * \image html reduce_int32.png + * @image html reduce_int32.png * - * \par + * @par * The following chart illustrates DeviceReduce::ReduceByKey (summation) - * performance across different CUDA architectures for \p fp32 - * values. Segments are identified by \p int32 keys, and have lengths uniformly sampled from [1,1000]. + * performance across different CUDA architectures for `fp32` values. Segments + * are identified by `int32` keys, and have lengths uniformly sampled + * from `[1, 1000]`. * - * \image html reduce_by_key_fp32_len_500.png + * @image html reduce_by_key_fp32_len_500.png * - * \par - * \plots_below + * @par + * @plots_below * */ struct DeviceReduce { - /** - * \brief Computes a device-wide reduction using the specified binary \p reduction_op functor and initial value \p init. - * - * \par - * - Does not support binary reduction operators that are non-commutative. - * - Provides "run-to-run" determinism for pseudo-associative reduction - * (e.g., addition of floating point types) on the same GPU device. - * However, results for pseudo-associative reduction may be inconsistent - * from one device to a another device of a different compute-capability - * because CUB can employ different tile-sizing for different architectures. - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates a user-defined min-reduction of a device vector of \p int data elements. - * \par - * \code - * #include // or equivalently - * - * // CustomMin functor - * struct CustomMin - * { - * template - * __device__ __forceinline__ - * T operator()(const T &a, const T &b) const { - * return (b < a) ? b : a; - * } - * }; - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_items; // e.g., 7 - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_out; // e.g., [-] - * CustomMin min_op; - * int init; // e.g., INT_MAX - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, min_op, init); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run reduction - * cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, min_op, init); - * - * // d_out <-- [0] - * - * \endcode - * - * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator - * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate \iterator - * \tparam ReductionOpT [inferred] Binary reduction functor type having member T operator()(const T &a, const T &b) - * \tparam T [inferred] Data element type that is convertible to the \p value type of \p InputIteratorT - */ - template < - typename InputIteratorT, - typename OutputIteratorT, - typename ReductionOpT, - typename T> - CUB_RUNTIME_FUNCTION - static cudaError_t Reduce( - void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. 
When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output aggregate - int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) - ReductionOpT reduction_op, ///< [in] Binary reduction functor - T init, ///< [in] Initial value of the reduction - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. - { - // Signed integer type for global offsets - typedef int OffsetT; - - return DispatchReduce::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - num_items, - reduction_op, - init, - stream, - debug_synchronous); - } - - - /** - * \brief Computes a device-wide sum using the addition (\p +) operator. - * - * \par - * - Uses \p 0 as the initial value of the reduction. - * - Does not support \p + operators that are non-commutative.. - * - Provides "run-to-run" determinism for pseudo-associative reduction - * (e.g., addition of floating point types) on the same GPU device. - * However, results for pseudo-associative reduction may be inconsistent - * from one device to a another device of a different compute-capability - * because CUB can employ different tile-sizing for different architectures. - * - \devicestorage - * - * \par Performance - * The following charts illustrate saturated sum-reduction performance across different - * CUDA architectures for \p int32 and \p int64 items, respectively. - * - * \image html reduce_int32.png - * \image html reduce_int64.png - * - * \par Snippet - * The code snippet below illustrates the sum-reduction of a device vector of \p int data elements. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_items; // e.g., 7 - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_out; // e.g., [-] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sum-reduction - * cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); - * - * // d_out <-- [38] - * - * \endcode - * - * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator - * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate \iterator - */ - template < - typename InputIteratorT, - typename OutputIteratorT> - CUB_RUNTIME_FUNCTION - static cudaError_t Sum( - void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
- size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output aggregate - int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. - { - // Signed integer type for global offsets - using OffsetT = int; - - // The output value type - using OutputT = - cub::detail::non_void_value_t>; - - return DispatchReduce::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - num_items, - cub::Sum(), - OutputT(), // zero-initialize - stream, - debug_synchronous); - } - - - /** - * \brief Computes a device-wide minimum using the less-than ('<') operator. - * - * \par - * - Uses std::numeric_limits::max() as the initial value of the reduction. - * - Does not support \p < operators that are non-commutative. - * - Provides "run-to-run" determinism for pseudo-associative reduction - * (e.g., addition of floating point types) on the same GPU device. - * However, results for pseudo-associative reduction may be inconsistent - * from one device to a another device of a different compute-capability - * because CUB can employ different tile-sizing for different architectures. - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the min-reduction of a device vector of \p int data elements. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_items; // e.g., 7 - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_out; // e.g., [-] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run min-reduction - * cub::DeviceReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); - * - * // d_out <-- [0] - * - * \endcode - * - * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator - * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate \iterator - */ - template < - typename InputIteratorT, - typename OutputIteratorT> - CUB_RUNTIME_FUNCTION - static cudaError_t Min( - void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output aggregate - int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. 
- bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. - { - // Signed integer type for global offsets - using OffsetT = int; - - // The input value type - using InputT = cub::detail::value_t; - - return DispatchReduce::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - num_items, - cub::Min(), - Traits::Max(), // replace with std::numeric_limits::max() when C++11 support is more prevalent - stream, - debug_synchronous); - } - - - /** - * \brief Finds the first device-wide minimum using the less-than ('<') operator, also returning the index of that item. - * - * \par - * - The output value type of \p d_out is cub::KeyValuePair (assuming the value type of \p d_in is \p T) - * - The minimum is written to d_out.value and its offset in the input array is written to d_out.key. - * - The {1, std::numeric_limits::max()} tuple is produced for zero-length inputs - * - Does not support \p < operators that are non-commutative. - * - Provides "run-to-run" determinism for pseudo-associative reduction - * (e.g., addition of floating point types) on the same GPU device. - * However, results for pseudo-associative reduction may be inconsistent - * from one device to a another device of a different compute-capability - * because CUB can employ different tile-sizing for different architectures. - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the argmin-reduction of a device vector of \p int data elements. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_items; // e.g., 7 - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * KeyValuePair *d_out; // e.g., [{-,-}] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_argmin, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run argmin-reduction - * cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_argmin, num_items); - * - * // d_out <-- [{5, 0}] - * - * \endcode - * - * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items (of some type \p T) \iterator - * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate (having value type cub::KeyValuePair) \iterator - */ - template < - typename InputIteratorT, - typename OutputIteratorT> - CUB_RUNTIME_FUNCTION - static cudaError_t ArgMin( - void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output aggregate - int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. 
Also causes launch configurations to be printed to the console. Default is \p false. - { - // Signed integer type for global offsets - using OffsetT = int; - - // The input type - using InputValueT = cub::detail::value_t; - - // The output tuple type - using OutputTupleT = - cub::detail::non_void_value_t>; - - // The output value type - using OutputValueT = typename OutputTupleT::Value; - - // Wrapped input iterator to produce index-value tuples - using ArgIndexInputIteratorT = - ArgIndexInputIterator; - - ArgIndexInputIteratorT d_indexed_in(d_in); - - // Initial value - OutputTupleT initial_value(1, Traits::Max()); // replace with std::numeric_limits::max() when C++11 support is more prevalent - - return DispatchReduce::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_indexed_in, - d_out, - num_items, - cub::ArgMin(), - initial_value, - stream, - debug_synchronous); - } - - - /** - * \brief Computes a device-wide maximum using the greater-than ('>') operator. - * - * \par - * - Uses std::numeric_limits::lowest() as the initial value of the reduction. - * - Does not support \p > operators that are non-commutative. - * - Provides "run-to-run" determinism for pseudo-associative reduction - * (e.g., addition of floating point types) on the same GPU device. - * However, results for pseudo-associative reduction may be inconsistent - * from one device to a another device of a different compute-capability - * because CUB can employ different tile-sizing for different architectures. - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the max-reduction of a device vector of \p int data elements. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_items; // e.g., 7 - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_out; // e.g., [-] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_max, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run max-reduction - * cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_max, num_items); - * - * // d_out <-- [9] - * - * \endcode - * - * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator - * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate \iterator - */ - template < - typename InputIteratorT, - typename OutputIteratorT> - CUB_RUNTIME_FUNCTION - static cudaError_t Max( - void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output aggregate - int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
- { - // Signed integer type for global offsets - using OffsetT = int; - - // The input value type - using InputT = cub::detail::value_t; - - return DispatchReduce::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - num_items, - cub::Max(), - Traits::Lowest(), // replace with std::numeric_limits::lowest() when C++11 support is more prevalent - stream, - debug_synchronous); - } - - - /** - * \brief Finds the first device-wide maximum using the greater-than ('>') operator, also returning the index of that item - * - * \par - * - The output value type of \p d_out is cub::KeyValuePair (assuming the value type of \p d_in is \p T) - * - The maximum is written to d_out.value and its offset in the input array is written to d_out.key. - * - The {1, std::numeric_limits::lowest()} tuple is produced for zero-length inputs - * - Does not support \p > operators that are non-commutative. - * - Provides "run-to-run" determinism for pseudo-associative reduction - * (e.g., addition of floating point types) on the same GPU device. - * However, results for pseudo-associative reduction may be inconsistent - * from one device to a another device of a different compute-capability - * because CUB can employ different tile-sizing for different architectures. - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the argmax-reduction of a device vector of \p int data elements. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_items; // e.g., 7 - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * KeyValuePair *d_out; // e.g., [{-,-}] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_argmax, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run argmax-reduction - * cub::DeviceReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_argmax, num_items); - * - * // d_out <-- [{6, 9}] - * - * \endcode - * - * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items (of some type \p T) \iterator - * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate (having value type cub::KeyValuePair) \iterator - */ - template < - typename InputIteratorT, - typename OutputIteratorT> - CUB_RUNTIME_FUNCTION - static cudaError_t ArgMax( - void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output aggregate - int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
- { - // Signed integer type for global offsets - using OffsetT = int; - - // The input type - using InputValueT = cub::detail::value_t; - - // The output tuple type - using OutputTupleT = - cub::detail::non_void_value_t>; - - // The output value type - using OutputValueT = typename OutputTupleT::Value; - - // Wrapped input iterator to produce index-value tuples - using ArgIndexInputIteratorT = - ArgIndexInputIterator; - - ArgIndexInputIteratorT d_indexed_in(d_in); - - // Initial value - OutputTupleT initial_value(1, Traits::Lowest()); // replace with std::numeric_limits::lowest() when C++11 support is more prevalent - - return DispatchReduce::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_indexed_in, - d_out, - num_items, - cub::ArgMax(), - initial_value, - stream, - debug_synchronous); - } - - - /** - * \brief Reduces segments of values, where segments are demarcated by corresponding runs of identical keys. - * - * \par - * This operation computes segmented reductions within \p d_values_in using - * the specified binary \p reduction_op functor. The segments are identified by - * "runs" of corresponding keys in \p d_keys_in, where runs are maximal ranges of - * consecutive, identical keys. For the ith run encountered, - * the first key of the run and the corresponding value aggregate of that run are - * written to d_unique_out[i] and d_aggregates_out[i], - * respectively. The total number of runs encountered is written to \p d_num_runs_out. - * - * \par - * - The == equality operator is used to determine whether keys are equivalent - * - Provides "run-to-run" determinism for pseudo-associative reduction - * (e.g., addition of floating point types) on the same GPU device. - * However, results for pseudo-associative reduction may be inconsistent - * from one device to a another device of a different compute-capability - * because CUB can employ different tile-sizing for different architectures. - * - \devicestorage - * - * \par Performance - * The following chart illustrates reduction-by-key (sum) performance across - * different CUDA architectures for \p fp32 and \p fp64 values, respectively. Segments - * are identified by \p int32 keys, and have lengths uniformly sampled from [1,1000]. - * - * \image html reduce_by_key_fp32_len_500.png - * \image html reduce_by_key_fp64_len_500.png - * - * \par - * The following charts are similar, but with segment lengths uniformly sampled from [1,10]: - * - * \image html reduce_by_key_fp32_len_5.png - * \image html reduce_by_key_fp64_len_5.png - * - * \par Snippet - * The code snippet below illustrates the segmented reduction of \p int values grouped - * by runs of associated \p int keys. - * \par - * \code - * #include // or equivalently - * - * // CustomMin functor - * struct CustomMin - * { - * template - * CUB_RUNTIME_FUNCTION __forceinline__ - * T operator()(const T &a, const T &b) const { - * return (b < a) ? b : a; - * } - * }; - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_items; // e.g., 8 - * int *d_keys_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8] - * int *d_values_in; // e.g., [0, 7, 1, 6, 2, 5, 3, 4] - * int *d_unique_out; // e.g., [-, -, -, -, -, -, -, -] - * int *d_aggregates_out; // e.g., [-, -, -, -, -, -, -, -] - * int *d_num_runs_out; // e.g., [-] - * CustomMin reduction_op; - * ... 
- * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_unique_out, d_values_in, d_aggregates_out, d_num_runs_out, reduction_op, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run reduce-by-key - * cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_unique_out, d_values_in, d_aggregates_out, d_num_runs_out, reduction_op, num_items); - * - * // d_unique_out <-- [0, 2, 9, 5, 8] - * // d_aggregates_out <-- [0, 1, 6, 2, 4] - * // d_num_runs_out <-- [5] - * - * \endcode - * - * \tparam KeysInputIteratorT [inferred] Random-access input iterator type for reading input keys \iterator - * \tparam UniqueOutputIteratorT [inferred] Random-access output iterator type for writing unique output keys \iterator - * \tparam ValuesInputIteratorT [inferred] Random-access input iterator type for reading input values \iterator - * \tparam AggregatesOutputIterator [inferred] Random-access output iterator type for writing output value aggregates \iterator - * \tparam NumRunsOutputIteratorT [inferred] Output iterator type for recording the number of runs encountered \iterator - * \tparam ReductionOpT [inferred] Binary reduction functor type having member T operator()(const T &a, const T &b) - */ - template < - typename KeysInputIteratorT, - typename UniqueOutputIteratorT, - typename ValuesInputIteratorT, - typename AggregatesOutputIteratorT, - typename NumRunsOutputIteratorT, - typename ReductionOpT> - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t ReduceByKey( - void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - KeysInputIteratorT d_keys_in, ///< [in] Pointer to the input sequence of keys - UniqueOutputIteratorT d_unique_out, ///< [out] Pointer to the output sequence of unique keys (one key per run) - ValuesInputIteratorT d_values_in, ///< [in] Pointer to the input sequence of corresponding values - AggregatesOutputIteratorT d_aggregates_out, ///< [out] Pointer to the output sequence of value aggregates (one aggregate per run) - NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to total number of runs encountered (i.e., the length of d_unique_out) - ReductionOpT reduction_op, ///< [in] Binary reduction functor - int num_items, ///< [in] Total number of associated key+value pairs (i.e., the length of \p d_in_keys and \p d_in_values) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 
- {
- // Signed integer type for global offsets
- typedef int OffsetT;
-
- // FlagT iterator type (not used)
-
- // Selection op (not used)
-
- // Default == operator
- typedef Equality EqualityOp;
-
- return DispatchReduceByKey<KeysInputIteratorT, UniqueOutputIteratorT, ValuesInputIteratorT, AggregatesOutputIteratorT, NumRunsOutputIteratorT, EqualityOp, ReductionOpT, OffsetT>::Dispatch(
- d_temp_storage,
- temp_storage_bytes,
- d_keys_in,
- d_unique_out,
- d_values_in,
- d_aggregates_out,
- d_num_runs_out,
- EqualityOp(),
- reduction_op,
- num_items,
- stream,
- debug_synchronous);
- }
+ /**
+ * @brief Computes a device-wide reduction using the specified binary
+ * `reduction_op` functor and initial value `init`.
+ *
+ * @par
+ * - Does not support binary reduction operators that are non-commutative.
+ * - Provides "run-to-run" determinism for pseudo-associative reduction
+ * (e.g., addition of floating point types) on the same GPU device.
+ * However, results for pseudo-associative reduction may be inconsistent
+ * from one device to a another device of a different compute-capability
+ * because CUB can employ different tile-sizing for different architectures.
+ * - @devicestorage
+ *
+ * @par Snippet
+ * The code snippet below illustrates a user-defined min-reduction of a
+ * device vector of `int` data elements.
+ * @par
+ * @code
+ * #include <cub/cub.cuh>
+ * // or equivalently <cub/device/device_reduce.cuh>
+ *
+ * // CustomMin functor
+ * struct CustomMin
+ * {
+ *     template <typename T>
+ *     __device__ __forceinline__
+ *     T operator()(const T &a, const T &b) const {
+ *         return (b < a) ? b : a;
+ *     }
+ * };
+ *
+ * // Declare, allocate, and initialize device-accessible pointers for
+ * // input and output
+ * int num_items; // e.g., 7
+ * int *d_in;     // e.g., [8, 6, 7, 5, 3, 0, 9]
+ * int *d_out;    // e.g., [-]
+ * CustomMin min_op;
+ * int init;      // e.g., INT_MAX
+ * ...
+ *
+ * // Determine temporary device storage requirements
+ * void *d_temp_storage = NULL;
+ * size_t temp_storage_bytes = 0;
+ * cub::DeviceReduce::Reduce(
+ *   d_temp_storage, temp_storage_bytes,
+ *   d_in, d_out, num_items, min_op, init);
+ *
+ * // Allocate temporary storage
+ * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+ *
+ * // Run reduction
+ * cub::DeviceReduce::Reduce(
+ *   d_temp_storage, temp_storage_bytes,
+ *   d_in, d_out, num_items, min_op, init);
+ *
+ * // d_out <-- [0]
+ * @endcode
+ *
+ * @tparam InputIteratorT
+ * **[inferred]** Random-access input iterator type for reading input
+ * items \iterator
+ *
+ * @tparam OutputIteratorT
+ * **[inferred]** Output iterator type for recording the reduced
+ * aggregate \iterator
+ *
+ * @tparam ReductionOpT
+ * **[inferred]** Binary reduction functor type having member
+ * `T operator()(const T &a, const T &b)`
+ *
+ * @tparam T
+ * **[inferred]** Data element type that is convertible to the `value` type
+ * of `InputIteratorT`
+ *
+ * @param[in] d_temp_storage
+ * Device-accessible allocation of temporary storage. When `nullptr`, the
+ * required allocation size is written to `temp_storage_bytes` and no work
+ * is done.
+ *
+ * @param[in,out] temp_storage_bytes
+ * Reference to size in bytes of `d_temp_storage` allocation
+ *
+ * @param[in] d_in
+ * Pointer to the input sequence of data items
+ *
+ * @param[out] d_out
+ * Pointer to the output aggregate
+ *
+ * @param[in] num_items
+ * Total number of input items (i.e., length of `d_in`)
+ *
+ * @param[in] reduction_op
+ * Binary reduction functor
+ *
+ * @param[in] init
+ * Initial value of the reduction
+ *
+ * @param[in] stream
+ * **[optional]** CUDA stream to launch kernels within.
+ * Default is stream0.
+ *
+ * @param[in] debug_synchronous
+ * **[optional]** Whether or not to synchronize the stream after every
+ * kernel launch to check for errors. Also causes launch configurations to
+ * be printed to the console. Default is `false`.
+ */
+ template <typename InputIteratorT, typename OutputIteratorT, typename ReductionOpT, typename T>
+ CUB_RUNTIME_FUNCTION static cudaError_t Reduce(void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_in,
+ OutputIteratorT d_out,
+ int num_items,
+ ReductionOpT reduction_op,
+ T init,
+ cudaStream_t stream = 0,
+ bool debug_synchronous = false)
+ {
+ // Signed integer type for global offsets
+ using OffsetT = int;
+
+ return DispatchReduce<InputIteratorT, OutputIteratorT, OffsetT, ReductionOpT>::Dispatch(d_temp_storage,
+ temp_storage_bytes,
+ d_in,
+ d_out,
+ num_items,
+ reduction_op,
+ init,
+ stream,
+ debug_synchronous);
+ }
+
+ /**
+ * @brief Computes a device-wide sum using the addition (`+`) operator.
+ *
+ * @par
+ * - Uses `0` as the initial value of the reduction.
+ * - Does not support `+` operators that are non-commutative.
+ * - Provides "run-to-run" determinism for pseudo-associative reduction
+ * (e.g., addition of floating point types) on the same GPU device.
+ * However, results for pseudo-associative reduction may be inconsistent
+ * from one device to a another device of a different compute-capability
+ * because CUB can employ different tile-sizing for different architectures.
+ * - @devicestorage
+ *
+ * @par Performance
+ * The following charts illustrate saturated sum-reduction performance across
+ * different CUDA architectures for `int32` and `int64` items, respectively.
+ *
+ * @image html reduce_int32.png
+ * @image html reduce_int64.png
+ *
+ * @par Snippet
+ * The code snippet below illustrates the sum-reduction of a device vector
+ * of `int` data elements.
+ * @par
+ * @code
+ * #include <cub/cub.cuh>
+ * // or equivalently <cub/device/device_reduce.cuh>
+ *
+ * // Declare, allocate, and initialize device-accessible pointers
+ * // for input and output
+ * int num_items; // e.g., 7
+ * int *d_in;     // e.g., [8, 6, 7, 5, 3, 0, 9]
+ * int *d_out;    // e.g., [-]
+ * ...
+ *
+ * // Determine temporary device storage requirements
+ * void *d_temp_storage = NULL;
+ * size_t temp_storage_bytes = 0;
+ * cub::DeviceReduce::Sum(
+ *   d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+ *
+ * // Allocate temporary storage
+ * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+ *
+ * // Run sum-reduction
+ * cub::DeviceReduce::Sum(
+ *   d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+ *
+ * // d_out <-- [38]
+ * @endcode
+ *
+ * @tparam InputIteratorT
+ * **[inferred]** Random-access input iterator type for reading input
+ * items \iterator
+ *
+ * @tparam OutputIteratorT
+ * **[inferred]** Output iterator type for recording the reduced
+ * aggregate \iterator
+ *
+ * @param[in] d_temp_storage
+ * Device-accessible allocation of temporary storage. When `nullptr`, the
+ * required allocation size is written to `temp_storage_bytes` and no work
+ * is done.
+ *
+ * @param[in,out] temp_storage_bytes
+ * Reference to size in bytes of `d_temp_storage` allocation
+ *
+ * @param[in] d_in
+ * Pointer to the input sequence of data items
+ *
+ * @param[out] d_out
+ * Pointer to the output aggregate
+ *
+ * @param[in] num_items
+ * Total number of input items (i.e., length of `d_in`)
+ *
+ * @param[in] stream
+ * **[optional]** CUDA stream to launch kernels within.
+ * Default is stream0.
+ *
+ * @param[in] debug_synchronous
+ * **[optional]** Whether or not to synchronize the stream after every
+ * kernel launch to check for errors. Also causes launch configurations to
+ * be printed to the console. Default is `false`.
+ *
+ */
+ template <typename InputIteratorT, typename OutputIteratorT>
+ CUB_RUNTIME_FUNCTION static cudaError_t Sum(void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_in,
+ OutputIteratorT d_out,
+ int num_items,
+ cudaStream_t stream = 0,
+ bool debug_synchronous = false)
+ {
+ // Signed integer type for global offsets
+ using OffsetT = int;
+
+ // The output value type
+ using OutputT =
+ cub::detail::non_void_value_t<OutputIteratorT, cub::detail::value_t<InputIteratorT>>;
+
+ return DispatchReduce<InputIteratorT, OutputIteratorT, OffsetT, cub::Sum>::
+ Dispatch(d_temp_storage,
+ temp_storage_bytes,
+ d_in,
+ d_out,
+ num_items,
+ cub::Sum(),
+ OutputT(), // zero-initialize
+ stream,
+ debug_synchronous);
+ }
+
+ /**
+ * @brief Computes a device-wide minimum using the less-than ('<') operator.
+ *
+ * @par
+ * - Uses `std::numeric_limits<T>::max()` as the initial value of the reduction.
+ * - Does not support `<` operators that are non-commutative.
+ * - Provides "run-to-run" determinism for pseudo-associative reduction
+ * (e.g., addition of floating point types) on the same GPU device.
+ * However, results for pseudo-associative reduction may be inconsistent
+ * from one device to a another device of a different compute-capability
+ * because CUB can employ different tile-sizing for different architectures.
+ * - @devicestorage
+ *
+ * @par Snippet
+ * The code snippet below illustrates the min-reduction of a device vector of
+ * `int` data elements.
+ * @par
+ * @code
+ * #include <cub/cub.cuh>
+ * // or equivalently <cub/device/device_reduce.cuh>
+ *
+ * // Declare, allocate, and initialize device-accessible pointers
+ * // for input and output
+ * int num_items; // e.g., 7
+ * int *d_in;     // e.g., [8, 6, 7, 5, 3, 0, 9]
+ * int *d_out;    // e.g., [-]
+ * ...
+ *
+ * // Determine temporary device storage requirements
+ * void *d_temp_storage = NULL;
+ * size_t temp_storage_bytes = 0;
+ * cub::DeviceReduce::Min(
+ *   d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+ *
+ * // Allocate temporary storage
+ * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+ *
+ * // Run min-reduction
+ * cub::DeviceReduce::Min(
+ *   d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+ *
+ * // d_out <-- [0]
+ * @endcode
+ *
+ * @tparam InputIteratorT
+ * **[inferred]** Random-access input iterator type for reading input
+ * items \iterator
+ *
+ * @tparam OutputIteratorT
+ * **[inferred]** Output iterator type for recording the reduced
+ * aggregate \iterator
+ *
+ * @param[in] d_temp_storage
+ * Device-accessible allocation of temporary storage. When `nullptr`, the
+ * required allocation size is written to `temp_storage_bytes` and no work
+ * is done.
+ *
+ * @param[in,out] temp_storage_bytes
+ * Reference to size in bytes of `d_temp_storage` allocation
+ *
+ * @param[in] d_in
+ * Pointer to the input sequence of data items
+ *
+ * @param[out] d_out
+ * Pointer to the output aggregate
+ *
+ * @param[in] num_items
+ * Total number of input items (i.e., length of `d_in`)
+ *
+ * @param[in] stream
+ * **[optional]** CUDA stream to launch kernels within.
+ * Default is stream0.
+ *
+ * @param[in] debug_synchronous
+ * **[optional]** Whether or not to synchronize the stream after every
+ * kernel launch to check for errors. Also causes launch configurations to
+ * be printed to the console. Default is `false`.
+ *
+ */
+ template <typename InputIteratorT, typename OutputIteratorT>
+ CUB_RUNTIME_FUNCTION static cudaError_t Min(void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_in,
+ OutputIteratorT d_out,
+ int num_items,
+ cudaStream_t stream = 0,
+ bool debug_synchronous = false)
+ {
+ // Signed integer type for global offsets
+ using OffsetT = int;
+
+ // The input value type
+ using InputT = cub::detail::value_t<InputIteratorT>;
+
+ return DispatchReduce<InputIteratorT, OutputIteratorT, OffsetT, cub::Min>::
+ Dispatch(d_temp_storage,
+ temp_storage_bytes,
+ d_in,
+ d_out,
+ num_items,
+ cub::Min(),
+ Traits<InputT>::Max(), // replace with
+                        // std::numeric_limits<T>::max() when
+                        // C++11 support is more prevalent
+ stream,
+ debug_synchronous);
+ }
+
+ /**
+ * @brief Finds the first device-wide minimum using the less-than ('<')
+ * operator, also returning the index of that item.
+ *
+ * @par
+ * - The output value type of `d_out` is cub::KeyValuePair `<int, T>`
+ * (assuming the value type of `d_in` is `T`)
+ * - The minimum is written to `d_out.value` and its offset in the input
+ * array is written to `d_out.key`.
+ * - The `{1, std::numeric_limits<T>::max()}` tuple is produced for
+ * zero-length inputs
+ * - Does not support `<` operators that are non-commutative.
+ * - Provides "run-to-run" determinism for pseudo-associative reduction
+ * (e.g., addition of floating point types) on the same GPU device.
+ * However, results for pseudo-associative reduction may be inconsistent
+ * from one device to a another device of a different compute-capability
+ * because CUB can employ different tile-sizing for different architectures.
+ * - @devicestorage
+ *
+ * @par Snippet
+ * The code snippet below illustrates the argmin-reduction of a device vector
+ * of `int` data elements.
+ * @par
+ * @code
+ * #include <cub/cub.cuh>
+ * // or equivalently <cub/device/device_reduce.cuh>
+ *
+ * // Declare, allocate, and initialize device-accessible pointers
+ * // for input and output
+ * int num_items; // e.g., 7
+ * int *d_in;     // e.g., [8, 6, 7, 5, 3, 0, 9]
+ * KeyValuePair<int, int> *d_out; // e.g., [{-,-}]
+ * ...
+ *
+ * // Determine temporary device storage requirements
+ * void *d_temp_storage = NULL;
+ * size_t temp_storage_bytes = 0;
+ * cub::DeviceReduce::ArgMin(
+ *   d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+ *
+ * // Allocate temporary storage
+ * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+ *
+ * // Run argmin-reduction
+ * cub::DeviceReduce::ArgMin(
+ *   d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+ *
+ * // d_out <-- [{5, 0}]
+ *
+ * @endcode
+ *
+ * @tparam InputIteratorT
+ * **[inferred]** Random-access input iterator type for reading input items
+ * (of some type `T`) \iterator
+ *
+ * @tparam OutputIteratorT
+ * **[inferred]** Output iterator type for recording the reduced aggregate
+ * (having value type `cub::KeyValuePair<int, T>`) \iterator
+ *
+ * @param[in] d_temp_storage
+ * Device-accessible allocation of temporary storage. When `nullptr`, the
+ * required allocation size is written to `temp_storage_bytes` and no work
+ * is done.
+ *
+ * @param[in,out] temp_storage_bytes
+ * Reference to size in bytes of `d_temp_storage` allocation
+ *
+ * @param[in] d_in
+ * Pointer to the input sequence of data items
+ *
+ * @param[out] d_out
+ * Pointer to the output aggregate
+ *
+ * @param[in] num_items
+ * Total number of input items (i.e., length of `d_in`)
+ *
+ * @param[in] stream
+ * **[optional]** CUDA stream to launch kernels within.
+ * Default is stream0.
+ *
+ * @param[in] debug_synchronous
+ * **[optional]** Whether or not to synchronize the stream after every
+ * kernel launch to check for errors. Also causes launch configurations to
+ * be printed to the console. Default is `false`.
+ */
+ template <typename InputIteratorT, typename OutputIteratorT>
+ CUB_RUNTIME_FUNCTION static cudaError_t ArgMin(void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_in,
+ OutputIteratorT d_out,
+ int num_items,
+ cudaStream_t stream = 0,
+ bool debug_synchronous = false)
+ {
+ // Signed integer type for global offsets
+ using OffsetT = int;
+
+ // The input type
+ using InputValueT = cub::detail::value_t<InputIteratorT>;
+
+ // The output tuple type
+ using OutputTupleT =
+ cub::detail::non_void_value_t<OutputIteratorT, KeyValuePair<OffsetT, InputValueT>>;
+
+ // The output value type
+ using OutputValueT = typename OutputTupleT::Value;
+
+ // Wrapped input iterator to produce index-value tuples
+ using ArgIndexInputIteratorT =
+ ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT>;
+
+ ArgIndexInputIteratorT d_indexed_in(d_in);
+
+ // Initial value
+ OutputTupleT initial_value(1, Traits<InputValueT>::Max()); // replace with
+                                                            // std::numeric_limits<T>::max()
+                                                            // when C++11
+                                                            // support is
+                                                            // more prevalent
+
+ return DispatchReduce<ArgIndexInputIteratorT, OutputIteratorT, OffsetT, cub::ArgMin>::Dispatch(d_temp_storage,
+ temp_storage_bytes,
+ d_indexed_in,
+ d_out,
+ num_items,
+ cub::ArgMin(),
+ initial_value,
+ stream,
+ debug_synchronous);
+ }
+ /**
+ * @brief Computes a device-wide maximum using the greater-than ('>') operator.
+ *
+ * @par
+ * - Uses `std::numeric_limits<T>::lowest()` as the initial value of the
+ * reduction.
+ * - Does not support `>` operators that are non-commutative.
+ * - Provides "run-to-run" determinism for pseudo-associative reduction
+ * (e.g., addition of floating point types) on the same GPU device.
+ * However, results for pseudo-associative reduction may be inconsistent
+ * from one device to a another device of a different compute-capability
+ * because CUB can employ different tile-sizing for different architectures.
+ * - @devicestorage
+ *
+ * @par Snippet
+ * The code snippet below illustrates the max-reduction of a device vector of
+ * `int` data elements.
+ * @par
+ * @code
+ * #include <cub/cub.cuh>
+ * // or equivalently <cub/device/device_reduce.cuh>
+ *
+ * // Declare, allocate, and initialize device-accessible pointers
+ * // for input and output
+ * int num_items; // e.g., 7
+ * int *d_in;     // e.g., [8, 6, 7, 5, 3, 0, 9]
+ * int *d_out;    // e.g., [-]
+ * ...
+ *
+ * // Determine temporary device storage requirements
+ * void *d_temp_storage = NULL;
+ * size_t temp_storage_bytes = 0;
+ * cub::DeviceReduce::Max(
+ *   d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+ *
+ * // Allocate temporary storage
+ * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+ *
+ * // Run max-reduction
+ * cub::DeviceReduce::Max(
+ *   d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+ *
+ * // d_out <-- [9]
+ * @endcode
+ *
+ * @tparam InputIteratorT
+ * **[inferred]** Random-access input iterator type for reading input
+ * items \iterator
+ *
+ * @tparam OutputIteratorT
+ * **[inferred]** Output iterator type for recording the reduced
+ * aggregate \iterator
+ *
+ * @param[in] d_temp_storage
+ * Device-accessible allocation of temporary storage. When `nullptr`, the
+ * required allocation size is written to `temp_storage_bytes` and no work
+ * is done.
+ *
+ * @param[in,out] temp_storage_bytes
+ * Reference to size in bytes of `d_temp_storage` allocation
+ *
+ * @param[in] d_in
+ * Pointer to the input sequence of data items
+ *
+ * @param[out] d_out
+ * Pointer to the output aggregate
+ *
+ * @param[in] num_items
+ * Total number of input items (i.e., length of `d_in`)
+ *
+ * @param[in] stream
+ * **[optional]** CUDA stream to launch kernels within.
+ * Default is stream0.
+ *
+ * @param[in] debug_synchronous
+ * **[optional]** Whether or not to synchronize the stream after every
+ * kernel launch to check for errors. Also causes launch configurations to
+ * be printed to the console. Default is `false`.
+ *
+ */
+ template <typename InputIteratorT, typename OutputIteratorT>
+ CUB_RUNTIME_FUNCTION static cudaError_t Max(void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_in,
+ OutputIteratorT d_out,
+ int num_items,
+ cudaStream_t stream = 0,
+ bool debug_synchronous = false)
+ {
+ // Signed integer type for global offsets
+ using OffsetT = int;
+
+ // The input value type
+ using InputT = cub::detail::value_t<InputIteratorT>;
+
+ return DispatchReduce<InputIteratorT, OutputIteratorT, OffsetT, cub::Max>::
+ Dispatch(d_temp_storage,
+ temp_storage_bytes,
+ d_in,
+ d_out,
+ num_items,
+ cub::Max(),
+ Traits<InputT>::Lowest(), // replace with
+                           // std::numeric_limits<T>::lowest()
+                           // when C++11 support is more prevalent
+ stream,
+ debug_synchronous);
+ }
+
+ /**
+ * @brief Finds the first device-wide maximum using the greater-than ('>')
+ * operator, also returning the index of that item
+ *
+ * @par
+ * - The output value type of `d_out` is cub::KeyValuePair `<int, T>`
+ * (assuming the value type of `d_in` is `T`)
+ * - The maximum is written to `d_out.value` and its offset in the input
+ * array is written to `d_out.key`.
+ * - The `{1, std::numeric_limits<T>::lowest()}` tuple is produced for
+ * zero-length inputs
+ * - Does not support `>` operators that are non-commutative.
+ * - Provides "run-to-run" determinism for pseudo-associative reduction
+ * (e.g., addition of floating point types) on the same GPU device.
+ * However, results for pseudo-associative reduction may be inconsistent
+ * from one device to a another device of a different compute-capability
+ * because CUB can employ different tile-sizing for different architectures.
+ * - @devicestorage
+ *
+ * @par Snippet
+ * The code snippet below illustrates the argmax-reduction of a device vector
+ * of `int` data elements.
+ * @par
+ * @code
+ * #include <cub/cub.cuh>
+ * // or equivalently <cub/device/device_reduce.cuh>
+ *
+ * // Declare, allocate, and initialize device-accessible pointers
+ * // for input and output
+ * int num_items; // e.g., 7
+ * int *d_in;     // e.g., [8, 6, 7, 5, 3, 0, 9]
+ * KeyValuePair<int, int> *d_out; // e.g., [{-,-}]
+ * ...
+ *
+ * // Determine temporary device storage requirements
+ * void *d_temp_storage = NULL;
+ * size_t temp_storage_bytes = 0;
+ * cub::DeviceReduce::ArgMax(
+ *   d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+ *
+ * // Allocate temporary storage
+ * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+ *
+ * // Run argmax-reduction
+ * cub::DeviceReduce::ArgMax(
+ *   d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+ *
+ * // d_out <-- [{6, 9}]
+ *
+ * @endcode
+ *
+ * @tparam InputIteratorT
+ * **[inferred]** Random-access input iterator type for reading input items
+ * (of some type `T`) \iterator
+ *
+ * @tparam OutputIteratorT
+ * **[inferred]** Output iterator type for recording the reduced aggregate
+ * (having value type `cub::KeyValuePair<int, T>`) \iterator
+ *
+ * @param[in] d_temp_storage
+ * Device-accessible allocation of temporary storage. When `nullptr`, the
+ * required allocation size is written to `temp_storage_bytes` and no work
+ * is done.
+ *
+ * @param[in,out] temp_storage_bytes
+ * Reference to size in bytes of `d_temp_storage` allocation
+ *
+ * @param[in] d_in
+ * Pointer to the input sequence of data items
+ *
+ * @param[out] d_out
+ * Pointer to the output aggregate
+ *
+ * @param[in] num_items
+ * Total number of input items (i.e., length of `d_in`)
+ *
+ * @param[in] stream
+ * **[optional]** CUDA stream to launch kernels within.
+ * Default is stream0.
+ *
+ * @param[in] debug_synchronous
+ * **[optional]** Whether or not to synchronize the stream after every
+ * kernel launch to check for errors. Also causes launch configurations to
+ * be printed to the console. Default is `false`.
+ */
+ template <typename InputIteratorT, typename OutputIteratorT>
+ CUB_RUNTIME_FUNCTION static cudaError_t ArgMax(void *d_temp_storage,
+ size_t &temp_storage_bytes,
+ InputIteratorT d_in,
+ OutputIteratorT d_out,
+ int num_items,
+ cudaStream_t stream = 0,
+ bool debug_synchronous = false)
+ {
+ // Signed integer type for global offsets
+ using OffsetT = int;
+
+ // The input type
+ using InputValueT = cub::detail::value_t<InputIteratorT>;
+
+ // The output tuple type
+ using OutputTupleT =
+ cub::detail::non_void_value_t<OutputIteratorT, KeyValuePair<OffsetT, InputValueT>>;
+
+ // The output value type
+ using OutputValueT = typename OutputTupleT::Value;
+
+ // Wrapped input iterator to produce index-value tuples
+ using ArgIndexInputIteratorT =
+ ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT>;
+
+ ArgIndexInputIteratorT d_indexed_in(d_in);
+
+ // Initial value
+ OutputTupleT initial_value(1, Traits<InputValueT>::Lowest()); // replace
+                                                               // with
+                                                               // std::numeric_limits<T>::lowest()
+                                                               // when C++11
+                                                               // support is
+                                                               // more
+                                                               // prevalent
+
+ return DispatchReduce<ArgIndexInputIteratorT, OutputIteratorT, OffsetT, cub::ArgMax>::Dispatch(d_temp_storage,
+ temp_storage_bytes,
+ d_indexed_in,
+ d_out,
+ num_items,
+ cub::ArgMax(),
+ initial_value,
+ stream,
+ debug_synchronous);
+ }
+
+ /**
+ * @brief Reduces segments of values, where segments are demarcated by
+ * corresponding runs of identical keys.
+ *
+ * @par
+ * This operation computes segmented reductions within `d_values_in` using
+ * the specified binary `reduction_op` functor. The segments are identified
+ * by "runs" of corresponding keys in `d_keys_in`, where runs are maximal
+ * ranges of consecutive, identical keys. For the *i*th run
+ * encountered, the first key of the run and the corresponding value
+ * aggregate of that run are written to `d_unique_out[i]` and
+ * `d_aggregates_out[i]`, respectively. The total number of runs encountered
+ * is written to `d_num_runs_out`.
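+ *
+ * @par
+ * As an illustrative sketch only (not the library's implementation), the
+ * result is equivalent to the following sequential loop, where `op` stands
+ * in for the `reduction_op` functor:
+ * @code
+ * int num_runs = 0;
+ * for (int i = 0; i < num_items; ++i)
+ * {
+ *     if (i == 0 || !(d_keys_in[i] == d_keys_in[i - 1]))
+ *     {
+ *         // First item of a new run: record its key and seed the aggregate
+ *         d_unique_out[num_runs] = d_keys_in[i];
+ *         d_aggregates_out[num_runs] = d_values_in[i];
+ *         ++num_runs;
+ *     }
+ *     else
+ *     {
+ *         // Same run: fold the value into the current aggregate
+ *         d_aggregates_out[num_runs - 1] =
+ *             op(d_aggregates_out[num_runs - 1], d_values_in[i]);
+ *     }
+ * }
+ * *d_num_runs_out = num_runs;
+ * @endcode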
+ *
+ * @par
+ * - The `==` equality operator is used to determine whether keys are
+ * equivalent
+ * - Provides "run-to-run" determinism for pseudo-associative reduction
+ * (e.g., addition of floating point types) on the same GPU device.
+ * However, results for pseudo-associative reduction may be inconsistent
+ * from one device to a another device of a different compute-capability
+ * because CUB can employ different tile-sizing for different architectures.
+ * - @devicestorage
+ *
+ * @par Performance
+ * The following chart illustrates reduction-by-key (sum) performance across
+ * different CUDA architectures for `fp32` and `fp64` values, respectively.
+ * Segments are identified by `int32` keys, and have lengths uniformly
+ * sampled from `[1, 1000]`.
+ *
+ * @image html reduce_by_key_fp32_len_500.png
+ * @image html reduce_by_key_fp64_len_500.png
+ *
+ * @par
+ * The following charts are similar, but with segment lengths uniformly
+ * sampled from [1,10]:
+ *
+ * @image html reduce_by_key_fp32_len_5.png
+ * @image html reduce_by_key_fp64_len_5.png
+ *
+ * @par Snippet
+ * The code snippet below illustrates the segmented reduction of `int` values
+ * grouped by runs of associated `int` keys.
+ * @par
+ * @code
+ * #include <cub/cub.cuh>
+ * // or equivalently <cub/device/device_reduce.cuh>
+ *
+ * // CustomMin functor
+ * struct CustomMin
+ * {
+ *     template <typename T>
+ *     CUB_RUNTIME_FUNCTION __forceinline__
+ *     T operator()(const T &a, const T &b) const {
+ *         return (b < a) ? b : a;
+ *     }
+ * };
+ *
+ * // Declare, allocate, and initialize device-accessible pointers
+ * // for input and output
+ * int num_items;          // e.g., 8
+ * int *d_keys_in;         // e.g., [0, 2, 2, 9, 5, 5, 5, 8]
+ * int *d_values_in;       // e.g., [0, 7, 1, 6, 2, 5, 3, 4]
+ * int *d_unique_out;      // e.g., [-, -, -, -, -, -, -, -]
+ * int *d_aggregates_out;  // e.g., [-, -, -, -, -, -, -, -]
+ * int *d_num_runs_out;    // e.g., [-]
+ * CustomMin reduction_op;
+ * ...
+ *
+ * // Determine temporary device storage requirements
+ * void *d_temp_storage = NULL;
+ * size_t temp_storage_bytes = 0;
+ * cub::DeviceReduce::ReduceByKey(
+ *   d_temp_storage, temp_storage_bytes,
+ *   d_keys_in, d_unique_out, d_values_in,
+ *   d_aggregates_out, d_num_runs_out, reduction_op, num_items);
+ *
+ * // Allocate temporary storage
+ * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+ *
+ * // Run reduce-by-key
+ * cub::DeviceReduce::ReduceByKey(
+ *   d_temp_storage, temp_storage_bytes,
+ *   d_keys_in, d_unique_out, d_values_in,
+ *   d_aggregates_out, d_num_runs_out, reduction_op, num_items);
+ *
+ * // d_unique_out      <-- [0, 2, 9, 5, 8]
+ * // d_aggregates_out  <-- [0, 1, 6, 2, 4]
+ * // d_num_runs_out    <-- [5]
+ * @endcode
+ *
+ * @tparam KeysInputIteratorT
+ * **[inferred]** Random-access input iterator type for reading input
+ * keys \iterator
+ *
+ * @tparam UniqueOutputIteratorT
+ * **[inferred]** Random-access output iterator type for writing unique
+ * output keys \iterator
+ *
+ * @tparam ValuesInputIteratorT
+ * **[inferred]** Random-access input iterator type for reading input
+ * values \iterator
+ *
+ * @tparam AggregatesOutputIteratorT
+ * **[inferred]** Random-access output iterator type for writing output
+ * value aggregates \iterator
+ *
+ * @tparam NumRunsOutputIteratorT
+ * **[inferred]** Output iterator type for recording the number of runs
+ * encountered \iterator
+ *
+ * @tparam ReductionOpT
+ * **[inferred]** Binary reduction functor type having member
+ * `T operator()(const T &a, const T &b)`
+ *
+ * @param[in] d_temp_storage
+ * Device-accessible allocation of temporary storage. When `nullptr`, the
+ * required allocation size is written to `temp_storage_bytes` and no work
+ * is done.
+ *
+ * @param[in,out] temp_storage_bytes
+ *   Reference to size in bytes of `d_temp_storage` allocation
+ *
+ * @param[in] d_keys_in
+ *   Pointer to the input sequence of keys
+ *
+ * @param[out] d_unique_out
+ *   Pointer to the output sequence of unique keys (one key per run)
+ *
+ * @param[in] d_values_in
+ *   Pointer to the input sequence of corresponding values
+ *
+ * @param[out] d_aggregates_out
+ *   Pointer to the output sequence of value aggregates
+ *   (one aggregate per run)
+ *
+ * @param[out] d_num_runs_out
+ *   Pointer to total number of runs encountered
+ *   (i.e., the length of `d_unique_out`)
+ *
+ * @param[in] reduction_op
+ *   Binary reduction functor
+ *
+ * @param[in] num_items
+ *   Total number of associated key+value pairs
+ *   (i.e., the length of `d_keys_in` and `d_values_in`)
+ *
+ * @param[in] stream
+ *   **[optional]** CUDA stream to launch kernels within.
+ *   Default is stream0.
+ *
+ * @param[in] debug_synchronous
+ *   **[optional]** Whether or not to synchronize the stream after every
+ *   kernel launch to check for errors. May cause significant slowdown.
+ *   Default is `false`.
+ */
+ template <typename KeysInputIteratorT,
+           typename UniqueOutputIteratorT,
+           typename ValuesInputIteratorT,
+           typename AggregatesOutputIteratorT,
+           typename NumRunsOutputIteratorT,
+           typename ReductionOpT>
+ CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t
+ ReduceByKey(void *d_temp_storage,
+             size_t &temp_storage_bytes,
+             KeysInputIteratorT d_keys_in,
+             UniqueOutputIteratorT d_unique_out,
+             ValuesInputIteratorT d_values_in,
+             AggregatesOutputIteratorT d_aggregates_out,
+             NumRunsOutputIteratorT d_num_runs_out,
+             ReductionOpT reduction_op,
+             int num_items,
+             cudaStream_t stream = 0,
+             bool debug_synchronous = false)
+ {
+   // Signed integer type for global offsets
+   using OffsetT = int;
+
+   // FlagT iterator type (not used)
+
+   // Selection op (not used)
+
+   // Default == operator
+   typedef Equality EqualityOp;
+
+   return DispatchReduceByKey<KeysInputIteratorT,
+                              UniqueOutputIteratorT,
+                              ValuesInputIteratorT,
+                              AggregatesOutputIteratorT,
+                              NumRunsOutputIteratorT,
+                              EqualityOp,
+                              ReductionOpT,
+                              OffsetT>::Dispatch(d_temp_storage,
+                                                 temp_storage_bytes,
+                                                 d_keys_in,
+                                                 d_unique_out,
+                                                 d_values_in,
+                                                 d_aggregates_out,
+                                                 d_num_runs_out,
+                                                 EqualityOp(),
+                                                 reduction_op,
+                                                 num_items,
+                                                 stream,
+                                                 debug_synchronous);
+ }
 };
 
 /**
- * \example example_device_reduce.cu
+ * @example example_device_reduce.cu
  */
 
 CUB_NAMESPACE_END
-

From fd10729453ac4ddf4eee0a626e8f3dd6df04cadc Mon Sep 17 00:00:00 2001
From: Georgy Evtushenko
Date: Tue, 31 May 2022 16:00:29 +0400
Subject: [PATCH 2/4] Document in-place guarantees for reduce

---
 cub/device/device_reduce.cuh | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/cub/device/device_reduce.cuh b/cub/device/device_reduce.cuh
index 98335521da..9378604af1 100644
--- a/cub/device/device_reduce.cuh
+++ b/cub/device/device_reduce.cuh
@@ -93,6 +93,7 @@ struct DeviceReduce
  *   However, results for pseudo-associative reduction may be inconsistent
  *   from one device to another device of a different compute-capability
  *   because CUB can employ different tile-sizing for different architectures.
+ * - The range `[d_in, d_in + num_items)` shall not overlap `d_out`.
  * - @devicestorage
  *
  * @par Snippet
@@ -230,6 +231,7 @@ struct DeviceReduce
  *   However, results for pseudo-associative reduction may be inconsistent
  *   from one device to another device of a different compute-capability
  *   because CUB can employ different tile-sizing for different architectures.
+ * - The range `[d_in, d_in + num_items)` shall not overlap `d_out`.
  * - @devicestorage
  *
  * @par Performance
@@ -345,6 +347,7 @@ struct DeviceReduce
  *   However, results for pseudo-associative reduction may be inconsistent
  *   from one device to another device of a different compute-capability
  *   because CUB can employ different tile-sizing for different architectures.
+ * - The range `[d_in, d_in + num_items)` shall not overlap `d_out`.
  * - @devicestorage
  *
  * @par Snippet
@@ -459,6 +462,7 @@ struct DeviceReduce
  *   However, results for pseudo-associative reduction may be inconsistent
  *   from one device to another device of a different compute-capability
  *   because CUB can employ different tile-sizing for different architectures.
+ * - The range `[d_in, d_in + num_items)` shall not overlap `d_out`.
  * - @devicestorage
  *
  * @par Snippet
@@ -588,6 +592,7 @@ struct DeviceReduce
  *   However, results for pseudo-associative reduction may be inconsistent
  *   from one device to another device of a different compute-capability
  *   because CUB can employ different tile-sizing for different architectures.
+ * - The range `[d_in, d_in + num_items)` shall not overlap `d_out`.
  * - @devicestorage
  *
  * @par Snippet
@@ -702,6 +707,7 @@ struct DeviceReduce
  *   However, results for pseudo-associative reduction may be inconsistent
  *   from one device to another device of a different compute-capability
  *   because CUB can employ different tile-sizing for different architectures.
+ * - The range `[d_in, d_in + num_items)` shall not overlap `d_out`.
  * - @devicestorage
  *
  * @par Snippet
@@ -844,6 +850,12 @@ struct DeviceReduce
  *   However, results for pseudo-associative reduction may be inconsistent
  *   from one device to another device of a different compute-capability
  *   because CUB can employ different tile-sizing for different architectures.
+ * - Let `out` be any of the ranges
+ *   `[d_unique_out, d_unique_out + *d_num_runs_out)`,
+ *   `[d_aggregates_out, d_aggregates_out + *d_num_runs_out)`, or
+ *   `d_num_runs_out`. The ranges represented by `out` shall not overlap
+ *   `[d_keys_in, d_keys_in + num_items)`,
+ *   `[d_values_in, d_values_in + num_items)`, nor each other in any way.
  * - @devicestorage
  *
  * @par Performance

From ecf6e7632b93535040ff9da989e1a84a30e43c06 Mon Sep 17 00:00:00 2001
From: Georgy Evtushenko
Date: Tue, 31 May 2022 17:41:56 +0400
Subject: [PATCH 3/4] Reformat segmented reduce docs

---
 cub/device/device_segmented_reduce.cuh | 1559 +++++++++++++++---------
 1 file changed, 970 insertions(+), 589 deletions(-)

diff --git a/cub/device/device_segmented_reduce.cuh b/cub/device/device_segmented_reduce.cuh
index 78a4c4f10c..fd830efea6 100644
--- a/cub/device/device_segmented_reduce.cuh
+++ b/cub/device/device_segmented_reduce.cuh
@@ -1,7 +1,6 @@
-
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill. All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -14,10 +13,10 @@
  *     names of its contributors may be used to endorse or promote products
  *     derived from this software without specific prior written permission.
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
@@ -28,606 +27,988 @@
 ******************************************************************************/
 
 /**
- * \file
- * cub::DeviceSegmentedReduce provides device-wide, parallel operations for computing a batched reduction across multiple sequences of data items residing within device-accessible memory.
+ * @file cub::DeviceSegmentedReduce provides device-wide, parallel operations
+ *       for computing a batched reduction across multiple sequences of data
+ *       items residing within device-accessible memory.
  */
 
 #pragma once
 
-#include <stdio.h>
 #include <iterator>
 
-#include "../iterator/arg_index_input_iterator.cuh"
-#include "dispatch/dispatch_reduce.cuh"
-#include "dispatch/dispatch_reduce_by_key.cuh"
-#include "../config.cuh"
-#include "../util_type.cuh"
+#include <cub/config.cuh>
+#include <cub/device/dispatch/dispatch_reduce.cuh>
+#include <cub/device/dispatch/dispatch_reduce_by_key.cuh>
+#include <cub/iterator/arg_index_input_iterator.cuh>
+#include <cub/util_type.cuh>
 
 CUB_NAMESPACE_BEGIN
 
 /**
- * \brief DeviceSegmentedReduce provides device-wide, parallel operations for computing a reduction across multiple sequences of data items residing within device-accessible memory. ![](reduce_logo.png)
- * \ingroup SegmentedModule
+ * @brief DeviceSegmentedReduce provides device-wide, parallel operations for
+ *        computing a reduction across multiple sequences of data items
+ *        residing within device-accessible memory. ![](reduce_logo.png)
+ * @ingroup SegmentedModule
  *
- * \par Overview
- * A reduction (or fold)
- * uses a binary combining operator to compute a single aggregate from a sequence of input elements.
+ * @par Overview
+ * A *reduction*
+ * (or *fold*) uses a binary combining operator to compute a single aggregate
+ * from a sequence of input elements.
  *
- * \par Usage Considerations
- * \cdp_class{DeviceSegmentedReduce}
+ * @par Usage Considerations
+ * @cdp_class{DeviceSegmentedReduce}
  *
  */
 struct DeviceSegmentedReduce
 {
- /**
- * \brief Computes a device-wide segmented reduction using the specified binary \p reduction_op functor.
- *
- * \par
- * - Does not support binary reduction operators that are non-commutative.
- * - Provides "run-to-run" determinism for pseudo-associative reduction
- *   (e.g., addition of floating point types) on the same GPU device.
- *   However, results for pseudo-associative reduction may be inconsistent
- *   from one device to a another device of a different compute-capability
- *   because CUB can employ different tile-sizing for different architectures.
- * - When input a contiguous sequence of segments, a single sequence
- *   \p segment_offsets (of length num_segments+1) can be aliased
- *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
- *   the latter is specified as segment_offsets+1).
- * - \devicestorage
- *
- * \par Snippet
- * The code snippet below illustrates a custom min-reduction of a device vector of \p int data elements.
- * \par
- * \code
- * #include <cub/cub.cuh>   // or equivalently <cub/device/device_segmented_reduce.cuh>
- *
- * // CustomMin functor
- * struct CustomMin
- * {
- *     template <typename T>
- *     CUB_RUNTIME_FUNCTION __forceinline__
- *     T operator()(const T &a, const T &b) const {
- *         return (b < a) ?
b : a; - * } - * }; - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_out; // e.g., [-, -, -] - * CustomMin min_op; - * int initial_value; // e.g., INT_MAX - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, - * num_segments, d_offsets, d_offsets + 1, min_op, initial_value); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run reduction - * cub::DeviceSegmentedReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, - * num_segments, d_offsets, d_offsets + 1, min_op, initial_value); - * - * // d_out <-- [6, INT_MAX, 0] - * - * \endcode - * - * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator - * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate \iterator - * \tparam BeginOffsetIteratorT [inferred] Random-access input iterator type for reading segment beginning offsets \iterator - * \tparam EndOffsetIteratorT [inferred] Random-access input iterator type for reading segment ending offsets \iterator - * \tparam ReductionOp [inferred] Binary reduction functor type having member T operator()(const T &a, const T &b) - * \tparam T [inferred] Data element type that is convertible to the \p value type of \p InputIteratorT - */ - template < - typename InputIteratorT, - typename OutputIteratorT, - typename BeginOffsetIteratorT, - typename EndOffsetIteratorT, - typename ReductionOp, - typename T> - CUB_RUNTIME_FUNCTION - static cudaError_t Reduce( - void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output aggregate - int num_segments, ///< [in] The number of segments that comprise the sorting data - BeginOffsetIteratorT d_begin_offsets, ///< [in] Random-access input iterator to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - EndOffsetIteratorT d_end_offsets, ///< [in] Random-access input iterator to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. - ReductionOp reduction_op, ///< [in] Binary reduction functor - T initial_value, ///< [in] Initial value of the reduction for each segment - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
- { - // Signed integer type for global offsets - typedef int OffsetT; - - return DispatchSegmentedReduce::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - num_segments, - d_begin_offsets, - d_end_offsets, - reduction_op, - initial_value, - stream, - debug_synchronous); - } - - - /** - * \brief Computes a device-wide segmented sum using the addition ('+') operator. - * - * \par - * - Uses \p 0 as the initial value of the reduction for each segment. - * - When input a contiguous sequence of segments, a single sequence - * \p segment_offsets (of length num_segments+1) can be aliased - * for both the \p d_begin_offsets and \p d_end_offsets parameters (where - * the latter is specified as segment_offsets+1). - * - Does not support \p + operators that are non-commutative.. - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the sum reduction of a device vector of \p int data elements. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_out; // e.g., [-, -, -] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, - * num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sum-reduction - * cub::DeviceSegmentedReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, - * num_segments, d_offsets, d_offsets + 1); - * - * // d_out <-- [21, 0, 17] - * - * \endcode - * - * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator - * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate \iterator - * \tparam BeginOffsetIteratorT [inferred] Random-access input iterator type for reading segment beginning offsets \iterator - * \tparam EndOffsetIteratorT [inferred] Random-access input iterator type for reading segment ending offsets \iterator - */ - template < - typename InputIteratorT, - typename OutputIteratorT, - typename BeginOffsetIteratorT, - typename EndOffsetIteratorT> - CUB_RUNTIME_FUNCTION - static cudaError_t Sum( - void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output aggregate - int num_segments, ///< [in] The number of segments that comprise the sorting data - BeginOffsetIteratorT d_begin_offsets, ///< [in] Random-access input iterator to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - EndOffsetIteratorT d_end_offsets, ///< [in] Random-access input iterator to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. 
- cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. - { - // Signed integer type for global offsets - typedef int OffsetT; - - // The output value type - using OutputT = - cub::detail::non_void_value_t>; - - return DispatchSegmentedReduce::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - num_segments, - d_begin_offsets, - d_end_offsets, - cub::Sum(), - OutputT(), // zero-initialize - stream, - debug_synchronous); - } - - - /** - * \brief Computes a device-wide segmented minimum using the less-than ('<') operator. - * - * \par - * - Uses std::numeric_limits::max() as the initial value of the reduction for each segment. - * - When input a contiguous sequence of segments, a single sequence - * \p segment_offsets (of length num_segments+1) can be aliased - * for both the \p d_begin_offsets and \p d_end_offsets parameters (where - * the latter is specified as segment_offsets+1). - * - Does not support \p < operators that are non-commutative. - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the min-reduction of a device vector of \p int data elements. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_out; // e.g., [-, -, -] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out, - * num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run min-reduction - * cub::DeviceSegmentedReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out, - * num_segments, d_offsets, d_offsets + 1); - * - * // d_out <-- [6, INT_MAX, 0] - * - * \endcode - * - * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator - * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate \iterator - * \tparam BeginOffsetIteratorT [inferred] Random-access input iterator type for reading segment beginning offsets \iterator - * \tparam EndOffsetIteratorT [inferred] Random-access input iterator type for reading segment ending offsets \iterator - */ - template < - typename InputIteratorT, - typename OutputIteratorT, - typename BeginOffsetIteratorT, - typename EndOffsetIteratorT> - CUB_RUNTIME_FUNCTION - static cudaError_t Min( - void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
- size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output aggregate - int num_segments, ///< [in] The number of segments that comprise the sorting data - BeginOffsetIteratorT d_begin_offsets, ///< [in] Random-access input iterator to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - EndOffsetIteratorT d_end_offsets, ///< [in] Random-access input iterator to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. - { - // Signed integer type for global offsets - using OffsetT = int; - - // The input value type - using InputT = cub::detail::value_t; - - return DispatchSegmentedReduce::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - num_segments, - d_begin_offsets, - d_end_offsets, - cub::Min(), - Traits::Max(), // replace with std::numeric_limits::max() when C++11 support is more prevalent - stream, - debug_synchronous); - } - - - /** - * \brief Finds the first device-wide minimum in each segment using the less-than ('<') operator, also returning the in-segment index of that item. - * - * \par - * - The output value type of \p d_out is cub::KeyValuePair (assuming the value type of \p d_in is \p T) - * - The minimum of the ith segment is written to d_out[i].value and its offset in that segment is written to d_out[i].key. - * - The {1, std::numeric_limits::max()} tuple is produced for zero-length inputs - * - When input a contiguous sequence of segments, a single sequence - * \p segment_offsets (of length num_segments+1) can be aliased - * for both the \p d_begin_offsets and \p d_end_offsets parameters (where - * the latter is specified as segment_offsets+1). - * - Does not support \p < operators that are non-commutative. - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the argmin-reduction of a device vector of \p int data elements. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * KeyValuePair *d_out; // e.g., [{-,-}, {-,-}, {-,-}] - * ... 
- * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_out, - * num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run argmin-reduction - * cub::DeviceSegmentedReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_out, - * num_segments, d_offsets, d_offsets + 1); - * - * // d_out <-- [{1,6}, {1,INT_MAX}, {2,0}] - * - * \endcode - * - * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items (of some type \p T) \iterator - * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate (having value type KeyValuePair) \iterator - * \tparam BeginOffsetIteratorT [inferred] Random-access input iterator type for reading segment beginning offsets \iterator - * \tparam EndOffsetIteratorT [inferred] Random-access input iterator type for reading segment ending offsets \iterator - */ - template < - typename InputIteratorT, - typename OutputIteratorT, - typename BeginOffsetIteratorT, - typename EndOffsetIteratorT> - CUB_RUNTIME_FUNCTION - static cudaError_t ArgMin( - void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output aggregate - int num_segments, ///< [in] The number of segments that comprise the sorting data - BeginOffsetIteratorT d_begin_offsets, ///< [in] Random-access input iterator to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - EndOffsetIteratorT d_end_offsets, ///< [in] Random-access input iterator to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
- { - // Signed integer type for global offsets - using OffsetT = int; - - // The input type - using InputValueT = cub::detail::value_t; - - // The output tuple type - using OutputTupleT = - cub::detail::non_void_value_t>; - - // The output value type - using OutputValueT = typename OutputTupleT::Value; - - // Wrapped input iterator to produce index-value tuples - using ArgIndexInputIteratorT = - ArgIndexInputIterator; - - ArgIndexInputIteratorT d_indexed_in(d_in); - - // Initial value - OutputTupleT initial_value(1, Traits::Max()); // replace with std::numeric_limits::max() when C++11 support is more prevalent - - return DispatchSegmentedReduce::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_indexed_in, - d_out, - num_segments, - d_begin_offsets, - d_end_offsets, - cub::ArgMin(), - initial_value, - stream, - debug_synchronous); - } - - - /** - * \brief Computes a device-wide segmented maximum using the greater-than ('>') operator. - * - * \par - * - Uses std::numeric_limits::lowest() as the initial value of the reduction. - * - When input a contiguous sequence of segments, a single sequence - * \p segment_offsets (of length num_segments+1) can be aliased - * for both the \p d_begin_offsets and \p d_end_offsets parameters (where - * the latter is specified as segment_offsets+1). - * - Does not support \p > operators that are non-commutative. - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the max-reduction of a device vector of \p int data elements. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_out; // e.g., [-, -, -] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_out, - * num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run max-reduction - * cub::DeviceSegmentedReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_out, - * num_segments, d_offsets, d_offsets + 1); - * - * // d_out <-- [8, INT_MIN, 9] - * - * \endcode - * - * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator - * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate \iterator - * \tparam BeginOffsetIteratorT [inferred] Random-access input iterator type for reading segment beginning offsets \iterator - * \tparam EndOffsetIteratorT [inferred] Random-access input iterator type for reading segment ending offsets \iterator - */ - template < - typename InputIteratorT, - typename OutputIteratorT, - typename BeginOffsetIteratorT, - typename EndOffsetIteratorT> - CUB_RUNTIME_FUNCTION - static cudaError_t Max( - void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
- size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output aggregate - int num_segments, ///< [in] The number of segments that comprise the sorting data - BeginOffsetIteratorT d_begin_offsets, ///< [in] Random-access input iterator to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - EndOffsetIteratorT d_end_offsets, ///< [in] Random-access input iterator to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. - { - // Signed integer type for global offsets - using OffsetT = int; - - // The input value type - using InputT = cub::detail::value_t; - - return DispatchSegmentedReduce::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - num_segments, - d_begin_offsets, - d_end_offsets, - cub::Max(), - Traits::Lowest(), // replace with std::numeric_limits::lowest() when C++11 support is more prevalent - stream, - debug_synchronous); - } - - - /** - * \brief Finds the first device-wide maximum in each segment using the greater-than ('>') operator, also returning the in-segment index of that item - * - * \par - * - The output value type of \p d_out is cub::KeyValuePair (assuming the value type of \p d_in is \p T) - * - The maximum of the ith segment is written to d_out[i].value and its offset in that segment is written to d_out[i].key. - * - The {1, std::numeric_limits::lowest()} tuple is produced for zero-length inputs - * - When input a contiguous sequence of segments, a single sequence - * \p segment_offsets (of length num_segments+1) can be aliased - * for both the \p d_begin_offsets and \p d_end_offsets parameters (where - * the latter is specified as segment_offsets+1). - * - Does not support \p > operators that are non-commutative. - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the argmax-reduction of a device vector of \p int data elements. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * KeyValuePair *d_out; // e.g., [{-,-}, {-,-}, {-,-}] - * ... 
- * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_out, - * num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run argmax-reduction - * cub::DeviceSegmentedReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_out, - * num_segments, d_offsets, d_offsets + 1); - * - * // d_out <-- [{0,8}, {1,INT_MIN}, {3,9}] - * - * \endcode - * - * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items (of some type \p T) \iterator - * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate (having value type KeyValuePair) \iterator - * \tparam BeginOffsetIteratorT [inferred] Random-access input iterator type for reading segment beginning offsets \iterator - * \tparam EndOffsetIteratorT [inferred] Random-access input iterator type for reading segment ending offsets \iterator - */ - template < - typename InputIteratorT, - typename OutputIteratorT, - typename BeginOffsetIteratorT, - typename EndOffsetIteratorT> - CUB_RUNTIME_FUNCTION - static cudaError_t ArgMax( - void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output aggregate - int num_segments, ///< [in] The number of segments that comprise the sorting data - BeginOffsetIteratorT d_begin_offsets, ///< [in] Random-access input iterator to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - EndOffsetIteratorT d_end_offsets, ///< [in] Random-access input iterator to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
- {
-   // Signed integer type for global offsets
-   using OffsetT = int;
-
-   // The input type
-   using InputValueT = cub::detail::value_t<InputIteratorT>;
-
-   // The output tuple type
-   using OutputTupleT =
-     cub::detail::non_void_value_t<OutputIteratorT,
-                                   KeyValuePair<OffsetT, InputValueT>>;
-
-   // The output value type
-   using OutputValueT = typename OutputTupleT::Value;
-
-   // Wrapped input iterator to produce index-value tuples
-   using ArgIndexInputIteratorT =
-     ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT>;
-
-   ArgIndexInputIteratorT d_indexed_in(d_in);
-
-   // Initial value
-   OutputTupleT initial_value(1, Traits<InputValueT>::Lowest()); // replace with std::numeric_limits<T>::lowest() when C++11 support is more prevalent
-
-   return DispatchSegmentedReduce<ArgIndexInputIteratorT, OutputIteratorT, BeginOffsetIteratorT, EndOffsetIteratorT, OffsetT, cub::ArgMax>::Dispatch(
-     d_temp_storage,
-     temp_storage_bytes,
-     d_indexed_in,
-     d_out,
-     num_segments,
-     d_begin_offsets,
-     d_end_offsets,
-     cub::ArgMax(),
-     initial_value,
-     stream,
-     debug_synchronous);
- }
-
+ /**
+ * @brief Computes a device-wide segmented reduction using the specified
+ *        binary `reduction_op` functor.
+ *
+ * @par
+ * - Does not support binary reduction operators that are non-commutative.
+ * - Provides "run-to-run" determinism for pseudo-associative reduction
+ *   (e.g., addition of floating point types) on the same GPU device.
+ *   However, results for pseudo-associative reduction may be inconsistent
+ *   from one device to another device of a different compute-capability
+ *   because CUB can employ different tile-sizing for different architectures.
+ * - When the input is a contiguous sequence of segments, a single sequence
+ *   `segment_offsets` (of length `num_segments + 1`) can be aliased
+ *   for both the `d_begin_offsets` and `d_end_offsets` parameters (where
+ *   the latter is specified as `segment_offsets + 1`).
+ * - @devicestorage
+ *
+ * @par Snippet
+ * The code snippet below illustrates a custom min-reduction of a device
+ * vector of `int` data elements.
+ * @par
+ * @code
+ * #include <cub/cub.cuh>
+ * // or equivalently <cub/device/device_segmented_reduce.cuh>
+ *
+ * // CustomMin functor
+ * struct CustomMin
+ * {
+ *     template <typename T>
+ *     CUB_RUNTIME_FUNCTION __forceinline__
+ *     T operator()(const T &a, const T &b) const {
+ *         return (b < a) ? b : a;
+ *     }
+ * };
+ *
+ * // Declare, allocate, and initialize device-accessible pointers
+ * // for input and output
+ * int num_segments;   // e.g., 3
+ * int *d_offsets;     // e.g., [0, 3, 3, 7]
+ * int *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
+ * int *d_out;         // e.g., [-, -, -]
+ * CustomMin min_op;
+ * int initial_value;  // e.g., INT_MAX
+ * ...
+ *
+ * // Determine temporary device storage requirements
+ * void *d_temp_storage = NULL;
+ * size_t temp_storage_bytes = 0;
+ * cub::DeviceSegmentedReduce::Reduce(
+ *   d_temp_storage, temp_storage_bytes, d_in, d_out,
+ *   num_segments, d_offsets, d_offsets + 1, min_op, initial_value);
+ *
+ * // Allocate temporary storage
+ * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+ *
+ * // Run reduction
+ * cub::DeviceSegmentedReduce::Reduce(
+ *   d_temp_storage, temp_storage_bytes, d_in, d_out,
+ *   num_segments, d_offsets, d_offsets + 1, min_op, initial_value);
+ *
+ * // d_out <-- [6, INT_MAX, 0]
+ * @endcode
+ *
+ * @tparam InputIteratorT
+ *   **[inferred]** Random-access input iterator type for reading input
+ *   items \iterator
+ *
+ * @tparam OutputIteratorT
+ *   **[inferred]** Output iterator type for recording the reduced
+ *   aggregate \iterator
+ *
+ * @tparam BeginOffsetIteratorT
+ *   **[inferred]** Random-access input iterator type for reading segment
+ *   beginning offsets \iterator
+ *
+ * @tparam EndOffsetIteratorT
+ *   **[inferred]** Random-access input iterator type for reading segment
+ *   ending offsets \iterator
+ *
+ * @tparam ReductionOp
+ *   **[inferred]** Binary reduction functor type having member
+ *   `T operator()(const T &a, const T &b)`
+ *
+ * @tparam T
+ *   **[inferred]** Data element type that is convertible to the `value` type
+ *   of `InputIteratorT`
+ *
+ * @param[in] d_temp_storage
+ *   Device-accessible allocation of temporary storage. When `nullptr`, the
+ *   required allocation size is written to `temp_storage_bytes` and no
+ *   work is done.
+ *
+ * @param[in,out] temp_storage_bytes
+ *   Reference to size in bytes of `d_temp_storage` allocation
+ *
+ * @param[in] d_in
+ *   Pointer to the input sequence of data items
+ *
+ * @param[out] d_out
+ *   Pointer to the output aggregate
+ *
+ * @param[in] num_segments
+ *   The number of segments that comprise the segmented reduction data
+ *
+ * @param[in] d_begin_offsets
+ *   Random-access input iterator to the sequence of beginning offsets of
+ *   length `num_segments`, such that `d_begin_offsets[i]` is the first
+ *   element of the *i*th data segment in `d_in`
+ *
+ * @param[in] d_end_offsets
+ *   Random-access input iterator to the sequence of ending offsets of length
+ *   `num_segments`, such that `d_end_offsets[i] - 1` is the last element of
+ *   the *i*th data segment in `d_in`. If
+ *   `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th segment is
+ *   considered empty.
+ *
+ * @param[in] reduction_op
+ *   Binary reduction functor
+ *
+ * @param[in] initial_value
+ *   Initial value of the reduction for each segment
+ *
+ * @param[in] stream
+ *   **[optional]** CUDA stream to launch kernels within.
+ *   Default is stream0.
+ *
+ * @param[in] debug_synchronous
+ *   **[optional]** Whether or not to synchronize the stream after every
+ *   kernel launch to check for errors. Also causes launch configurations to
+ *   be printed to the console. Default is `false`.
+ */
+ template <typename InputIteratorT,
+           typename OutputIteratorT,
+           typename BeginOffsetIteratorT,
+           typename EndOffsetIteratorT,
+           typename ReductionOp,
+           typename T>
+ CUB_RUNTIME_FUNCTION static cudaError_t
+ Reduce(void *d_temp_storage,
+        size_t &temp_storage_bytes,
+        InputIteratorT d_in,
+        OutputIteratorT d_out,
+        int num_segments,
+        BeginOffsetIteratorT d_begin_offsets,
+        EndOffsetIteratorT d_end_offsets,
+        ReductionOp reduction_op,
+        T initial_value,
+        cudaStream_t stream = 0,
+        bool debug_synchronous = false)
+ {
+   // Signed integer type for global offsets
+   using OffsetT = int;
+
+   return DispatchSegmentedReduce<InputIteratorT,
+                                  OutputIteratorT,
+                                  BeginOffsetIteratorT,
+                                  EndOffsetIteratorT,
+                                  OffsetT,
+                                  ReductionOp>::Dispatch(d_temp_storage,
+                                                         temp_storage_bytes,
+                                                         d_in,
+                                                         d_out,
+                                                         num_segments,
+                                                         d_begin_offsets,
+                                                         d_end_offsets,
+                                                         reduction_op,
+                                                         initial_value,
+                                                         stream,
+                                                         debug_synchronous);
+ }
+
+ /**
+ * @brief Computes a device-wide segmented sum using the addition (`+`)
+ *        operator.
+ *
+ * @par
+ * - Uses `0` as the initial value of the reduction for each segment.
+ * - When the input is a contiguous sequence of segments, a single sequence
+ *   `segment_offsets` (of length `num_segments + 1`) can be aliased
+ *   for both the `d_begin_offsets` and `d_end_offsets` parameters (where
+ *   the latter is specified as `segment_offsets + 1`).
+ * - Does not support `+` operators that are non-commutative.
+ * - @devicestorage
+ *
+ * @par Snippet
+ * The code snippet below illustrates the sum reduction of a device vector of
+ * `int` data elements.
+ * @par
+ * @code
+ * #include <cub/cub.cuh>
+ * // or equivalently <cub/device/device_segmented_reduce.cuh>
+ *
+ * // Declare, allocate, and initialize device-accessible pointers
+ * // for input and output
+ * int num_segments;   // e.g., 3
+ * int *d_offsets;     // e.g., [0, 3, 3, 7]
+ * int *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
+ * int *d_out;         // e.g., [-, -, -]
+ * ...
+ *
+ * // Determine temporary device storage requirements
+ * void *d_temp_storage = NULL;
+ * size_t temp_storage_bytes = 0;
+ * cub::DeviceSegmentedReduce::Sum(
+ *   d_temp_storage, temp_storage_bytes, d_in, d_out,
+ *   num_segments, d_offsets, d_offsets + 1);
+ *
+ * // Allocate temporary storage
+ * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+ *
+ * // Run sum-reduction
+ * cub::DeviceSegmentedReduce::Sum(
+ *   d_temp_storage, temp_storage_bytes, d_in, d_out,
+ *   num_segments, d_offsets, d_offsets + 1);
+ *
+ * // d_out <-- [21, 0, 17]
+ * @endcode
+ *
+ * @tparam InputIteratorT
+ *   **[inferred]** Random-access input iterator type for reading input
+ *   items \iterator
+ *
+ * @tparam OutputIteratorT
+ *   **[inferred]** Output iterator type for recording the reduced aggregate
+ *   \iterator
+ *
+ * @tparam BeginOffsetIteratorT
+ *   **[inferred]** Random-access input iterator type for reading segment
+ *   beginning offsets \iterator
+ *
+ * @tparam EndOffsetIteratorT
+ *   **[inferred]** Random-access input iterator type for reading segment
+ *   ending offsets \iterator
+ *
+ * @param[in] d_temp_storage
+ *   Device-accessible allocation of temporary storage. When `nullptr`, the
+ *   required allocation size is written to `temp_storage_bytes` and no work
+ *   is done.
+ *
+ * @param[in,out] temp_storage_bytes
+ *   Reference to size in bytes of `d_temp_storage` allocation
+ *
+ * @param[in] d_in
+ *   Pointer to the input sequence of data items
+ *
+ * @param[out] d_out
+ *   Pointer to the output aggregate
+ *
+ * @param[in] num_segments
+ *   The number of segments that comprise the segmented reduction data
+ *
+ * @param[in] d_begin_offsets
+ *   Random-access input iterator to the sequence of beginning offsets of
+ *   length `num_segments`, such that `d_begin_offsets[i]` is the first
+ *   element of the *i*th data segment in `d_in`
+ *
+ * @param[in] d_end_offsets
+ *   Random-access input iterator to the sequence of ending offsets of length
+ *   `num_segments`, such that `d_end_offsets[i] - 1` is the last element of
+ *   the *i*th data segment in `d_in`. If
+ *   `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th segment is
+ *   considered empty.
+ *
+ * @param[in] stream
+ *   **[optional]** CUDA stream to launch kernels within.
+ *   Default is stream0.
+ *
+ * @param[in] debug_synchronous
+ *   **[optional]** Whether or not to synchronize the stream after every
+ *   kernel launch to check for errors. Also causes launch configurations to
+ *   be printed to the console. Default is `false`.
+ */
+ template <typename InputIteratorT,
+           typename OutputIteratorT,
+           typename BeginOffsetIteratorT,
+           typename EndOffsetIteratorT>
+ CUB_RUNTIME_FUNCTION static cudaError_t
+ Sum(void *d_temp_storage,
+     size_t &temp_storage_bytes,
+     InputIteratorT d_in,
+     OutputIteratorT d_out,
+     int num_segments,
+     BeginOffsetIteratorT d_begin_offsets,
+     EndOffsetIteratorT d_end_offsets,
+     cudaStream_t stream = 0,
+     bool debug_synchronous = false)
+ {
+   // Signed integer type for global offsets
+   using OffsetT = int;
+
+   // The output value type
+   using OutputT =
+     cub::detail::non_void_value_t<OutputIteratorT,
+                                   cub::detail::value_t<InputIteratorT>>;
+
+   return DispatchSegmentedReduce<
+     InputIteratorT,
+     OutputIteratorT,
+     BeginOffsetIteratorT,
+     EndOffsetIteratorT,
+     OffsetT,
+     cub::Sum>::Dispatch(d_temp_storage,
+                         temp_storage_bytes,
+                         d_in,
+                         d_out,
+                         num_segments,
+                         d_begin_offsets,
+                         d_end_offsets,
+                         cub::Sum(),
+                         OutputT(), // zero-initialize
+                         stream,
+                         debug_synchronous);
+ }
+
+ /**
+ * @brief Computes a device-wide segmented minimum using the less-than
+ *        (`<`) operator.
+ *
+ * @par
+ * - Uses `std::numeric_limits<T>::max()` as the initial value of the
+ *   reduction for each segment.
+ * - When the input is a contiguous sequence of segments, a single sequence
+ *   `segment_offsets` (of length `num_segments + 1`) can be aliased for both
+ *   the `d_begin_offsets` and `d_end_offsets` parameters (where the latter is
+ *   specified as `segment_offsets + 1`).
+ * - Does not support `<` operators that are non-commutative.
+ * - @devicestorage
+ *
+ * @par Snippet
+ * The code snippet below illustrates the min-reduction of a device vector of
+ * `int` data elements.
+ * @par
+ * @code
+ * #include <cub/cub.cuh>
+ * // or equivalently <cub/device/device_segmented_reduce.cuh>
+ *
+ * // Declare, allocate, and initialize device-accessible pointers
+ * // for input and output
+ * int num_segments;   // e.g., 3
+ * int *d_offsets;     // e.g., [0, 3, 3, 7]
+ * int *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
+ * int *d_out;         // e.g., [-, -, -]
+ * ...
+ *
+ * // Determine temporary device storage requirements
+ * void *d_temp_storage = NULL;
+ * size_t temp_storage_bytes = 0;
+ * cub::DeviceSegmentedReduce::Min(
+ *   d_temp_storage, temp_storage_bytes, d_in, d_out,
+ *   num_segments, d_offsets, d_offsets + 1);
+ *
+ * // Allocate temporary storage
+ * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+ *
+ * // Run min-reduction
+ * cub::DeviceSegmentedReduce::Min(
+ *   d_temp_storage, temp_storage_bytes, d_in, d_out,
+ *   num_segments, d_offsets, d_offsets + 1);
+ *
+ * // d_out <-- [6, INT_MAX, 0]
+ * @endcode
+ *
+ * @tparam InputIteratorT
+ *   **[inferred]** Random-access input iterator type for reading input
+ *   items \iterator
+ *
+ * @tparam OutputIteratorT
+ *   **[inferred]** Output iterator type for recording the reduced
+ *   aggregate \iterator
+ *
+ * @tparam BeginOffsetIteratorT
+ *   **[inferred]** Random-access input iterator type for reading segment
+ *   beginning offsets \iterator
+ *
+ * @tparam EndOffsetIteratorT
+ *   **[inferred]** Random-access input iterator type for reading segment
+ *   ending offsets \iterator
+ *
+ * @param[in] d_temp_storage
+ *   Device-accessible allocation of temporary storage. When `nullptr`, the
+ *   required allocation size is written to `temp_storage_bytes` and no work
+ *   is done.
+ *
+ * @param[in,out] temp_storage_bytes
+ *   Reference to size in bytes of `d_temp_storage` allocation
+ *
+ * @param[in] d_in
+ *   Pointer to the input sequence of data items
+ *
+ * @param[out] d_out
+ *   Pointer to the output aggregate
+ *
+ * @param[in] num_segments
+ *   The number of segments that comprise the segmented reduction data
+ *
+ * @param[in] d_begin_offsets
+ *   Random-access input iterator to the sequence of beginning offsets of
+ *   length `num_segments`, such that `d_begin_offsets[i]` is the first
+ *   element of the *i*th data segment in `d_in`
+ *
+ * @param[in] d_end_offsets
+ *   Random-access input iterator to the sequence of ending offsets of length
+ *   `num_segments`, such that `d_end_offsets[i] - 1` is the last element of
+ *   the *i*th data segment in `d_in`. If
+ *   `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th segment is
+ *   considered empty.
+ *
+ * @param[in] stream
+ *   **[optional]** CUDA stream to launch kernels within.
+ *   Default is stream0.
+ *
+ * @param[in] debug_synchronous
+ *   **[optional]** Whether or not to synchronize the stream after every
+ *   kernel launch to check for errors. Also causes launch configurations to
+ *   be printed to the console. Default is `false`.
+ */
+ template <typename InputIteratorT,
+           typename OutputIteratorT,
+           typename BeginOffsetIteratorT,
+           typename EndOffsetIteratorT>
+ CUB_RUNTIME_FUNCTION static cudaError_t
+ Min(void *d_temp_storage,
+     size_t &temp_storage_bytes,
+     InputIteratorT d_in,
+     OutputIteratorT d_out,
+     int num_segments,
+     BeginOffsetIteratorT d_begin_offsets,
+     EndOffsetIteratorT d_end_offsets,
+     cudaStream_t stream = 0,
+     bool debug_synchronous = false)
+ {
+   // Signed integer type for global offsets
+   using OffsetT = int;
+
+   // The input value type
+   using InputT = cub::detail::value_t<InputIteratorT>;
+
+   return DispatchSegmentedReduce<
+     InputIteratorT,
+     OutputIteratorT,
+     BeginOffsetIteratorT,
+     EndOffsetIteratorT,
+     OffsetT,
+     cub::Min>::Dispatch(d_temp_storage,
+                         temp_storage_bytes,
+                         d_in,
+                         d_out,
+                         num_segments,
+                         d_begin_offsets,
+                         d_end_offsets,
+                         cub::Min(),
+                         // replace with std::numeric_limits<T>::max() when
+                         // C++11 support is more prevalent
+                         Traits<InputT>::Max(),
+                         stream,
+                         debug_synchronous);
+ }
+
+ /**
+ * @brief Finds the first device-wide minimum in each segment using the
+ *        less-than ('<') operator, also returning the in-segment index of
+ *        that item.
+ *
+ * @par
+ * - The output value type of `d_out` is cub::KeyValuePair `<int, T>`
+ *   (assuming the value type of `d_in` is `T`)
+ * - The minimum of the *i*th segment is written to
+ *   `d_out[i].value` and its offset in that segment is written to
+ *   `d_out[i].key`.
+ * - The `{1, std::numeric_limits<T>::max()}` tuple is produced for
+ *   zero-length inputs
+ * - When the input is a contiguous sequence of segments, a single sequence
+ *   `segment_offsets` (of length `num_segments + 1`) can be aliased for both
+ *   the `d_begin_offsets` and `d_end_offsets` parameters (where the latter
+ *   is specified as `segment_offsets + 1`).
+ * - Does not support `<` operators that are non-commutative.
+ * - @devicestorage
+ *
+ * @par Snippet
+ * The code snippet below illustrates the argmin-reduction of a device vector
+ * of `int` data elements.
+ * @par
+ * @code
+ * #include <cub/cub.cuh>
+ * // or equivalently <cub/device/device_segmented_reduce.cuh>
+ *
+ * // Declare, allocate, and initialize device-accessible pointers
+ * // for input and output
+ * int num_segments;   // e.g., 3
+ * int *d_offsets;     // e.g., [0, 3, 3, 7]
+ * int *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
+ * KeyValuePair<int, int> *d_out;  // e.g., [{-,-}, {-,-}, {-,-}]
+ * ...
+ *
+ * // Determine temporary device storage requirements
+ * void *d_temp_storage = NULL;
+ * size_t temp_storage_bytes = 0;
+ * cub::DeviceSegmentedReduce::ArgMin(
+ *   d_temp_storage, temp_storage_bytes, d_in, d_out,
+ *   num_segments, d_offsets, d_offsets + 1);
+ *
+ * // Allocate temporary storage
+ * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+ *
+ * // Run argmin-reduction
+ * cub::DeviceSegmentedReduce::ArgMin(
+ *   d_temp_storage, temp_storage_bytes, d_in, d_out,
+ *   num_segments, d_offsets, d_offsets + 1);
+ *
+ * // d_out <-- [{1,6}, {1,INT_MAX}, {2,0}]
+ * @endcode
+ *
+ * @tparam InputIteratorT
+ *   **[inferred]** Random-access input iterator type for reading input items
+ *   (of some type `T`) \iterator
+ *
+ * @tparam OutputIteratorT
+ *   **[inferred]** Output iterator type for recording the reduced aggregate
+ *   (having value type `KeyValuePair<int, T>`) \iterator
+ *
+ * @tparam BeginOffsetIteratorT
+ *   **[inferred]** Random-access input iterator type for reading segment
+ *   beginning offsets \iterator
+ *
+ * @tparam EndOffsetIteratorT
+ *   **[inferred]** Random-access input iterator type for reading segment
+ *   ending offsets \iterator
+ *
+ * @param[in] d_temp_storage
+ *   Device-accessible allocation of temporary storage. When `nullptr`, the
+ *   required allocation size is written to `temp_storage_bytes` and no work
+ *   is done.
+ *
+ * @param[in,out] temp_storage_bytes
+ *   Reference to size in bytes of `d_temp_storage` allocation
+ *
+ * @param[in] d_in
+ *   Pointer to the input sequence of data items
+ *
+ * @param[out] d_out
+ *   Pointer to the output aggregate
+ *
+ * @param[in] num_segments
+ *   The number of segments that comprise the segmented reduction data
+ *
+ * @param[in] d_begin_offsets
+ *   Random-access input iterator to the sequence of beginning offsets of
+ *   length `num_segments`, such that `d_begin_offsets[i]` is the first
+ *   element of the *i*th data segment in `d_in`
+ *
+ * @param[in] d_end_offsets
+ *   Random-access input iterator to the sequence of ending offsets of length
+ *   `num_segments`, such that `d_end_offsets[i] - 1` is the last element of
+ *   the *i*th data segment in `d_in`. If
+ *   `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th segment is
+ *   considered empty.
+ *
+ * @param[in] stream
+ *   **[optional]** CUDA stream to launch kernels within.
+ *   Default is stream0.
+ *
+ * @param[in] debug_synchronous
+ *   **[optional]** Whether or not to synchronize the stream after every
+ *   kernel launch to check for errors. Also causes launch configurations to
+ *   be printed to the console. Default is `false`.
+ */
+ template <typename InputIteratorT,
+           typename OutputIteratorT,
+           typename BeginOffsetIteratorT,
+           typename EndOffsetIteratorT>
+ CUB_RUNTIME_FUNCTION static cudaError_t
+ ArgMin(void *d_temp_storage,
+        size_t &temp_storage_bytes,
+        InputIteratorT d_in,
+        OutputIteratorT d_out,
+        int num_segments,
+        BeginOffsetIteratorT d_begin_offsets,
+        EndOffsetIteratorT d_end_offsets,
+        cudaStream_t stream = 0,
+        bool debug_synchronous = false)
+ {
+   // Signed integer type for global offsets
+   using OffsetT = int;
+
+   // The input type
+   using InputValueT = cub::detail::value_t<InputIteratorT>;
+
+   // The output tuple type
+   using OutputTupleT =
+     cub::detail::non_void_value_t<OutputIteratorT,
+                                   KeyValuePair<OffsetT, InputValueT>>;
+
+   // The output value type
+   using OutputValueT = typename OutputTupleT::Value;
+
+   // Wrapped input iterator to produce index-value tuples
+   using ArgIndexInputIteratorT =
+     ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT>;
+
+   ArgIndexInputIteratorT d_indexed_in(d_in);
+
+   // Initial value, replace with std::numeric_limits<T>::max() when C++11
+   // support is more prevalent
+   OutputTupleT initial_value(1, Traits<InputValueT>::Max());
+
+   return DispatchSegmentedReduce<ArgIndexInputIteratorT,
+                                  OutputIteratorT,
+                                  BeginOffsetIteratorT,
+                                  EndOffsetIteratorT,
+                                  OffsetT,
+                                  cub::ArgMin>::Dispatch(d_temp_storage,
+                                                         temp_storage_bytes,
+                                                         d_indexed_in,
+                                                         d_out,
+                                                         num_segments,
+                                                         d_begin_offsets,
+                                                         d_end_offsets,
+                                                         cub::ArgMin(),
+                                                         initial_value,
+                                                         stream,
+                                                         debug_synchronous);
+ }
+
+ /**
+ * @brief Computes a device-wide segmented maximum using the greater-than
+ *        (`>`) operator.
+ *
+ * @par
+ * - Uses `std::numeric_limits<T>::lowest()` as the initial value of the
+ *   reduction.
+ * - When the input is a contiguous sequence of segments, a single sequence
+ *   `segment_offsets` (of length `num_segments + 1`) can be aliased
+ *   for both the `d_begin_offsets` and `d_end_offsets` parameters (where
+ *   the latter is specified as `segment_offsets + 1`).
+ * - Does not support `>` operators that are non-commutative.
+ * - @devicestorage
+ *
+ * @par Snippet
+ * The code snippet below illustrates the max-reduction of a device vector
+ * of `int` data elements.
+ * @par
+ * @code
+ * #include <cub/cub.cuh>
+ * // or equivalently <cub/device/device_segmented_reduce.cuh>
+ *
+ * // Declare, allocate, and initialize device-accessible pointers
+ * // for input and output
+ * int num_segments;   // e.g., 3
+ * int *d_offsets;     // e.g., [0, 3, 3, 7]
+ * int *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
+ * int *d_out;         // e.g., [-, -, -]
+ * ...
+ *
+ * // Determine temporary device storage requirements
+ * void *d_temp_storage = NULL;
+ * size_t temp_storage_bytes = 0;
+ * cub::DeviceSegmentedReduce::Max(
+ *   d_temp_storage, temp_storage_bytes, d_in, d_out,
+ *   num_segments, d_offsets, d_offsets + 1);
+ *
+ * // Allocate temporary storage
+ * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+ *
+ * // Run max-reduction
+ * cub::DeviceSegmentedReduce::Max(
+ *   d_temp_storage, temp_storage_bytes, d_in, d_out,
+ *   num_segments, d_offsets, d_offsets + 1);
+ *
+ * // d_out <-- [8, INT_MIN, 9]
+ * @endcode
+ *
+ * @tparam InputIteratorT
+ *   **[inferred]** Random-access input iterator type for reading input
+ *   items \iterator
+ *
+ * @tparam OutputIteratorT
+ *   **[inferred]** Output iterator type for recording the reduced
+ *   aggregate \iterator
+ *
+ * @tparam BeginOffsetIteratorT
+ *   **[inferred]** Random-access input iterator type for reading segment
+ *   beginning offsets \iterator
+ *
+ * @tparam EndOffsetIteratorT
+ *   **[inferred]** Random-access input iterator type for reading segment
+ *   ending offsets \iterator
+ *
+ * @param[in] d_temp_storage
+ *   Device-accessible allocation of temporary storage. When `nullptr`, the
+ *   required allocation size is written to `temp_storage_bytes` and no work
+ *   is done.
+ *
+ * @param[in,out] temp_storage_bytes
+ *   Reference to size in bytes of `d_temp_storage` allocation
+ *
+ * @param[in] d_in
+ *   Pointer to the input sequence of data items
+ *
+ * @param[out] d_out
+ *   Pointer to the output aggregate
+ *
+ * @param[in] num_segments
+ *   The number of segments that comprise the segmented reduction data
+ *
+ * @param[in] d_begin_offsets
+ *   Random-access input iterator to the sequence of beginning offsets of
+ *   length `num_segments`, such that `d_begin_offsets[i]` is the first
+ *   element of the *i*th data segment in `d_in`
+ *
+ * @param[in] d_end_offsets
+ *   Random-access input iterator to the sequence of ending offsets of length
+ *   `num_segments`, such that `d_end_offsets[i] - 1` is the last element of
+ *   the *i*th data segment in `d_in`. If
+ *   `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th segment is
+ *   considered empty.
+ *
+ * @param[in] stream
+ *   **[optional]** CUDA stream to launch kernels within.
+ *   Default is stream0.
+ *
+ * @param[in] debug_synchronous
+ *   **[optional]** Whether or not to synchronize the stream after every
+ *   kernel launch to check for errors. Also causes launch configurations to
+ *   be printed to the console. Default is `false`.
+   */
+  template <typename InputIteratorT,
+            typename OutputIteratorT,
+            typename BeginOffsetIteratorT,
+            typename EndOffsetIteratorT>
+  CUB_RUNTIME_FUNCTION static cudaError_t
+  Max(void *d_temp_storage,
+      size_t &temp_storage_bytes,
+      InputIteratorT d_in,
+      OutputIteratorT d_out,
+      int num_segments,
+      BeginOffsetIteratorT d_begin_offsets,
+      EndOffsetIteratorT d_end_offsets,
+      cudaStream_t stream = 0,
+      bool debug_synchronous = false)
+  {
+    // Signed integer type for global offsets
+    using OffsetT = int;
+
+    // The input value type
+    using InputT = cub::detail::value_t<InputIteratorT>;
+
+    return DispatchSegmentedReduce<
+      InputIteratorT,
+      OutputIteratorT,
+      BeginOffsetIteratorT,
+      EndOffsetIteratorT,
+      OffsetT,
+      cub::Max>::Dispatch(d_temp_storage,
+                          temp_storage_bytes,
+                          d_in,
+                          d_out,
+                          num_segments,
+                          d_begin_offsets,
+                          d_end_offsets,
+                          cub::Max(),
+                          Traits<InputT>::Lowest(), // replace with
+                                                    // std::numeric_limits<T>::lowest()
+                                                    // when C++11 support is
+                                                    // more prevalent
+                          stream,
+                          debug_synchronous);
+  }
+
+  /**
+   * @brief Finds the first device-wide maximum in each segment using the
+   *        greater-than (`>`) operator, also returning the in-segment index of
+   *        that item
+   *
+   * @par
+   * - The output value type of `d_out` is cub::KeyValuePair `<int, T>`
+   *   (assuming the value type of `d_in` is `T`)
+   * - The maximum of the *i*th segment is written to
+   *   `d_out[i].value` and its offset in that segment is written to
+   *   `d_out[i].key`.
+   * - The `{1, std::numeric_limits<T>::lowest()}` tuple is produced for
+   *   zero-length inputs
+   * - When the input is a contiguous sequence of segments, a single sequence
+   *   `segment_offsets` (of length `num_segments + 1`) can be aliased
+   *   for both the `d_begin_offsets` and `d_end_offsets` parameters (where
+   *   the latter is specified as `segment_offsets + 1`).
+   * - Does not support `>` operators that are non-commutative.
+   * - @devicestorage
+   *
+   * @par Snippet
+   * The code snippet below illustrates the argmax-reduction of a device
+   * vector of `int` data elements.
+   * @par
+   * @code
+   * #include <cub/cub.cuh>
+   * // or equivalently <cub/device/device_segmented_reduce.cuh>
+   *
+   * // Declare, allocate, and initialize device-accessible pointers
+   * // for input and output
+   * int num_segments;              // e.g., 3
+   * int *d_offsets;                // e.g., [0, 3, 3, 7]
+   * int *d_in;                     // e.g., [8, 6, 7, 5, 3, 0, 9]
+   * KeyValuePair<int, int> *d_out; // e.g., [{-,-}, {-,-}, {-,-}]
+   * ...
+   *
+   * // Determine temporary device storage requirements
+   * void *d_temp_storage = NULL;
+   * size_t temp_storage_bytes = 0;
+   * cub::DeviceSegmentedReduce::ArgMax(
+   *   d_temp_storage, temp_storage_bytes, d_in, d_out,
+   *   num_segments, d_offsets, d_offsets + 1);
+   *
+   * // Allocate temporary storage
+   * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+   *
+   * // Run argmax-reduction
+   * cub::DeviceSegmentedReduce::ArgMax(
+   *   d_temp_storage, temp_storage_bytes, d_in, d_out,
+   *   num_segments, d_offsets, d_offsets + 1);
+   *
+   * // d_out <-- [{0,8}, {1,INT_MIN}, {3,9}]
+   * @endcode
+   *
+   * @tparam InputIteratorT
+   *   **[inferred]** Random-access input iterator type for reading input items
+   *   (of some type `T`) \iterator
+   *
+   * @tparam OutputIteratorT
+   *   **[inferred]** Output iterator type for recording the reduced aggregate
+   *   (having value type `KeyValuePair<int, T>`) \iterator
+   *
+   * @tparam BeginOffsetIteratorT
+   *   **[inferred]** Random-access input iterator type for reading segment
+   *   beginning offsets \iterator
+   *
+   * @tparam EndOffsetIteratorT
+   *   **[inferred]** Random-access input iterator type for reading segment
+   *   ending offsets \iterator
+   *
+   * @param[in] d_temp_storage
+   *   Device-accessible allocation of temporary storage.
When `nullptr`, the
+   *   required allocation size is written to `temp_storage_bytes` and no work
+   *   is done.
+   *
+   * @param[in,out] temp_storage_bytes
+   *   Reference to size in bytes of `d_temp_storage` allocation
+   *
+   * @param[in] d_in
+   *   Pointer to the input sequence of data items
+   *
+   * @param[out] d_out
+   *   Pointer to the output aggregate
+   *
+   * @param[in] num_segments
+   *   The number of segments that comprise the input data
+   *
+   * @param[in] d_begin_offsets
+   *   Random-access input iterator to the sequence of beginning offsets of
+   *   length `num_segments`, such that `d_begin_offsets[i]` is the first
+   *   element of the *i*th data segment in `d_in`
+   *
+   * @param[in] d_end_offsets
+   *   Random-access input iterator to the sequence of ending offsets of
+   *   length `num_segments`, such that `d_end_offsets[i] - 1` is the last
+   *   element of the *i*th data segment in `d_in`. If
+   *   `d_end_offsets[i] <= d_begin_offsets[i]`, the *i*th segment is
+   *   considered empty.
+   *
+   * @param[in] stream
+   *   **[optional]** CUDA stream to launch kernels within.
+   *   Default is stream<sub>0</sub>.
+   *
+   * @param[in] debug_synchronous
+   *   **[optional]** Whether or not to synchronize the stream after every
+   *   kernel launch to check for errors. Also causes launch configurations to
+   *   be printed to the console. Default is `false`.
+   */
+  template <typename InputIteratorT,
+            typename OutputIteratorT,
+            typename BeginOffsetIteratorT,
+            typename EndOffsetIteratorT>
+  CUB_RUNTIME_FUNCTION static cudaError_t
+  ArgMax(void *d_temp_storage,
+         size_t &temp_storage_bytes,
+         InputIteratorT d_in,
+         OutputIteratorT d_out,
+         int num_segments,
+         BeginOffsetIteratorT d_begin_offsets,
+         EndOffsetIteratorT d_end_offsets,
+         cudaStream_t stream = 0,
+         bool debug_synchronous = false)
+  {
+    // Signed integer type for global offsets
+    using OffsetT = int;
+
+    // The input type
+    using InputValueT = cub::detail::value_t<InputIteratorT>;
+
+    // The output tuple type
+    using OutputTupleT =
+      cub::detail::non_void_value_t<OutputIteratorT,
+                                    KeyValuePair<OffsetT, InputValueT>>;
+
+    // The output value type
+    using OutputValueT = typename OutputTupleT::Value;
+
+    // Wrapped input iterator to produce index-value tuples
+    using ArgIndexInputIteratorT =
+      ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT>;
+
+    ArgIndexInputIteratorT d_indexed_in(d_in);
+
+    // Initial value
+    OutputTupleT initial_value(1, Traits<InputValueT>::Lowest()); // replace
+                                                                  // with
+                                                                  // std::numeric_limits<T>::lowest()
+                                                                  // when C++11
+                                                                  // support is
+                                                                  // more
+                                                                  // prevalent
+
+    return DispatchSegmentedReduce<ArgIndexInputIteratorT,
+                                   OutputIteratorT,
+                                   BeginOffsetIteratorT,
+                                   EndOffsetIteratorT,
+                                   OffsetT,
+                                   cub::ArgMax>::Dispatch(d_temp_storage,
+                                                          temp_storage_bytes,
+                                                          d_indexed_in,
+                                                          d_out,
+                                                          num_segments,
+                                                          d_begin_offsets,
+                                                          d_end_offsets,
+                                                          cub::ArgMax(),
+                                                          initial_value,
+                                                          stream,
+                                                          debug_synchronous);
+  }
 };
 
 CUB_NAMESPACE_END

From 5732130c783784da755d3c88cccce74c4da4d502 Mon Sep 17 00:00:00 2001
From: Georgy Evtushenko
Date: Tue, 31 May 2022 17:58:45 +0400
Subject: [PATCH 4/4] Add in-place guarantees for segmented reduce

---
 cub/device/device_segmented_reduce.cuh | 43 ++++++++++++++++++++------
 1 file changed, 34 insertions(+), 9 deletions(-)

diff --git a/cub/device/device_segmented_reduce.cuh b/cub/device/device_segmented_reduce.cuh
index fd830efea6..3d16f30fb9 100644
--- a/cub/device/device_segmented_reduce.cuh
+++ b/cub/device/device_segmented_reduce.cuh
@@ -77,6 +77,11 @@ struct DeviceSegmentedReduce
    *   `segment_offsets` (of length `num_segments + 1`) can be aliased
    *   for both the `d_begin_offsets` and `d_end_offsets` parameters (where
    *   the latter is specified as `segment_offsets + 1`).
+   * - Let `s` be in `[0, num_segments)`. The range
+   *   `[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])` shall not
+   *   overlap `[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])`,
+   *   `[d_begin_offsets, d_begin_offsets + num_segments)` nor
+   *   `[d_end_offsets, d_end_offsets + num_segments)`.
    * - @devicestorage
    *
    * @par Snippet
@@ -245,6 +250,11 @@ struct DeviceSegmentedReduce
    *   for both the `d_begin_offsets` and `d_end_offsets` parameters (where
    *   the latter is specified as `segment_offsets + 1`).
    * - Does not support `+` operators that are non-commutative.
+   * - Let `s` be in `[0, num_segments)`. The range
+   *   `[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])` shall not
+   *   overlap `[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])`,
+   *   `[d_begin_offsets, d_begin_offsets + num_segments)` nor
+   *   `[d_end_offsets, d_end_offsets + num_segments)`.
    * - @devicestorage
    *
    * @par Snippet
@@ -390,6 +400,11 @@ struct DeviceSegmentedReduce
    *   the `d_begin_offsets` and `d_end_offsets` parameters (where the latter is
    *   specified as `segment_offsets + 1`).
    * - Does not support `<` operators that are non-commutative.
+   * - Let `s` be in `[0, num_segments)`. The range
+   *   `[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])` shall not
+   *   overlap `[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])`,
+   *   `[d_begin_offsets, d_begin_offsets + num_segments)` nor
+   *   `[d_end_offsets, d_end_offsets + num_segments)`.
    * - @devicestorage
    *
    * @par Snippet
@@ -542,6 +557,11 @@ struct DeviceSegmentedReduce
    *   the `d_begin_offsets` and `d_end_offsets` parameters (where the latter
    *   is specified as `segment_offsets + 1`).
    * - Does not support `<` operators that are non-commutative.
+   * - Let `s` be in `[0, num_segments)`. The range
+   *   `[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])` shall not
+   *   overlap `[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])`,
+   *   `[d_begin_offsets, d_begin_offsets + num_segments)` nor
+   *   `[d_end_offsets, d_end_offsets + num_segments)`.
    * - @devicestorage
    *
    * @par Snippet
@@ -705,6 +725,11 @@ struct DeviceSegmentedReduce
    *   for both the `d_begin_offsets` and `d_end_offsets` parameters (where
    *   the latter is specified as `segment_offsets + 1`).
    * - Does not support `>` operators that are non-commutative.
+   * - Let `s` be in `[0, num_segments)`. The range
+   *   `[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])` shall not
+   *   overlap `[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])`,
+   *   `[d_begin_offsets, d_begin_offsets + num_segments)` nor
+   *   `[d_end_offsets, d_end_offsets + num_segments)`.
    * - @devicestorage
    *
    * @par Snippet
@@ -845,7 +870,7 @@ struct DeviceSegmentedReduce
    *        that item
    *
    * @par
-   * - The output value type of `d_out` is cub::KeyValuePair `<int, T>`
+   * - The output value type of `d_out` is `cub::KeyValuePair<int, T>`
    *   (assuming the value type of `d_in` is `T`)
    * - The maximum of the *i*th segment is written to
    *   `d_out[i].value` and its offset in that segment is written to
@@ -857,6 +882,11 @@ struct DeviceSegmentedReduce
    *   for both the `d_begin_offsets` and `d_end_offsets` parameters (where
    *   the latter is specified as `segment_offsets + 1`).
    * - Does not support `>` operators that are non-commutative.
+   * - Let `s` be in `[0, num_segments)`. The range
+   *   `[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])` shall not
+   *   overlap `[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])`,
+   *   `[d_begin_offsets, d_begin_offsets + num_segments)` nor
+   *   `[d_end_offsets, d_end_offsets + num_segments)`.
    * - @devicestorage
    *
    * @par Snippet
@@ -983,14 +1013,9 @@ struct DeviceSegmentedReduce
 
     ArgIndexInputIteratorT d_indexed_in(d_in);
 
-    // Initial value
-    OutputTupleT initial_value(1, Traits<InputValueT>::Lowest()); // replace
-                                                                  // with
-                                                                  // std::numeric_limits<T>::lowest()
-                                                                  // when C++11
-                                                                  // support is
-                                                                  // more
-                                                                  // prevalent
+    // Initial value, replace with std::numeric_limits<T>::lowest() when C++11
+    // support is more prevalent
+    OutputTupleT initial_value(1, Traits<InputValueT>::Lowest());
 
     return DispatchSegmentedReduce
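
As a quick end-to-end check of the interface documented in these patches, the
standalone sketch below (not part of the patch series; the file name and host
values are illustrative) expands the ArgMax doc snippet with the allocation
and copy steps it elides. It uses only the `cub::DeviceSegmentedReduce::ArgMax`
signature and the `KeyValuePair` members (`key`, `value`) shown above.

// argmax_smoke.cu -- illustrative sketch, not part of the patch series
#include <cub/cub.cuh>
#include <cstdio>

int main()
{
  // Mirrors the doc snippet: 3 segments over 7 items, segment 1 empty
  const int num_segments = 3;
  int h_offsets[] = {0, 3, 3, 7};
  int h_in[]      = {8, 6, 7, 5, 3, 0, 9};

  int *d_offsets = nullptr;
  int *d_in      = nullptr;
  cub::KeyValuePair<int, int> *d_out = nullptr;
  cudaMalloc(&d_offsets, sizeof(h_offsets));
  cudaMalloc(&d_in, sizeof(h_in));
  cudaMalloc(&d_out, num_segments * sizeof(*d_out));
  cudaMemcpy(d_offsets, h_offsets, sizeof(h_offsets), cudaMemcpyHostToDevice);
  cudaMemcpy(d_in, h_in, sizeof(h_in), cudaMemcpyHostToDevice);

  // Two-phase idiom: the first call only sizes the temporary storage.
  // Note the aliased begin/end offset sequences (d_offsets / d_offsets + 1),
  // which the documentation above explicitly allows.
  void *d_temp_storage = nullptr;
  size_t temp_storage_bytes = 0;
  cub::DeviceSegmentedReduce::ArgMax(d_temp_storage, temp_storage_bytes,
                                     d_in, d_out, num_segments,
                                     d_offsets, d_offsets + 1);
  cudaMalloc(&d_temp_storage, temp_storage_bytes);
  cub::DeviceSegmentedReduce::ArgMax(d_temp_storage, temp_storage_bytes,
                                     d_in, d_out, num_segments,
                                     d_offsets, d_offsets + 1);

  cub::KeyValuePair<int, int> h_out[num_segments];
  cudaMemcpy(h_out, d_out, sizeof(h_out), cudaMemcpyDeviceToHost);
  for (int s = 0; s < num_segments; ++s)
  {
    // Expected per the docs: {0,8}, {1,INT_MIN}, {3,9}
    std::printf("segment %d: max %d at in-segment offset %d\n",
                s, h_out[s].value, h_out[s].key);
  }

  cudaFree(d_temp_storage);
  cudaFree(d_out);
  cudaFree(d_in);
  cudaFree(d_offsets);
  return 0;
}

Built against the patched headers (e.g., `nvcc argmax_smoke.cu -o argmax_smoke`),
this should print the three pairs from the snippet, with the empty middle
segment yielding the `{1, std::numeric_limits<T>::lowest()}` tuple described
in the ArgMax documentation.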