From 66ea19ba91bd6a4ddb5641e3c487f938401bd13a Mon Sep 17 00:00:00 2001
From: Georgy Evtushenko <evtushenko.georgy@gmail.com>
Date: Fri, 27 May 2022 20:05:37 +0400
Subject: [PATCH 1/4] In-place guarantees for segmented sort

---
 cub/device/device_segmented_sort.cuh | 82 +++++++++++++++++++++++++++-
 1 file changed, 81 insertions(+), 1 deletion(-)

diff --git a/cub/device/device_segmented_sort.cuh b/cub/device/device_segmented_sort.cuh
index bc80275fd0..1d73f61152 100644
--- a/cub/device/device_segmented_sort.cuh
+++ b/cub/device/device_segmented_sort.cuh
@@ -1,5 +1,5 @@
 /******************************************************************************
- * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2022, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -131,6 +131,10 @@ struct DeviceSegmentedSort
    *   @p j are equivalent: neither one is less than the other. It is not
    *   guaranteed that the relative order of these two elements will be
    *   preserved by sort.
+   * - The range `[d_keys_out, d_keys_out + num_items)` shall not overlap
+   *   `[d_keys_in, d_keys_in + num_items)`, 
+   *   `[d_begin_offsets, d_begin_offsets + num_segments)` nor
+   *   `[d_end_offsets, d_end_offsets + num_segments)` in any way.
    *
    * @par Snippet
    * The code snippet below illustrates the batched sorting of three segments
@@ -275,6 +279,10 @@ struct DeviceSegmentedSort
    *   @p i and @p j are equivalent: neither one is less than the other. It is
    *   not guaranteed that the relative order of these two elements will be
    *   preserved by sort.
+   * - The range `[d_keys_out, d_keys_out + num_items)` shall not overlap
+   *   `[d_keys_in, d_keys_in + num_items)`, 
+   *   `[d_begin_offsets, d_begin_offsets + num_segments)` nor
+   *   `[d_end_offsets, d_end_offsets + num_segments)` in any way.
    *
    * @par Snippet
    * The code snippet below illustrates the batched sorting of three segments
@@ -426,6 +434,11 @@ struct DeviceSegmentedSort
    *   @p i and @p j are equivalent: neither one is less than the other. It is
    *   not guaranteed that the relative order of these two elements will be
    *   preserved by sort.
+   * - Let `cur = d_keys.Current()` and `alt = d_keys.Alternate()`.
+   *   The range `[cur, cur + num_items)` shall not overlap 
+   *   `[alt, alt + num_items)`. Both ranges shall not overlap
+   *   `[d_begin_offsets, d_begin_offsets + num_segments)` nor
+   *   `[d_end_offsets, d_end_offsets + num_segments)` in any way.
    *
    * @par Snippet
    * The code snippet below illustrates the batched sorting of three segments
@@ -578,6 +591,11 @@ struct DeviceSegmentedSort
    *   @p i and @p j are equivalent: neither one is less than the other. It is
    *   not guaranteed that the relative order of these two elements will be
    *   preserved by sort.
+   * - Let `cur = d_keys.Current()` and `alt = d_keys.Alternate()`.
+   *   The range `[cur, cur + num_items)` shall not overlap 
+   *   `[alt, alt + num_items)`. Both ranges shall not overlap
+   *   `[d_begin_offsets, d_begin_offsets + num_segments)` nor
+   *   `[d_end_offsets, d_end_offsets + num_segments)` in any way.
    *
    * @par Snippet
    * The code snippet below illustrates the batched sorting of three segments
@@ -724,6 +742,10 @@ struct DeviceSegmentedSort
    *   @p x precedes @p y, and if the two elements are equivalent (neither
    *   @p x < @p y nor @p y < @p x) then a postcondition of stable sort is that
    *   @p x still precedes @p y.
+   * - The range `[d_keys_out, d_keys_out + num_items)` shall not overlap
+   *   `[d_keys_in, d_keys_in + num_items)`, 
+   *   `[d_begin_offsets, d_begin_offsets + num_segments)` nor
+   *   `[d_end_offsets, d_end_offsets + num_segments)` in any way.
    *
    * @par Snippet
    * The code snippet below illustrates the batched sorting of three segments
@@ -857,6 +879,10 @@ struct DeviceSegmentedSort
    *   @p x precedes @p y, and if the two elements are equivalent (neither
    *   @p x < @p y nor @p y < @p x) then a postcondition of stable sort is that
    *   @p x still precedes @p y.
+   * - The range `[d_keys_out, d_keys_out + num_items)` shall not overlap
+   *   `[d_keys_in, d_keys_in + num_items)`, 
+   *   `[d_begin_offsets, d_begin_offsets + num_segments)` nor
+   *   `[d_end_offsets, d_end_offsets + num_segments)` in any way.
    *
    * @par Snippet
    * The code snippet below illustrates the batched sorting of three segments
@@ -998,6 +1024,11 @@ struct DeviceSegmentedSort
    *   @p x precedes @p y, and if the two elements are equivalent (neither
    *   @p x < @p y nor @p y < @p x) then a postcondition of stable sort is that
    *   @p x still precedes @p y.
+   * - Let `cur = d_keys.Current()` and `alt = d_keys.Alternate()`.
+   *   The range `[cur, cur + num_items)` shall not overlap 
+   *   `[alt, alt + num_items)`. Both ranges shall not overlap
+   *   `[d_begin_offsets, d_begin_offsets + num_segments)` nor
+   *   `[d_end_offsets, d_end_offsets + num_segments)` in any way.
    *
    * @par Snippet
    * The code snippet below illustrates the batched sorting of three segments
@@ -1138,6 +1169,11 @@ struct DeviceSegmentedSort
    *   @p x precedes @p y, and if the two elements are equivalent (neither
    *   @p x < @p y nor @p y < @p x) then a postcondition of stable sort is that
    *   @p x still precedes @p y.
+   * - Let `cur = d_keys.Current()` and `alt = d_keys.Alternate()`.
+   *   The range `[cur, cur + num_items)` shall not overlap 
+   *   `[alt, alt + num_items)`. Both ranges shall not overlap
+   *   `[d_begin_offsets, d_begin_offsets + num_segments)` nor
+   *   `[d_end_offsets, d_end_offsets + num_segments)` in any way.
    *
    * @par Snippet
    * The code snippet below illustrates the batched sorting of three segments
@@ -1278,6 +1314,11 @@ struct DeviceSegmentedSort
    *   @p j are equivalent: neither one is less than the other. It is not
    *   guaranteed that the relative order of these two elements will be
    *   preserved by sort.
+   * - Let `in` be one of `{d_keys_in, d_values_in}` and `out` be any of
+   *   `{d_keys_out, d_values_out}`. The range `[out, out + num_items)` shall 
+   *   not overlap `[in, in + num_items)`, 
+   *   `[d_begin_offsets, d_begin_offsets + num_segments)` nor
+   *   `[d_end_offsets, d_end_offsets + num_segments)` in any way.
    *
    * @par Snippet
    * The code snippet below illustrates the batched sorting of three segments
@@ -1442,6 +1483,11 @@ struct DeviceSegmentedSort
    *   @p j are equivalent: neither one is less than the other. It is not
    *   guaranteed that the relative order of these two elements will be
    *   preserved by sort.
+   * - Let `in` be one of `{d_keys_in, d_values_in}` and `out` be any of
+   *   `{d_keys_out, d_values_out}`. The range `[out, out + num_items)` shall 
+   *   not overlap `[in, in + num_items)`, 
+   *   `[d_begin_offsets, d_begin_offsets + num_segments)` nor
+   *   `[d_end_offsets, d_end_offsets + num_segments)` in any way.
    *
    * @par Snippet
    * The code snippet below illustrates the batched sorting of three segments
@@ -1615,6 +1661,12 @@ struct DeviceSegmentedSort
    *   @p j are equivalent: neither one is less than the other. It is not
    *   guaranteed that the relative order of these two elements will be
    *   preserved by sort.
+   * - Let `cur` be one of `{d_keys.Current(), d_values.Current()}` and `alt` 
+   *   be any of `{d_keys.Alternate(), d_values.Alternate()}`. The range 
+   *   `[cur, cur + num_items)` shall not overlap 
+   *   `[alt, alt + num_items)`. Both ranges shall not overlap
+   *   `[d_begin_offsets, d_begin_offsets + num_segments)` nor
+   *   `[d_end_offsets, d_end_offsets + num_segments)` in any way.
    *
    * @par Snippet
    * The code snippet below illustrates the batched sorting of three segments
@@ -1782,6 +1834,12 @@ struct DeviceSegmentedSort
    *   @p i and @p j are equivalent: neither one is less than the other. It is
    *   not guaranteed that the relative order of these two elements will be
    *   preserved by sort.
+   * - Let `cur` be one of `{d_keys.Current(), d_values.Current()}` and `alt` 
+   *   be any of `{d_keys.Alternate(), d_values.Alternate()}`. The range 
+   *   `[cur, cur + num_items)` shall not overlap 
+   *   `[alt, alt + num_items)`. Both ranges shall not overlap
+   *   `[d_begin_offsets, d_begin_offsets + num_segments)` nor
+   *   `[d_end_offsets, d_end_offsets + num_segments)` in any way.
    *
    * @par Snippet
    * The code snippet below illustrates the batched sorting of three segments
@@ -1941,6 +1999,11 @@ struct DeviceSegmentedSort
    *   @p x precedes @p y, and if the two elements are equivalent (neither
    *   @p x < @p y nor @p y < @p x) then a postcondition of stable sort is that
    *   @p x still precedes @p y.
+   * - Let `in` be one of `{d_keys_in, d_values_in}` and `out` be any of
+   *   `{d_keys_out, d_values_out}`. The range `[out, out + num_items)` shall 
+   *   not overlap `[in, in + num_items)`, 
+   *   `[d_begin_offsets, d_begin_offsets + num_segments)` nor
+   *   `[d_end_offsets, d_end_offsets + num_segments)` in any way.
    *
    * @par Snippet
    * The code snippet below illustrates the batched sorting of three segments
@@ -2098,6 +2161,11 @@ struct DeviceSegmentedSort
    *   @p x precedes @p y, and if the two elements are equivalent (neither
    *   @p x < @p y nor @p y < @p x) then a postcondition of stable sort is that
    *   @p x still precedes @p y.
+   * - Let `in` be one of `{d_keys_in, d_values_in}` and `out` be any of
+   *   `{d_keys_out, d_values_out}`. The range `[out, out + num_items)` shall 
+   *   not overlap `[in, in + num_items)`, 
+   *   `[d_begin_offsets, d_begin_offsets + num_segments)` nor
+   *   `[d_end_offsets, d_end_offsets + num_segments)` in any way.
    *
    * @par Snippet
    * The code snippet below illustrates the batched sorting of three segments
@@ -2264,6 +2332,12 @@ struct DeviceSegmentedSort
    *   @p x precedes @p y, and if the two elements are equivalent (neither
    *   @p x < @p y nor @p y < @p x) then a postcondition of stable sort is that
    *   @p x still precedes @p y.
+   * - Let `cur` be one of `{d_keys.Current(), d_values.Current()}` and `alt` 
+   *   be any of `{d_keys.Alternate(), d_values.Alternate()}`. The range 
+   *   `[cur, cur + num_items)` shall not overlap 
+   *   `[alt, alt + num_items)`. Both ranges shall not overlap
+   *   `[d_begin_offsets, d_begin_offsets + num_segments)` nor
+   *   `[d_end_offsets, d_end_offsets + num_segments)` in any way.
    *
    * @par Snippet
    * The code snippet below illustrates the batched sorting of three segments
@@ -2425,6 +2499,12 @@ struct DeviceSegmentedSort
    *   @p x precedes @p y, and if the two elements are equivalent (neither
    *   @p x < @p y nor @p y < @p x) then a postcondition of stable sort is that
    *   @p x still precedes @p y.
+   * - Let `cur` be one of `{d_keys.Current(), d_values.Current()}` and `alt` 
+   *   be any of `{d_keys.Alternate(), d_values.Alternate()}`. The range 
+   *   `[cur, cur + num_items)` shall not overlap 
+   *   `[alt, alt + num_items)`. Both ranges shall not overlap
+   *   `[d_begin_offsets, d_begin_offsets + num_segments)` nor
+   *   `[d_end_offsets, d_end_offsets + num_segments)` in any way.
    *
    * @par Snippet
    * The code snippet below illustrates the batched sorting of three segments

From 5a66f1f6981f718990ac65d6f7339a95aff5ec4c Mon Sep 17 00:00:00 2001
From: Georgy Evtushenko <evtushenko.georgy@gmail.com>
Date: Mon, 30 May 2022 10:33:54 +0400
Subject: [PATCH 2/4] Reformat segmented radix sort docs

---
 cub/device/device_segmented_radix_sort.cuh | 2217 ++++++++++++--------
 1 file changed, 1388 insertions(+), 829 deletions(-)

diff --git a/cub/device/device_segmented_radix_sort.cuh b/cub/device/device_segmented_radix_sort.cuh
index 3e5e90db00..135ea6d8c5 100644
--- a/cub/device/device_segmented_radix_sort.cuh
+++ b/cub/device/device_segmented_radix_sort.cuh
@@ -1,7 +1,6 @@
-
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2022, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -14,10 +13,10 @@
  *       names of its contributors may be used to endorse or promote products
  *       derived from this software without specific prior written permission.
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
@@ -28,8 +27,10 @@
  ******************************************************************************/
 
 /**
- * \file
- * cub::DeviceSegmentedRadixSort provides device-wide, parallel operations for computing a batched radix sort across multiple, non-overlapping sequences of data items residing within device-accessible memory.
+ * @file
+ * cub::DeviceSegmentedRadixSort provides device-wide, parallel operations for
+ * computing a batched radix sort across multiple, non-overlapping sequences of
+ * data items residing within device-accessible memory.
  */
 
 #pragma once
@@ -37,844 +38,1402 @@
 #include <stdio.h>
 #include <iterator>
 
-#include "../config.cuh"
-#include "dispatch/dispatch_radix_sort.cuh"
+#include <cub/config.cuh>
+#include <cub/device/dispatch/dispatch_radix_sort.cuh>
 
 CUB_NAMESPACE_BEGIN
 
 
 /**
- * \brief DeviceSegmentedRadixSort provides device-wide, parallel operations for computing a batched radix sort across multiple, non-overlapping sequences of data items residing within device-accessible memory. ![](segmented_sorting_logo.png)
- * \ingroup SegmentedModule
+ * @brief DeviceSegmentedRadixSort provides device-wide, parallel operations 
+ *        for computing a batched radix sort across multiple, non-overlapping 
+ *        sequences of data items residing within device-accessible memory. 
+ *        ![](segmented_sorting_logo.png)
+ * @ingroup SegmentedModule
  *
- * \par Overview
- * The [<em>radix sorting method</em>](http://en.wikipedia.org/wiki/Radix_sort) arranges
- * items into ascending (or descending) order.  The algorithm relies upon a positional representation for
- * keys, i.e., each key is comprised of an ordered sequence of symbols (e.g., digits,
- * characters, etc.) specified from least-significant to most-significant.  For a
- * given input sequence of keys and a set of rules specifying a total ordering
- * of the symbolic alphabet, the radix sorting method produces a lexicographic
- * ordering of those keys.
+ * @par Overview
+ * The [*radix sorting method*](http://en.wikipedia.org/wiki/Radix_sort) 
+ * arranges items into ascending (or descending) order. The algorithm relies 
+ * upon a positional representation for keys, i.e., each key is comprised of an 
+ * ordered sequence of symbols (e.g., digits, characters, etc.) specified from 
+ * least-significant to most-significant.  For a given input sequence of keys 
+ * and a set of rules specifying a total ordering of the symbolic alphabet, the 
+ * radix sorting method produces a lexicographic ordering of those keys.
  *
- * \par See Also
+ * @par See Also
  * DeviceSegmentedRadixSort shares its implementation with DeviceRadixSort. See
  * that algorithm's documentation for more information.
  *
- * \par Usage Considerations
- * \cdp_class{DeviceSegmentedRadixSort}
+ * @par Usage Considerations
+ * @cdp_class{DeviceSegmentedRadixSort}
  *
  */
 struct DeviceSegmentedRadixSort
 {
+  /******************************************************************//**
+   * @name Key-value pairs
+   *********************************************************************/
+  //@{
+
+  /**
+   * @brief Sorts segments of key-value pairs into ascending order. 
+   *        (`~2N` auxiliary storage required)
+   *
+   * @par
+   * - The contents of the input data are not altered by the sorting operation
+   * - When the input is a contiguous sequence of segments, a single sequence
+   *   `segment_offsets` (of length `num_segments + 1`) can be aliased
+   *   for both the `d_begin_offsets` and `d_end_offsets` parameters (where
+   *   the latter is specified as `segment_offsets + 1`).
+   * - An optional bit subrange `[begin_bit, end_bit)` of differentiating key 
+   *   bits can be specified. This can reduce overall sorting overhead and 
+   *   yield a corresponding performance improvement.
+   * - @devicestorageNP For sorting using only `O(P)` temporary storage, see 
+   *   the sorting interface using DoubleBuffer wrappers below.
+   * - @devicestorage
+   *
+   * @par Snippet
+   * The code snippet below illustrates the batched sorting of three segments 
+   * (with one zero-length segment) of `int` keys with associated vector of 
+   * `int` values.
+   * @par
+   * @code
+   * #include <cub/cub.cuh>  
+   * // or equivalently <cub/device/device_segmented_radix_sort.cuh>
+   *
+   * // Declare, allocate, and initialize device-accessible pointers 
+   * // for sorting data
+   * int  num_items;          // e.g., 7
+   * int  num_segments;       // e.g., 3
+   * int  *d_offsets;         // e.g., [0, 3, 3, 7]
+   * int  *d_keys_in;         // e.g., [8, 6, 7, 5, 3, 0, 9]
+   * int  *d_keys_out;        // e.g., [-, -, -, -, -, -, -]
+   * int  *d_values_in;       // e.g., [0, 1, 2, 3, 4, 5, 6]
+   * int  *d_values_out;      // e.g., [-, -, -, -, -, -, -]
+   * ...
+   *
+   * // Determine temporary device storage requirements
+   * void     *d_temp_storage = NULL;
+   * size_t   temp_storage_bytes = 0;
+   * cub::DeviceSegmentedRadixSort::SortPairs(
+   *     d_temp_storage, temp_storage_bytes,
+   *     d_keys_in, d_keys_out, d_values_in, d_values_out,
+   *     num_items, num_segments, d_offsets, d_offsets + 1);
+   *
+   * // Allocate temporary storage
+   * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+   *
+   * // Run sorting operation
+   * cub::DeviceSegmentedRadixSort::SortPairs(
+   *     d_temp_storage, temp_storage_bytes,
+   *     d_keys_in, d_keys_out, d_values_in, d_values_out,
+   *     num_items, num_segments, d_offsets, d_offsets + 1);
+   *
+   * // d_keys_out            <-- [6, 7, 8, 0, 3, 5, 9]
+   * // d_values_out          <-- [1, 2, 0, 5, 4, 3, 6]
+   * @endcode
+   *
+   * @tparam KeyT                  
+   *   **[inferred]** Key type
+   *
+   * @tparam ValueT                
+   *   **[inferred]** Value type
+   *
+   * @tparam BeginOffsetIteratorT  
+   *   **[inferred]** Random-access input iterator type for reading segment 
+   *   beginning offsets \iterator
+   *
+   * @tparam EndOffsetIteratorT    
+   *   **[inferred]** Random-access input iterator type for reading segment 
+   *   ending offsets \iterator
+   *
+   * @param[in] d_temp_storage 
+   *   Device-accessible allocation of temporary storage. When `nullptr`, the 
+   *   required allocation size is written to `temp_storage_bytes` and no work 
+   *   is done.
+   *
+   * @param[in,out] temp_storage_bytes 
+   *   Reference to size in bytes of `d_temp_storage` allocation
+   *
+   * @param[in] d_keys_in 
+   *   Device-accessible pointer to the input sequence of key data to sort
+   *
+   * @param[out] d_keys_out 
+   *   Device-accessible pointer to the sorted output sequence of key data
+   *
+   * @param[in] d_values_in 
+   *   Device-accessible pointer to the corresponding input sequence of 
+   *   associated value items
+   *
+   * @param[out] d_values_out 
+   *   Device-accessible pointer to the correspondingly-reordered output 
+   *   sequence of associated value items
+   *
+   * @param[in] num_items 
+   *   The total number of items to sort (across all segments)
+   *
+   * @param[in] num_segments 
+   *   The number of segments that comprise the sorting data
+   *
+   * @param[in] d_begin_offsets 
+   *   Random-access input iterator to the sequence of beginning offsets of 
+   *   length `num_segments`, such that `d_begin_offsets[i]` is the first 
+   *   element of the *i*<sup>th</sup> data segment in `d_keys_*` and 
+   *   `d_values_*`
+   *
+   * @param[in] d_end_offsets 
+   *   Random-access input iterator to the sequence of ending offsets of length 
+   *   `num_segments`, such that `d_end_offsets[i] - 1` is the last element of 
+   *   the *i*<sup>th</sup> data segment in `d_keys_*` and `d_values_*`. If 
+   *   `d_end_offsets[i] - 1 < d_begin_offsets[i]`, the *i*<sup>th</sup> segment
+   *   is considered empty.
+   *
+   * @param[in] begin_bit 
+   *   **[optional]** The least-significant bit index (inclusive) needed for 
+   *   key comparison
+   *
+   * @param[in] end_bit 
+   *   **[optional]** The most-significant bit index (exclusive) needed for key 
+   *   comparison (e.g., `sizeof(unsigned int) * 8`)
+   *
+   * @param[in] stream 
+   *   **[optional]** CUDA stream to launch kernels within.
+   *   Default is stream<sub>0</sub>.
+   *
+   * @param[in] debug_synchronous 
+   *   **[optional]** Whether or not to synchronize the stream after every 
+   *   kernel launch to check for errors. Also causes launch configurations to 
+   *   be printed to the console. Default is `false`.
+   */
+  template <typename KeyT,
+            typename ValueT,
+            typename BeginOffsetIteratorT,
+            typename EndOffsetIteratorT>
+  CUB_RUNTIME_FUNCTION static cudaError_t
+  SortPairs(void *d_temp_storage,
+            size_t &temp_storage_bytes,
+            const KeyT *d_keys_in,
+            KeyT *d_keys_out,
+            const ValueT *d_values_in,
+            ValueT *d_values_out,
+            int num_items,
+            int num_segments,
+            BeginOffsetIteratorT d_begin_offsets,
+            EndOffsetIteratorT d_end_offsets,
+            int begin_bit          = 0,
+            int end_bit            = sizeof(KeyT) * 8,
+            cudaStream_t stream    = 0,
+            bool debug_synchronous = false)
+  {
+    // Global offsets use signed int (same type as num_items and num_segments)
+    using OffsetT = int;
+
+    DoubleBuffer<KeyT> d_keys(const_cast<KeyT *>(d_keys_in), d_keys_out); // const_cast needed to wrap the const input in DoubleBuffer<KeyT>
+    DoubleBuffer<ValueT> d_values(const_cast<ValueT *>(d_values_in),
+                                  d_values_out);
+
+    return DispatchSegmentedRadixSort<false,
+                                      KeyT,
+                                      ValueT,
+                                      BeginOffsetIteratorT,
+                                      EndOffsetIteratorT,
+                                      OffsetT>::Dispatch(d_temp_storage,
+                                                         temp_storage_bytes,
+                                                         d_keys,
+                                                         d_values,
+                                                         num_items,
+                                                         num_segments,
+                                                         d_begin_offsets,
+                                                         d_end_offsets,
+                                                         begin_bit,
+                                                         end_bit,
+                                                         false, // NOTE(review): presumably the "inputs may be overwritten" flag (off here); confirm against Dispatch
+                                                         stream,
+                                                         debug_synchronous);
+  }
+
+  /**
+   * @brief Sorts segments of key-value pairs into ascending order. 
+   *        (`~N` auxiliary storage required)
+   *
+   * @par
+   * - The sorting operation is given a pair of key buffers and a corresponding
+   *   pair of associated value buffers.  Each pair is managed by a DoubleBuffer
+   *   structure that indicates which of the two buffers is "current" (and thus
+   *   contains the input data to be sorted).
+   * - The contents of both buffers within each pair may be altered by the 
+   *   sorting operation.
+   * - Upon completion, the sorting operation will update the "current" 
+   *   indicator within each DoubleBuffer wrapper to reference which of the two 
+   *   buffers now contains the sorted output sequence (a function of the number 
+   *   of key bits specified and the targeted device architecture).
+   * - When the input is a contiguous sequence of segments, a single sequence
+   *   `segment_offsets` (of length `num_segments + 1`) can be aliased for both 
+   *   the `d_begin_offsets` and `d_end_offsets` parameters (where the latter is 
+   *   specified as `segment_offsets + 1`).
+   * - An optional bit subrange `[begin_bit, end_bit)` of differentiating key 
+   *   bits can be specified. This can reduce overall sorting overhead and yield 
+   *   a corresponding performance improvement.
+   * - @devicestorageP
+   * - @devicestorage
+   *
+   * @par Snippet
+   * The code snippet below illustrates the batched sorting of three segments 
+   * (with one zero-length segment) of `int` keys with associated vector of 
+   * `int` values.
+   * @par
+   * @code
+   * #include <cub/cub.cuh>   
+   * // or equivalently <cub/device/device_segmented_radix_sort.cuh>
+   *
+   * // Declare, allocate, and initialize device-accessible pointers 
+   * // for sorting data
+   * int  num_items;          // e.g., 7
+   * int  num_segments;       // e.g., 3
+   * int  *d_offsets;         // e.g., [0, 3, 3, 7]
+   * int  *d_key_buf;         // e.g., [8, 6, 7, 5, 3, 0, 9]
+   * int  *d_key_alt_buf;     // e.g., [-, -, -, -, -, -, -]
+   * int  *d_value_buf;       // e.g., [0, 1, 2, 3, 4, 5, 6]
+   * int  *d_value_alt_buf;   // e.g., [-, -, -, -, -, -, -]
+   * ...
+   *
+   * // Create a set of DoubleBuffers to wrap pairs of device pointers
+   * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
+   * cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf);
+   *
+   * // Determine temporary device storage requirements
+   * void     *d_temp_storage = NULL;
+   * size_t   temp_storage_bytes = 0;
+   * cub::DeviceSegmentedRadixSort::SortPairs(
+   *     d_temp_storage, temp_storage_bytes, d_keys, d_values,
+   *     num_items, num_segments, d_offsets, d_offsets + 1);
+   *
+   * // Allocate temporary storage
+   * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+   *
+   * // Run sorting operation
+   * cub::DeviceSegmentedRadixSort::SortPairs(
+   *     d_temp_storage, temp_storage_bytes, d_keys, d_values,
+   *     num_items, num_segments, d_offsets, d_offsets + 1);
+   *
+   * // d_keys.Current()      <-- [6, 7, 8, 0, 3, 5, 9]
+   * // d_values.Current()    <-- [1, 2, 0, 5, 4, 3, 6]
+   *
+   * @endcode
+   *
+   * @tparam KeyT             
+   *   **[inferred]** Key type
+   *
+   * @tparam ValueT           
+   *   **[inferred]** Value type
+   *
+   * @tparam BeginOffsetIteratorT  
+   *   **[inferred]** Random-access input iterator type for reading segment 
+   *   beginning offsets \iterator
+   *
+   * @tparam EndOffsetIteratorT    
+   *   **[inferred]** Random-access input iterator type for reading segment 
+   *   ending offsets \iterator
+   *
+   * @param[in] d_temp_storage 
+   *   Device-accessible allocation of temporary storage. When `nullptr`, the 
+   *   required allocation size is written to `temp_storage_bytes` and no work 
+   *   is done.
+   *
+   * @param[in,out] temp_storage_bytes 
+   *   Reference to size in bytes of `d_temp_storage` allocation
+   *
+   * @param[in,out] d_keys 
+   *   Reference to the double-buffer of keys whose "current" device-accessible 
+   *   buffer contains the unsorted input keys and, upon return, is updated to 
+   *   point to the sorted output keys
+   *
+   * @param[in,out] d_values 
+   *   Double-buffer of values whose "current" device-accessible buffer 
+   *   contains the unsorted input values and, upon return, is updated to point 
+   *   to the sorted output values
+   *
+   * @param[in] num_items 
+   *   The total number of items to sort (across all segments)
+   *
+   * @param[in] num_segments 
+   *   The number of segments that comprise the sorting data
+   *
+   * @param[in] d_begin_offsets 
+   *   Random-access input iterator to the sequence of beginning offsets of 
+   *   length `num_segments`, such that `d_begin_offsets[i]` is the first 
+   *   element of the *i*<sup>th</sup> data segment in `d_keys` and
+   *   `d_values`
+   *
+   * @param[in] d_end_offsets 
+   *   Random-access input iterator to the sequence of ending offsets of length 
+   *   `num_segments`, such that `d_end_offsets[i] - 1` is the last element of 
+   *   the *i*<sup>th</sup> data segment in `d_keys` and `d_values`. If
+   *   `d_end_offsets[i] - 1 < d_begin_offsets[i]`, the *i*<sup>th</sup> segment
+   *   is considered empty.
+   *
+   * @param[in] begin_bit 
+   *   **[optional]** The least-significant bit index (inclusive) needed for 
+   *   key comparison
+   *
+   * @param[in] end_bit 
+   *   **[optional]** The most-significant bit index (exclusive) needed for key 
+   *   comparison (e.g., `sizeof(unsigned int) * 8`)
+   *
+   * @param[in] stream 
+   *   **[optional]** CUDA stream to launch kernels within.  
+   *   Default is stream<sub>0</sub>.
+   *
+   * @param[in] debug_synchronous 
+   *   **[optional]** Whether or not to synchronize the stream after every 
+   *   kernel launch to check for errors. Also causes launch configurations to 
+   *   be printed to the console. Default is `false`.
+   */
+  template <typename KeyT,
+            typename ValueT,
+            typename BeginOffsetIteratorT,
+            typename EndOffsetIteratorT>
+  CUB_RUNTIME_FUNCTION static cudaError_t
+  SortPairs(void *d_temp_storage,
+            size_t &temp_storage_bytes,
+            DoubleBuffer<KeyT> &d_keys,
+            DoubleBuffer<ValueT> &d_values,
+            int num_items,
+            int num_segments,
+            BeginOffsetIteratorT d_begin_offsets,
+            EndOffsetIteratorT d_end_offsets,
+            int begin_bit          = 0,
+            int end_bit            = sizeof(KeyT) * 8,
+            cudaStream_t stream    = 0,
+            bool debug_synchronous = false)
+  {
+    // Global offsets use signed int (same type as num_items and num_segments)
+    using OffsetT = int;
+
+    return DispatchSegmentedRadixSort<false,
+                                      KeyT,
+                                      ValueT,
+                                      BeginOffsetIteratorT,
+                                      EndOffsetIteratorT,
+                                      OffsetT>::Dispatch(d_temp_storage,
+                                                         temp_storage_bytes,
+                                                         d_keys,
+                                                         d_values,
+                                                         num_items,
+                                                         num_segments,
+                                                         d_begin_offsets,
+                                                         d_end_offsets,
+                                                         begin_bit,
+                                                         end_bit,
+                                                         true, // NOTE(review): presumably the "inputs may be overwritten" flag (on here); confirm against Dispatch
+                                                         stream,
+                                                         debug_synchronous);
+  }
+
+  /**
+   * @brief Sorts segments of key-value pairs into descending order. 
+   *        (`~2N` auxiliary storage required).
+   *
+   * @par
+   * - The contents of the input data are not altered by the sorting operation
+   * - When the input is a contiguous sequence of segments, a single sequence
+   *   `segment_offsets` (of length `num_segments + 1`) can be aliased for both 
+   *   the `d_begin_offsets` and `d_end_offsets` parameters (where the latter is 
+   *   specified as `segment_offsets + 1`).
+   * - An optional bit subrange `[begin_bit, end_bit)` of differentiating key 
+   *   bits can be specified. This can reduce overall sorting overhead and 
+   *   yield a corresponding performance improvement.
+   * - @devicestorageNP For sorting using only `O(P)` temporary storage, see 
+   *   the sorting interface using DoubleBuffer wrappers below.
+   * - @devicestorage
+   *
+   * @par Snippet
+   * The code snippet below illustrates the batched sorting of three segments 
+   * (with one zero-length segment) of `int` keys with associated vector of 
+   * `int` values.
+   * @par
+   * @code
+   * #include <cub/cub.cuh>   
+   * // or equivalently <cub/device/device_segmented_radix_sort.cuh>
+   *
+   * // Declare, allocate, and initialize device-accessible pointers 
+   * // for sorting data
+   * int  num_items;          // e.g., 7
+   * int  num_segments;       // e.g., 3
+   * int  *d_offsets;         // e.g., [0, 3, 3, 7]
+   * int  *d_keys_in;         // e.g., [8, 6, 7, 5, 3, 0, 9]
+   * int  *d_keys_out;        // e.g., [-, -, -, -, -, -, -]
+   * int  *d_values_in;       // e.g., [0, 1, 2, 3, 4, 5, 6]
+   * int  *d_values_out;      // e.g., [-, -, -, -, -, -, -]
+   * ...
+   *
+   * // Determine temporary device storage requirements
+   * void     *d_temp_storage = NULL;
+   * size_t   temp_storage_bytes = 0;
+   * cub::DeviceSegmentedRadixSort::SortPairsDescending(
+   *     d_temp_storage, temp_storage_bytes,
+   *     d_keys_in, d_keys_out, d_values_in, d_values_out,
+   *     num_items, num_segments, d_offsets, d_offsets + 1);
+   *
+   * // Allocate temporary storage
+   * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+   *
+   * // Run sorting operation
+   * cub::DeviceSegmentedRadixSort::SortPairsDescending(
+   *     d_temp_storage, temp_storage_bytes,
+   *     d_keys_in, d_keys_out, d_values_in, d_values_out,
+   *     num_items, num_segments, d_offsets, d_offsets + 1);
+   *
+   * // d_keys_out            <-- [8, 7, 6, 9, 5, 3, 0]
+   * // d_values_out          <-- [0, 2, 1, 6, 3, 4, 5]
+   * @endcode
+   *
+   * @tparam KeyT             
+   *   **[inferred]** Key type
+   *
+   * @tparam ValueT           
+   *   **[inferred]** Value type
+   *
+   * @tparam BeginOffsetIteratorT  
+   *   **[inferred]** Random-access input iterator type for reading segment 
+   *   beginning offsets \iterator
+   *
+   * @tparam EndOffsetIteratorT    
+   *   **[inferred]** Random-access input iterator type for reading segment 
+   *   ending offsets \iterator
+   *
+   * @param[in] d_temp_storage 
+   *   Device-accessible allocation of temporary storage. When `nullptr`, the 
+   *   required allocation size is written to `temp_storage_bytes` and no work 
+   *   is done.
+   *
+   * @param[in,out] temp_storage_bytes 
+   *   Reference to size in bytes of `d_temp_storage` allocation
+   *
+   * @param[in] d_keys_in 
+   *   Device-accessible pointer to the input sequence of key data to sort
+   *
+   * @param[out] d_keys_out 
+   *   Device-accessible pointer to the sorted output sequence of key data
+   *
+   * @param[in] d_values_in 
+   *   Device-accessible pointer to the corresponding input sequence of 
+   *   associated value items
+   *
+   * @param[out] d_values_out 
+   *   Device-accessible pointer to the correspondingly-reordered output 
+   *   sequence of associated value items
+   *
+   * @param[in] num_items 
+   *   The total number of items to sort (across all segments)
+   *
+   * @param[in] num_segments 
+   *   The number of segments that comprise the sorting data
+   *
+   * @param[in] d_begin_offsets 
+   *   Random-access input iterator to the sequence of beginning offsets of 
+   *   length `num_segments`, such that `d_begin_offsets[i]` is the first 
+   *   element of the *i*<sup>th</sup> data segment in `d_keys_*` and 
+   *   `d_values_*`
+   *
+   * @param[in] d_end_offsets 
+   *   Random-access input iterator to the sequence of ending offsets of length 
+   *   `num_segments`, such that `d_end_offsets[i] - 1` is the last element of 
+   *   the *i*<sup>th</sup> data segment in `d_keys_*` and `d_values_*`. 
+   *   If `d_end_offsets[i] - 1 < d_begin_offsets[i]`, the *i*<sup>th</sup>
+   *   segment is considered empty.
+   *
+   * @param[in] begin_bit 
+   *   **[optional]** The least-significant bit index (inclusive) needed for 
+   *   key comparison
+   *
+   * @param[in] end_bit 
+   *   **[optional]** The most-significant bit index (exclusive) needed for key 
+   *   comparison (e.g., `sizeof(unsigned int) * 8`)
+   *
+   * @param[in] stream 
+   *   **[optional]** CUDA stream to launch kernels within.  
+   *   Default is stream<sub>0</sub>.
+   *
+   * @param[in] debug_synchronous 
+   *   **[optional]** Whether or not to synchronize the stream after every 
+   *   kernel launch to check for errors. Also causes launch configurations to 
+   *   be printed to the console. Default is `false`.
+   */
+  template <typename KeyT,
+            typename ValueT,
+            typename BeginOffsetIteratorT,
+            typename EndOffsetIteratorT>
+  CUB_RUNTIME_FUNCTION static cudaError_t
+  SortPairsDescending(void *d_temp_storage,
+                      size_t &temp_storage_bytes,
+                      const KeyT *d_keys_in,
+                      KeyT *d_keys_out,
+                      const ValueT *d_values_in,
+                      ValueT *d_values_out,
+                      int num_items,
+                      int num_segments,
+                      BeginOffsetIteratorT d_begin_offsets,
+                      EndOffsetIteratorT d_end_offsets,
+                      int begin_bit          = 0,
+                      int end_bit            = sizeof(KeyT) * 8,
+                      cudaStream_t stream    = 0,
+                      bool debug_synchronous = false)
+  {
+    // Global offsets are carried in a plain signed `int`
+    using OffsetT = int;
+
+    // Dispatcher for this key/value/offset-iterator combination. The leading
+    // `true` is what distinguishes the *Descending entry points from their
+    // ascending counterparts.
+    using DispatchT = DispatchSegmentedRadixSort<true,
+                                                 KeyT,
+                                                 ValueT,
+                                                 BeginOffsetIteratorT,
+                                                 EndOffsetIteratorT,
+                                                 OffsetT>;
+
+    // The dispatcher consumes DoubleBuffers, so pair each input pointer with
+    // its output pointer. The const_cast only adapts the const input pointer
+    // to the wrapper's element-pointer type.
+    DoubleBuffer<KeyT> keys(const_cast<KeyT *>(d_keys_in), d_keys_out);
+    DoubleBuffer<ValueT> values(const_cast<ValueT *>(d_values_in),
+                                d_values_out);
+
+    // This overload documents its input as unaltered, so overwriting is not
+    // permitted (the DoubleBuffer overloads pass `true` here instead).
+    constexpr bool is_overwrite_okay = false;
+
+    return DispatchT::Dispatch(d_temp_storage,
+                               temp_storage_bytes,
+                               keys,
+                               values,
+                               num_items,
+                               num_segments,
+                               d_begin_offsets,
+                               d_end_offsets,
+                               begin_bit,
+                               end_bit,
+                               is_overwrite_okay,
+                               stream,
+                               debug_synchronous);
+  }
+
+  /**
+   * @brief Sorts segments of key-value pairs into descending order. 
+   *        (`~N` auxiliary storage required).
+   *
+   * @par
+   * - The sorting operation is given a pair of key buffers and a corresponding
+   *   pair of associated value buffers.  Each pair is managed by a DoubleBuffer
+   *   structure that indicates which of the two buffers is "current" (and thus
+   *   contains the input data to be sorted).
+   * - The contents of both buffers within each pair may be altered by the 
+   *   sorting operation.
+   * - Upon completion, the sorting operation will update the "current" 
+   *   indicator within each DoubleBuffer wrapper to reference which of the two 
+   *   buffers now contains the sorted output sequence (a function of the number 
+   *   of key bits specified and the targeted device architecture).
+   * - When the input is a contiguous sequence of segments, a single sequence
+   *   `segment_offsets` (of length `num_segments + 1`) can be aliased for both 
+   *   the `d_begin_offsets` and `d_end_offsets` parameters (where the latter is 
+   *   specified as `segment_offsets + 1`).
+   * - An optional bit subrange `[begin_bit, end_bit)` of differentiating key 
+   *   bits can be specified. This can reduce overall sorting overhead and 
+   *   yield a corresponding performance improvement.
+   * - @devicestorageP
+   * - @devicestorage
+   *
+   * @par Snippet
+   * The code snippet below illustrates the batched sorting of three segments 
+   * (with one zero-length segment) of `int` keys with associated vector of 
+   * `int` values.
+   * @par
+   * @code
+   * #include <cub/cub.cuh>   
+   * // or equivalently <cub/device/device_segmented_radix_sort.cuh>
+   *
+   * // Declare, allocate, and initialize device-accessible pointers 
+   * // for sorting data
+   * int  num_items;          // e.g., 7
+   * int  num_segments;       // e.g., 3
+   * int  *d_offsets;         // e.g., [0, 3, 3, 7]
+   * int  *d_key_buf;         // e.g., [8, 6, 7, 5, 3, 0, 9]
+   * int  *d_key_alt_buf;     // e.g., [-, -, -, -, -, -, -]
+   * int  *d_value_buf;       // e.g., [0, 1, 2, 3, 4, 5, 6]
+   * int  *d_value_alt_buf;   // e.g., [-, -, -, -, -, -, -]
+   * ...
+   *
+   * // Create a set of DoubleBuffers to wrap pairs of device pointers
+   * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
+   * cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf);
+   *
+   * // Determine temporary device storage requirements
+   * void     *d_temp_storage = NULL;
+   * size_t   temp_storage_bytes = 0;
+   * cub::DeviceSegmentedRadixSort::SortPairsDescending(
+   *     d_temp_storage, temp_storage_bytes, d_keys, d_values,
+   *     num_items, num_segments, d_offsets, d_offsets + 1);
+   *
+   * // Allocate temporary storage
+   * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+   *
+   * // Run sorting operation
+   * cub::DeviceSegmentedRadixSort::SortPairsDescending(
+   *     d_temp_storage, temp_storage_bytes, d_keys, d_values,
+   *     num_items, num_segments, d_offsets, d_offsets + 1);
+   *
+   * // d_keys.Current()      <-- [8, 7, 6, 9, 5, 3, 0]
+   * // d_values.Current()    <-- [0, 2, 1, 6, 3, 4, 5]
+   * @endcode
+   *
+   * @tparam KeyT             
+   *   **[inferred]** Key type
+   *
+   * @tparam ValueT           
+   *   **[inferred]** Value type
+   *
+   * @tparam BeginOffsetIteratorT  
+   *   **[inferred]** Random-access input iterator type for reading segment 
+   *   beginning offsets \iterator
+   *
+   * @tparam EndOffsetIteratorT    
+   *   **[inferred]** Random-access input iterator type for reading segment 
+   *   ending offsets \iterator
+   *
+   * @param[in] d_temp_storage 
+   *   Device-accessible allocation of temporary storage. When `nullptr`, the 
+   *   required allocation size is written to `temp_storage_bytes` and no work 
+   *   is done.
+   *
+   * @param[in,out] temp_storage_bytes 
+   *   Reference to size in bytes of `d_temp_storage` allocation
+   *
+   * @param[in,out] d_keys 
+   *   Reference to the double-buffer of keys whose "current" device-accessible 
+   *   buffer contains the unsorted input keys and, upon return, is updated to 
+   *   point to the sorted output keys
+   *
+   * @param[in,out] d_values 
+   *   Double-buffer of values whose "current" device-accessible buffer 
+   *   contains the unsorted input values and, upon return, is updated to point 
+   *   to the sorted output values
+   *
+   * @param[in] num_items 
+   *   The total number of items to sort (across all segments)
+   *
+   * @param[in] num_segments 
+   *   The number of segments that comprise the sorting data
+   *
+   * @param[in] d_begin_offsets 
+   *   Random-access input iterator to the sequence of beginning offsets of 
+   *   length `num_segments`, such that `d_begin_offsets[i]` is the first 
+   *   element of the *i*<sup>th</sup> data segment in `d_keys_*` and 
+   *   `d_values_*`
+   *
+   * @param[in] d_end_offsets 
+   *   Random-access input iterator to the sequence of ending offsets of length 
+   *   `num_segments`, such that `d_end_offsets[i] - 1` is the last element of 
+   *   the *i*<sup>th</sup> data segment in `d_keys_*` and `d_values_*`.  
+   *   If `d_end_offsets[i] - 1 < d_begin_offsets[i]`, the *i*<sup>th</sup>
+   *   segment is considered empty.
+   *
+   * @param[in] begin_bit 
+   *   **[optional]** The least-significant bit index (inclusive) needed for 
+   *   key comparison
+   *
+   * @param[in] end_bit 
+   *   **[optional]** The most-significant bit index (exclusive) needed for key 
+   *   comparison (e.g., `sizeof(unsigned int) * 8`)
+   *
+   * @param[in] stream 
+   *   **[optional]** CUDA stream to launch kernels within. 
+   *   Default is stream<sub>0</sub>.
+   *
+   * @param[in] debug_synchronous 
+   *   **[optional]** Whether or not to synchronize the stream after every 
+   *   kernel launch to check for errors. Also causes launch configurations to 
+   *   be printed to the console. Default is `false`.
+   */
+  template <typename KeyT,
+            typename ValueT,
+            typename BeginOffsetIteratorT,
+            typename EndOffsetIteratorT>
+  CUB_RUNTIME_FUNCTION static cudaError_t
+  SortPairsDescending(void *d_temp_storage,
+                      size_t &temp_storage_bytes,
+                      DoubleBuffer<KeyT> &d_keys,
+                      DoubleBuffer<ValueT> &d_values,
+                      int num_items,
+                      int num_segments,
+                      BeginOffsetIteratorT d_begin_offsets,
+                      EndOffsetIteratorT d_end_offsets,
+                      int begin_bit          = 0,
+                      int end_bit            = sizeof(KeyT) * 8,
+                      cudaStream_t stream    = 0,
+                      bool debug_synchronous = false)
+  {
+    // Global offsets are carried in a plain signed `int`
+    using OffsetT = int;
+
+    // Dispatcher for this key/value/offset-iterator combination. The leading
+    // `true` is what distinguishes the *Descending entry points from their
+    // ascending counterparts.
+    using DispatchT = DispatchSegmentedRadixSort<true,
+                                                 KeyT,
+                                                 ValueT,
+                                                 BeginOffsetIteratorT,
+                                                 EndOffsetIteratorT,
+                                                 OffsetT>;
+
+    // Both buffers of each DoubleBuffer may be altered by the sort (see the
+    // function documentation), so overwriting is permitted.
+    constexpr bool is_overwrite_okay = true;
+
+    // Forward everything unchanged; the caller's DoubleBuffers are handed
+    // over by reference.
+    return DispatchT::Dispatch(d_temp_storage,
+                               temp_storage_bytes,
+                               d_keys,
+                               d_values,
+                               num_items,
+                               num_segments,
+                               d_begin_offsets,
+                               d_end_offsets,
+                               begin_bit,
+                               end_bit,
+                               is_overwrite_okay,
+                               stream,
+                               debug_synchronous);
+  }
+
+  //@}  end member group
+  /******************************************************************//**
+   * @name Keys-only
+   *********************************************************************/
+  //@{
+
+
+  /**
+   * @brief Sorts segments of keys into ascending order. 
+   *        (`~2N` auxiliary storage required)
+   *
+   * @par
+   * - The contents of the input data are not altered by the sorting operation
+   * - An optional bit subrange `[begin_bit, end_bit)` of differentiating key 
+   *   bits can be specified. This can reduce overall sorting overhead and 
+   *   yield a corresponding performance improvement.
+   * - When the input is a contiguous sequence of segments, a single sequence
+   *   `segment_offsets` (of length `num_segments + 1`) can be aliased for both 
+   *   the `d_begin_offsets` and `d_end_offsets` parameters (where the latter 
+   *   is specified as `segment_offsets + 1`).
+   * - @devicestorageNP For sorting using only `O(P)` temporary storage, see 
+   *   the sorting interface using DoubleBuffer wrappers below.
+   * - @devicestorage
+   *
+   * @par Snippet
+   * The code snippet below illustrates the batched sorting of three segments 
+   * (with one zero-length segment) of `int` keys.
+   * @par
+   * @code
+   * #include <cub/cub.cuh>   
+   * // or equivalently <cub/device/device_segmented_radix_sort.cuh>
+   *
+   * // Declare, allocate, and initialize device-accessible pointers 
+   * // for sorting data
+   * int  num_items;          // e.g., 7
+   * int  num_segments;       // e.g., 3
+   * int  *d_offsets;         // e.g., [0, 3, 3, 7]
+   * int  *d_keys_in;         // e.g., [8, 6, 7, 5, 3, 0, 9]
+   * int  *d_keys_out;        // e.g., [-, -, -, -, -, -, -]
+   * ...
+   *
+   * // Determine temporary device storage requirements
+   * void     *d_temp_storage = NULL;
+   * size_t   temp_storage_bytes = 0;
+   * cub::DeviceSegmentedRadixSort::SortKeys( 
+   *     d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
+   *     num_items, num_segments, d_offsets, d_offsets + 1);
+   *
+   * // Allocate temporary storage
+   * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+   *
+   * // Run sorting operation
+   * cub::DeviceSegmentedRadixSort::SortKeys( 
+   *     d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
+   *     num_items, num_segments, d_offsets, d_offsets + 1);
+   *
+   * // d_keys_out            <-- [6, 7, 8, 0, 3, 5, 9]
+   *
+   * @endcode
+   *
+   * @tparam KeyT             
+   *   **[inferred]** Key type
+   *
+   * @tparam BeginOffsetIteratorT  
+   *   **[inferred]** Random-access input iterator type for reading segment 
+   *   beginning offsets \iterator
+   *
+   * @tparam EndOffsetIteratorT    
+   *   **[inferred]** Random-access input iterator type for reading segment 
+   *   ending offsets \iterator
+   *
+   * @param[in] d_temp_storage 
+   *   Device-accessible allocation of temporary storage. When `nullptr`, the
+   *   required allocation size is written to `temp_storage_bytes` and no work
+   *   is done.
+   *
+   * @param[in,out] temp_storage_bytes 
+   *   Reference to size in bytes of `d_temp_storage` allocation
+   *
+   * @param[in] d_keys_in  
+   *   Device-accessible pointer to the input sequence of key data to sort
+   *
+   * @param[out] d_keys_out  
+   *   Device-accessible pointer to the sorted output sequence of key data
+   *
+   * @param[in] num_items  
+   *   The total number of items to sort (across all segments)
+   *
+   * @param[in] num_segments  
+   *   The number of segments that comprise the sorting data
+   *
+   * @param[in] d_begin_offsets  
+   *   Random-access input iterator to the sequence of beginning offsets of
+   *   length `num_segments`, such that `d_begin_offsets[i]` is the first
+   *   element of the *i*<sup>th</sup> data segment in `d_keys_*`
+   *
+   * @param[in] d_end_offsets  
+   *   Random-access input iterator to the sequence of ending offsets of length
+   *   `num_segments`, such that `d_end_offsets[i] - 1` is the last element of
+   *   the *i*<sup>th</sup> data segment in `d_keys_*`.
+   *   If `d_end_offsets[i] - 1 < d_begin_offsets[i]`, the *i*<sup>th</sup>
+   *   segment is considered empty.
+   *
+   * @param[in] begin_bit  
+   *   **[optional]** The least-significant bit index (inclusive) needed for 
+   *   key comparison
+   *
+   * @param[in] end_bit  
+   *   **[optional]** The most-significant bit index (exclusive) needed for key 
+   *   comparison (e.g., `sizeof(unsigned int) * 8`)
+   *
+   * @param[in] stream  
+   *   **[optional]** CUDA stream to launch kernels within.  
+   *   Default is stream<sub>0</sub>.
+   *
+   * @param[in] debug_synchronous  
+   *   **[optional]** Whether or not to synchronize the stream after every 
+   *   kernel launch to check for errors. Also causes launch configurations to 
+   *   be printed to the console. Default is `false`.
+   */
+  template <typename KeyT,
+            typename BeginOffsetIteratorT,
+            typename EndOffsetIteratorT>
+  CUB_RUNTIME_FUNCTION static cudaError_t
+  SortKeys(void *d_temp_storage,
+           size_t &temp_storage_bytes,
+           const KeyT *d_keys_in,
+           KeyT *d_keys_out,
+           int num_items,
+           int num_segments,
+           BeginOffsetIteratorT d_begin_offsets,
+           EndOffsetIteratorT d_end_offsets,
+           int begin_bit          = 0,
+           int end_bit            = sizeof(KeyT) * 8,
+           cudaStream_t stream    = 0,
+           bool debug_synchronous = false)
+  {
+    // Global offsets are carried in a plain signed `int`
+    using OffsetT = int;
+
+    // Keys-only dispatcher: NullType stands in for the value type. The
+    // leading `false` is the flag that the *Descending entry points set to
+    // `true`.
+    using DispatchT = DispatchSegmentedRadixSort<false,
+                                                 KeyT,
+                                                 NullType,
+                                                 BeginOffsetIteratorT,
+                                                 EndOffsetIteratorT,
+                                                 OffsetT>;
+
+    // The dispatcher consumes DoubleBuffers, so pair the input pointer with
+    // the output pointer. The const_cast only adapts the const input pointer
+    // to the wrapper's element-pointer type.
+    DoubleBuffer<KeyT> keys(const_cast<KeyT *>(d_keys_in), d_keys_out);
+
+    // No values accompany the keys: pass a default-constructed NullType buffer
+    DoubleBuffer<NullType> no_values;
+
+    // This overload documents its input as unaltered, so overwriting is not
+    // permitted (the DoubleBuffer overloads pass `true` here instead).
+    constexpr bool is_overwrite_okay = false;
+
+    return DispatchT::Dispatch(d_temp_storage,
+                               temp_storage_bytes,
+                               keys,
+                               no_values,
+                               num_items,
+                               num_segments,
+                               d_begin_offsets,
+                               d_end_offsets,
+                               begin_bit,
+                               end_bit,
+                               is_overwrite_okay,
+                               stream,
+                               debug_synchronous);
+  }
+
+  /**
+   * @brief Sorts segments of keys into ascending order.
+   *        (`~N` auxiliary storage required).
+   *
+   * @par
+   * - The sorting operation is given a pair of key buffers managed by a 
+   *   DoubleBuffer structure that indicates which of the two buffers is
+   *   "current" (and thus contains the input data to be sorted).
+   * - The contents of both buffers may be altered by the sorting operation.
+   * - Upon completion, the sorting operation will update the "current" 
+   *   indicator within the DoubleBuffer wrapper to reference which of the two 
+   *   buffers now contains the sorted output sequence (a function of the 
+   *   number of key bits specified and the targeted device architecture).
+   * - When the input is a contiguous sequence of segments, a single sequence
+   *   `segment_offsets` (of length `num_segments + 1`) can be aliased for both 
+   *   the `d_begin_offsets` and `d_end_offsets` parameters (where the latter 
+   *   is specified as `segment_offsets + 1`).
+   * - An optional bit subrange `[begin_bit, end_bit)` of differentiating key 
+   *   bits can be specified. This can reduce overall sorting overhead and 
+   *   yield a corresponding performance improvement.
+   * - @devicestorageP
+   * - @devicestorage
+   *
+   * @par Snippet
+   * The code snippet below illustrates the batched sorting of three segments 
+   * (with one zero-length segment) of `int` keys.
+   * @par
+   * @code
+   * #include <cub/cub.cuh>   
+   * // or equivalently <cub/device/device_segmented_radix_sort.cuh>
+   *
+   * // Declare, allocate, and initialize device-accessible pointers for 
+   * // sorting data
+   * int  num_items;          // e.g., 7
+   * int  num_segments;       // e.g., 3
+   * int  *d_offsets;         // e.g., [0, 3, 3, 7]
+   * int  *d_key_buf;         // e.g., [8, 6, 7, 5, 3, 0, 9]
+   * int  *d_key_alt_buf;     // e.g., [-, -, -, -, -, -, -]
+   * ...
+   *
+   * // Create a DoubleBuffer to wrap the pair of device pointers
+   * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
+   *
+   * // Determine temporary device storage requirements
+   * void     *d_temp_storage = NULL;
+   * size_t   temp_storage_bytes = 0;
+   * cub::DeviceSegmentedRadixSort::SortKeys(
+   *     d_temp_storage, temp_storage_bytes, d_keys,
+   *     num_items, num_segments, d_offsets, d_offsets + 1);
+   *
+   * // Allocate temporary storage
+   * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+   *
+   * // Run sorting operation
+   * cub::DeviceSegmentedRadixSort::SortKeys(
+   *     d_temp_storage, temp_storage_bytes, d_keys,
+   *     num_items, num_segments, d_offsets, d_offsets + 1);
+   *
+   * // d_keys.Current()      <-- [6, 7, 8, 0, 3, 5, 9]
+   *
+   * @endcode
+   *
+   * @tparam KeyT             
+   *   **[inferred]** Key type
+   *
+   * @tparam BeginOffsetIteratorT  
+   *   **[inferred]** Random-access input iterator type for reading segment 
+   *   beginning offsets \iterator
+   *
+   * @tparam EndOffsetIteratorT    
+   *   **[inferred]** Random-access input iterator type for reading segment 
+   *   ending offsets \iterator
+   *
+   * @param[in] d_temp_storage  
+   *   Device-accessible allocation of temporary storage. When `nullptr`, the 
+   *   required allocation size is written to `temp_storage_bytes` and no work 
+   *   is done.
+   *
+   * @param[in,out] temp_storage_bytes  
+   *   Reference to size in bytes of `d_temp_storage` allocation
+   *
+   * @param[in,out] d_keys  
+   *   Reference to the double-buffer of keys whose "current" device-accessible 
+   *   buffer contains the unsorted input keys and, upon return, is updated to 
+   *   point to the sorted output keys
+   *
+   * @param[in] num_items  
+   *   The total number of items to sort (across all segments)
+   *
+   * @param[in] num_segments  
+   *   The number of segments that comprise the sorting data
+   *
+   * @param[in] d_begin_offsets  
+   *   Random-access input iterator to the sequence of beginning offsets of
+   *   length `num_segments`, such that `d_begin_offsets[i]` is the first
+   *   element of the *i*<sup>th</sup> data segment in `d_keys_*`
+   *
+   * @param[in] d_end_offsets  
+   *   Random-access input iterator to the sequence of ending offsets of length
+   *   `num_segments`, such that `d_end_offsets[i] - 1` is the last element of
+   *   the *i*<sup>th</sup> data segment in `d_keys_*`.
+   *   If `d_end_offsets[i] - 1 < d_begin_offsets[i]`, the *i*<sup>th</sup>
+   *   segment is considered empty.
+   *
+   * @param[in] begin_bit  
+   *   **[optional]** The least-significant bit index (inclusive)  
+   *   needed for key comparison
+   *
+   * @param[in] end_bit  
+   *   **[optional]** The most-significant bit index (exclusive) needed for key 
+   *   comparison (e.g., `sizeof(unsigned int) * 8`)
+   *
+   * @param[in] stream  
+   *   **[optional]** CUDA stream to launch kernels within.  
+   *   Default is stream<sub>0</sub>.
+   *
+   * @param[in] debug_synchronous  
+   *   **[optional]** Whether or not to synchronize the stream after every 
+   *   kernel launch to check for errors.  Also causes launch configurations to 
+   *   be printed to the console. Default is `false`.
+   */
+  template <typename KeyT,
+            typename BeginOffsetIteratorT,
+            typename EndOffsetIteratorT>
+  CUB_RUNTIME_FUNCTION static cudaError_t
+  SortKeys(void *d_temp_storage,
+           size_t &temp_storage_bytes,
+           DoubleBuffer<KeyT> &d_keys,
+           int num_items,
+           int num_segments,
+           BeginOffsetIteratorT d_begin_offsets,
+           EndOffsetIteratorT d_end_offsets,
+           int begin_bit          = 0,
+           int end_bit            = sizeof(KeyT) * 8,
+           cudaStream_t stream    = 0,
+           bool debug_synchronous = false)
+  {
+    // Global offsets are carried in a plain signed `int`
+    using OffsetT = int;
+
+    // Keys-only dispatcher: NullType stands in for the value type. The
+    // leading `false` is the flag that the *Descending entry points set to
+    // `true`.
+    using DispatchT = DispatchSegmentedRadixSort<false,
+                                                 KeyT,
+                                                 NullType,
+                                                 BeginOffsetIteratorT,
+                                                 EndOffsetIteratorT,
+                                                 OffsetT>;
+
+    // No values accompany the keys: pass a default-constructed NullType buffer
+    DoubleBuffer<NullType> no_values;
+
+    // Both key buffers may be altered by the sort (see the function
+    // documentation), so overwriting is permitted.
+    constexpr bool is_overwrite_okay = true;
+
+    return DispatchT::Dispatch(d_temp_storage,
+                               temp_storage_bytes,
+                               d_keys,
+                               no_values,
+                               num_items,
+                               num_segments,
+                               d_begin_offsets,
+                               d_end_offsets,
+                               begin_bit,
+                               end_bit,
+                               is_overwrite_okay,
+                               stream,
+                               debug_synchronous);
+  }
+
+  /**
+   * @brief Sorts segments of keys into descending order. 
+   * (`~2N` auxiliary storage required).
+   *
+   * @par
+   * - The contents of the input data are not altered by the sorting operation
+   * - When the input is a contiguous sequence of segments, a single sequence
+   *   `segment_offsets` (of length `num_segments + 1`) can be aliased for both 
+   *   the `d_begin_offsets` and `d_end_offsets` parameters (where the latter 
+   *   is specified as `segment_offsets + 1`).
+   * - An optional bit subrange `[begin_bit, end_bit)` of differentiating key 
+   *   bits can be specified. This can reduce overall sorting overhead and 
+   *   yield a corresponding performance improvement.
+   * - @devicestorageNP For sorting using only `O(P)` temporary storage, see 
+   *   the sorting interface using DoubleBuffer wrappers below.
+   * - @devicestorage
+   *
+   * @par Snippet
+   * The code snippet below illustrates the batched sorting of three segments 
+   * (with one zero-length segment) of `int` keys.
+   * @par
+   * @code
+   * #include <cub/cub.cuh>   
+   * // or equivalently <cub/device/device_segmented_radix_sort.cuh>
+   *
+   * // Declare, allocate, and initialize device-accessible pointers 
+   * // for sorting data
+   * int  num_items;          // e.g., 7
+   * int  num_segments;       // e.g., 3
+   * int  *d_offsets;         // e.g., [0, 3, 3, 7]
+   * int  *d_keys_in;         // e.g., [8, 6, 7, 5, 3, 0, 9]
+   * int  *d_keys_out;        // e.g., [-, -, -, -, -, -, -]
+   * ...
+   *
+   * // Determine temporary device storage requirements
+   * void     *d_temp_storage = NULL;
+   * size_t   temp_storage_bytes = 0;
+   * cub::DeviceSegmentedRadixSort::SortKeysDescending(
+   *     d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
+   *     num_items, num_segments, d_offsets, d_offsets + 1);
+   *
+   * // Allocate temporary storage
+   * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+   *
+   * // Run sorting operation
+   * cub::DeviceSegmentedRadixSort::SortKeysDescending(
+   *     d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
+   *     num_items, num_segments, d_offsets, d_offsets + 1);
+   *
+   * // d_keys_out            <-- [8, 7, 6, 9, 5, 3, 0]
+   *
+   * @endcode
+   *
+   * @tparam KeyT             
+   *   **[inferred]** Key type
+   *
+   * @tparam BeginOffsetIteratorT  
+   *   **[inferred]** Random-access input iterator type for reading segment 
+   *   beginning offsets \iterator
+   *
+   * @tparam EndOffsetIteratorT    
+   *   **[inferred]** Random-access input iterator type for reading segment 
+   *   ending offsets \iterator
+   *
+   * @param[in] d_temp_storage  
+   *   Device-accessible allocation of temporary storage. When `nullptr`, the 
+   *   required allocation size is written to `temp_storage_bytes` and no work 
+   *   is done.
+   *
+   * @param[in,out] temp_storage_bytes  
+   *   Reference to size in bytes of `d_temp_storage` allocation
+   *
+   * @param[in] d_keys_in  
+   *   Device-accessible pointer to the input data of key data to sort
+   *
+   * @param[out] d_keys_out  
+   *   Device-accessible pointer to the sorted output sequence of key data
+   *
+   * @param[in] num_items  
+   *   The total number of items to sort (across all segments)
+   *
+   * @param[in] num_segments  
+   *   The number of segments that comprise the sorting data
+   *
+   * @param[in] d_begin_offsets  
+   *   Random-access input iterator to the sequence of beginning offsets of 
+   *   length `num_segments`, such that `d_begin_offsets[i]` is the first 
+   *   element of the *i*<sup>th</sup> data segment in `d_keys_*` and 
+   *   `d_values_*`
+   *
+   * @param[in] d_end_offsets  
+   *   Random-access input iterator to the sequence of ending offsets of length 
+   *   `num_segments`, such that `d_end_offsets[i] - 1` is the last element of 
+   *   the *i*<sup>th</sup> data segment in `d_keys_*` and `d_values_*`. 
+   *   If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*<sup>th</sup> is 
+   *   considered empty.
+   *
+   * @param[in] begin_bit  
+   *   **[optional]** The least-significant bit index (inclusive) needed for 
+   *   key comparison
+   *
+   * @param[in] end_bit  
+   *   **[optional]** The most-significant bit index (exclusive) needed for key 
+   *   comparison (e.g., sizeof(unsigned int) * 8)
+   *
+   * @param[in] stream  
+   *   **[optional]** CUDA stream to launch kernels within.  
+   *   Default is stream<sub>0</sub>.
+   *
+   * @param[in] debug_synchronous  
+   *   **[optional]** Whether or not to synchronize the stream after every 
+   *   kernel launch to check for errors.  Also causes launch configurations to 
+   *   be printed to the console. Default is `false`.
+   */
+  template <typename KeyT,
+            typename BeginOffsetIteratorT,
+            typename EndOffsetIteratorT>
+  CUB_RUNTIME_FUNCTION static cudaError_t
+  SortKeysDescending(void *d_temp_storage,
+                     size_t &temp_storage_bytes,
+                     const KeyT *d_keys_in,
+                     KeyT *d_keys_out,
+                     int num_items,
+                     int num_segments,
+                     BeginOffsetIteratorT d_begin_offsets,
+                     EndOffsetIteratorT d_end_offsets,
+                     int begin_bit          = 0,
+                     int end_bit            = sizeof(KeyT) * 8,
+                     cudaStream_t stream    = 0,
+                     bool debug_synchronous = false)
+  {
+    // Signed integer type for global offsets
+    using OffsetT = int;
+
+    DoubleBuffer<KeyT> d_keys(const_cast<KeyT *>(d_keys_in), d_keys_out);
+    DoubleBuffer<NullType> d_values;
+
+    return DispatchSegmentedRadixSort<true,
+                                      KeyT,
+                                      NullType,
+                                      BeginOffsetIteratorT,
+                                      EndOffsetIteratorT,
+                                      OffsetT>::Dispatch(d_temp_storage,
+                                                         temp_storage_bytes,
+                                                         d_keys,
+                                                         d_values,
+                                                         num_items,
+                                                         num_segments,
+                                                         d_begin_offsets,
+                                                         d_end_offsets,
+                                                         begin_bit,
+                                                         end_bit,
+                                                         false,
+                                                         stream,
+                                                         debug_synchronous);
+  }
+
+  /**
+   * @brief Sorts segments of keys into descending order. 
+   * (`~N` auxiliary storage required).
+   *
+   * @par
+   * - The sorting operation is given a pair of key buffers managed by a
+   *   DoubleBuffer structure that indicates which of the two buffers is
+   *   "current" (and thus contains the input data to be sorted).
+   * - The contents of both buffers may be altered by the sorting operation.
+   * - Upon completion, the sorting operation will update the "current" 
+   *   indicator within the DoubleBuffer wrapper to reference which of the two 
+   *   buffers now contains the sorted output sequence (a function of the 
+   *   number of key bits specified and the targeted device architecture).
+   * - When input a contiguous sequence of segments, a single sequence
+   *   `segment_offsets` (of length `num_segments + 1`) can be aliased
+   *   for both the `d_begin_offsets` and `d_end_offsets` parameters (where
+   *   the latter is specified as `segment_offsets + 1`).
+   * - An optional bit subrange `[begin_bit, end_bit)` of differentiating key 
+   *   bits can be specified. This can reduce overall sorting overhead and 
+   *   yield a corresponding performance improvement.
+   * - @devicestorageP
+   * - @devicestorage
+   *
+   * @par Snippet
+   * The code snippet below illustrates the batched sorting of three segments 
+   * (with one zero-length segment) of `int` keys.
+   * @par
+   * @code
+   * #include <cub/cub.cuh>   
+   * // or equivalently <cub/device/device_segmented_radix_sort.cuh>
+   *
+   * // Declare, allocate, and initialize device-accessible pointers 
+   * // for sorting data
+   * int  num_items;          // e.g., 7
+   * int  num_segments;       // e.g., 3
+   * int  *d_offsets;         // e.g., [0, 3, 3, 7]
+   * int  *d_key_buf;         // e.g., [8, 6, 7, 5, 3, 0, 9]
+   * int  *d_key_alt_buf;     // e.g., [-, -, -, -, -, -, -]
+   * ...
+   *
+   * // Create a DoubleBuffer to wrap the pair of device pointers
+   * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
+   *
+   * // Determine temporary device storage requirements
+   * void     *d_temp_storage = NULL;
+   * size_t   temp_storage_bytes = 0;
+   * cub::DeviceSegmentedRadixSort::SortKeysDescending(
+   *     d_temp_storage, temp_storage_bytes, d_keys,
+   *     num_items, num_segments, d_offsets, d_offsets + 1);
+   *
+   * // Allocate temporary storage
+   * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+   *
+   * // Run sorting operation
+   * cub::DeviceSegmentedRadixSort::SortKeysDescending(
+   *     d_temp_storage, temp_storage_bytes, d_keys,
+   *     num_items, num_segments, d_offsets, d_offsets + 1);
+   *
+   * // d_keys.Current()      <-- [8, 7, 6, 9, 5, 3, 0]
+   * @endcode
+   *
+   * @tparam KeyT             
+   *   **[inferred]** Key type
+   *
+   * @tparam BeginOffsetIteratorT  
+   *   **[inferred]** Random-access input iterator type for reading segment 
+   *   beginning offsets \iterator
+   *
+   * @tparam EndOffsetIteratorT    
+   *   **[inferred]** Random-access input iterator type for reading segment 
+   *   ending offsets \iterator
+   *
+   * @param[in] d_temp_storage  
+   *   Device-accessible allocation of temporary storage. When `nullptr`, the 
+   *   required allocation size is written to `temp_storage_bytes` and no work 
+   *   is done.
+   *
+   * @param[in,out] temp_storage_bytes  
+   *   Reference to size in bytes of `d_temp_storage` allocation
+   *
+   * @param[in,out] d_keys  
+   *   Reference to the double-buffer of keys whose "current" device-accessible 
+   *   buffer contains the unsorted input keys and, upon return, is updated to 
+   *   point to the sorted output keys
+   *
+   * @param[in] num_items  
+   *   The total number of items to sort (across all segments)
+   *
+   * @param[in] num_segments  
+   *   The number of segments that comprise the sorting data
+   *
+   * @param[in] d_begin_offsets  
+   *   Random-access input iterator to the sequence of beginning offsets of 
+   *   length `num_segments`, such that `d_begin_offsets[i]` is the first 
+   *   element of the *i*<sup>th</sup> data segment in the key buffers wrapped
+   *   by `d_keys`
+   *
+   * @param[in] d_end_offsets  
+   *   Random-access input iterator to the sequence of ending offsets of length 
+   *   `num_segments`, such that `d_end_offsets[i] - 1` is the last element of 
+   *   the *i*<sup>th</sup> data segment in the key buffers wrapped by `d_keys`.
+   *   If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*<sup>th</sup>
+   *   segment is considered empty.
+   *
+   * @param[in] begin_bit  
+   *   **[optional]** The least-significant bit index (inclusive) needed for 
+   *   key comparison
+   *
+   * @param[in] end_bit  
+   *   **[optional]** The most-significant bit index (exclusive) needed for key 
+   *   comparison (e.g., `sizeof(unsigned int) * 8`)
+   *
+   * @param[in] stream  
+   *   **[optional]** CUDA stream to launch kernels within.  
+   *   Default is stream<sub>0</sub>.
+   *
+   * @param[in] debug_synchronous  
+   *   **[optional]** Whether or not to synchronize the stream after every 
+   *   kernel launch to check for errors. Also causes launch configurations to 
+   *   be printed to the console.  Default is `false`.
+   */
+  template <typename KeyT,
+            typename BeginOffsetIteratorT,
+            typename EndOffsetIteratorT>
+  CUB_RUNTIME_FUNCTION static cudaError_t
+  SortKeysDescending(void *d_temp_storage,
+                     size_t &temp_storage_bytes,
+                     DoubleBuffer<KeyT> &d_keys,
+                     int num_items,
+                     int num_segments,
+                     BeginOffsetIteratorT d_begin_offsets,
+                     EndOffsetIteratorT d_end_offsets,
+                     int begin_bit          = 0,
+                     int end_bit            = sizeof(KeyT) * 8,
+                     cudaStream_t stream    = 0,
+                     bool debug_synchronous = false)
+  {
+    // Signed integer type for global offsets
+    using OffsetT = int;
 
-    /******************************************************************//**
-     * \name Key-value pairs
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief Sorts segments of key-value pairs into ascending order. (~<em>2N </em>auxiliary storage required)
-     *
-     * \par
-     * - The contents of the input data are not altered by the sorting operation
-     * - When input a contiguous sequence of segments, a single sequence
-     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
-     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
-     *   the latter is specified as <tt>segment_offsets+1</tt>).
-     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
-     * - \devicestorageNP  For sorting using only <em>O</em>(<tt>P</tt>) temporary storage, see the sorting interface using DoubleBuffer wrappers below.
-     * - \devicestorage
-     *
-     * \par Snippet
-     * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys
-     * with associated vector of \p int values.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_segmented_radix_sort.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for sorting data
-     * int  num_items;          // e.g., 7
-     * int  num_segments;       // e.g., 3
-     * int  *d_offsets;         // e.g., [0, 3, 3, 7]
-     * int  *d_keys_in;         // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int  *d_keys_out;        // e.g., [-, -, -, -, -, -, -]
-     * int  *d_values_in;       // e.g., [0, 1, 2, 3, 4, 5, 6]
-     * int  *d_values_out;      // e.g., [-, -, -, -, -, -, -]
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
-     *     d_keys_in, d_keys_out, d_values_in, d_values_out,
-     *     num_items, num_segments, d_offsets, d_offsets + 1);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run sorting operation
-     * cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
-     *     d_keys_in, d_keys_out, d_values_in, d_values_out,
-     *     num_items, num_segments, d_offsets, d_offsets + 1);
-     *
-     * // d_keys_out            <-- [6, 7, 8, 0, 3, 5, 9]
-     * // d_values_out          <-- [1, 2, 0, 5, 4, 3, 6]
-     *
-     * \endcode
-     *
-     * \tparam KeyT                  <b>[inferred]</b> Key type
-     * \tparam ValueT                <b>[inferred]</b> Value type
-     * \tparam BeginOffsetIteratorT  <b>[inferred]</b> Random-access input iterator type for reading segment beginning offsets \iterator
-     * \tparam EndOffsetIteratorT    <b>[inferred]</b> Random-access input iterator type for reading segment ending offsets \iterator
-     */
-    template <
-        typename            KeyT,
-        typename            ValueT,
-        typename            BeginOffsetIteratorT,
-        typename            EndOffsetIteratorT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t SortPairs(
-        void                *d_temp_storage,                        ///< [in] Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        const KeyT          *d_keys_in,                             ///< [in] Device-accessible pointer to the input data of key data to sort
-        KeyT                *d_keys_out,                            ///< [out] Device-accessible pointer to the sorted output sequence of key data
-        const ValueT        *d_values_in,                           ///< [in] Device-accessible pointer to the corresponding input sequence of associated value items
-        ValueT              *d_values_out,                          ///< [out] Device-accessible pointer to the correspondingly-reordered output sequence of associated value items
-        int                 num_items,                              ///< [in] The total number of items to sort (across all segments)
-        int                 num_segments,                           ///< [in] The number of segments that comprise the sorting data
-        BeginOffsetIteratorT d_begin_offsets,                       ///< [in] Random-access input iterator to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
-        EndOffsetIteratorT  d_end_offsets,                          ///< [in] Random-access input iterator to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
-        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
-        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
-        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int OffsetT;
-
-        DoubleBuffer<KeyT>       d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
-        DoubleBuffer<ValueT>     d_values(const_cast<ValueT*>(d_values_in), d_values_out);
-
-        return DispatchSegmentedRadixSort<false, KeyT, ValueT, BeginOffsetIteratorT, EndOffsetIteratorT, OffsetT>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_keys,
-            d_values,
-            num_items,
-            num_segments,
-            d_begin_offsets,
-            d_end_offsets,
-            begin_bit,
-            end_bit,
-            false,
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Sorts segments of key-value pairs into ascending order. (~<em>N </em>auxiliary storage required)
-     *
-     * \par
-     * - The sorting operation is given a pair of key buffers and a corresponding
-     *   pair of associated value buffers.  Each pair is managed by a DoubleBuffer
-     *   structure that indicates which of the two buffers is "current" (and thus
-     *   contains the input data to be sorted).
-     * - The contents of both buffers within each pair may be altered by the sorting
-     *   operation.
-     * - Upon completion, the sorting operation will update the "current" indicator
-     *   within each DoubleBuffer wrapper to reference which of the two buffers
-     *   now contains the sorted output sequence (a function of the number of key bits
-     *   specified and the targeted device architecture).
-     * - When input a contiguous sequence of segments, a single sequence
-     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
-     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
-     *   the latter is specified as <tt>segment_offsets+1</tt>).
-     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
-     * - \devicestorageP
-     * - \devicestorage
-     *
-     * \par Snippet
-     * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys
-     * with associated vector of \p int values.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_segmented_radix_sort.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for sorting data
-     * int  num_items;          // e.g., 7
-     * int  num_segments;       // e.g., 3
-     * int  *d_offsets;         // e.g., [0, 3, 3, 7]
-     * int  *d_key_buf;         // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int  *d_key_alt_buf;     // e.g., [-, -, -, -, -, -, -]
-     * int  *d_value_buf;       // e.g., [0, 1, 2, 3, 4, 5, 6]
-     * int  *d_value_alt_buf;   // e.g., [-, -, -, -, -, -, -]
-     * ...
-     *
-     * // Create a set of DoubleBuffers to wrap pairs of device pointers
-     * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
-     * cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf);
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values,
-     *     num_items, num_segments, d_offsets, d_offsets + 1);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run sorting operation
-     * cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values,
-     *     num_items, num_segments, d_offsets, d_offsets + 1);
-     *
-     * // d_keys.Current()      <-- [6, 7, 8, 0, 3, 5, 9]
-     * // d_values.Current()    <-- [5, 4, 3, 1, 2, 0, 6]
-     *
-     * \endcode
-     *
-     * \tparam KeyT             <b>[inferred]</b> Key type
-     * \tparam ValueT           <b>[inferred]</b> Value type
-     * \tparam BeginOffsetIteratorT  <b>[inferred]</b> Random-access input iterator type for reading segment beginning offsets \iterator
-     * \tparam EndOffsetIteratorT    <b>[inferred]</b> Random-access input iterator type for reading segment ending offsets \iterator
-     */
-    template <
-        typename                KeyT,
-        typename                ValueT,
-        typename                BeginOffsetIteratorT,
-        typename                EndOffsetIteratorT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t SortPairs(
-        void                    *d_temp_storage,                        ///< [in] Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t                  &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        DoubleBuffer<KeyT>      &d_keys,                                ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
-        DoubleBuffer<ValueT>    &d_values,                              ///< [in,out] Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
-        int                     num_items,                              ///< [in] The total number of items to sort (across all segments)
-        int                     num_segments,                           ///< [in] The number of segments that comprise the sorting data
-        BeginOffsetIteratorT    d_begin_offsets,                        ///< [in] Random-access input iterator to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
-        EndOffsetIteratorT      d_end_offsets,                          ///< [in] Random-access input iterator to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
-        int                     begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
-        int                     end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
-        cudaStream_t            stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                    debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int OffsetT;
-
-        return DispatchSegmentedRadixSort<false, KeyT, ValueT, BeginOffsetIteratorT, EndOffsetIteratorT, OffsetT>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_keys,
-            d_values,
-            num_items,
-            num_segments,
-            d_begin_offsets,
-            d_end_offsets,
-            begin_bit,
-            end_bit,
-            true,
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Sorts segments of key-value pairs into descending order. (~<em>2N</em> auxiliary storage required).
-     *
-     * \par
-     * - The contents of the input data are not altered by the sorting operation
-     * - When input a contiguous sequence of segments, a single sequence
-     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
-     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
-     *   the latter is specified as <tt>segment_offsets+1</tt>).
-     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
-     * - \devicestorageNP  For sorting using only <em>O</em>(<tt>P</tt>) temporary storage, see the sorting interface using DoubleBuffer wrappers below.
-     * - \devicestorage
-     *
-     * \par Snippet
-     * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys
-     * with associated vector of \p int values.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_segmented_radix_sort.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for sorting data
-     * int  num_items;          // e.g., 7
-     * int  num_segments;       // e.g., 3
-     * int  *d_offsets;         // e.g., [0, 3, 3, 7]
-     * int  *d_keys_in;         // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int  *d_keys_out;        // e.g., [-, -, -, -, -, -, -]
-     * int  *d_values_in;       // e.g., [0, 1, 2, 3, 4, 5, 6]
-     * int  *d_values_out;      // e.g., [-, -, -, -, -, -, -]
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes,
-     *     d_keys_in, d_keys_out, d_values_in, d_values_out,
-     *     num_items, num_segments, d_offsets, d_offsets + 1);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run sorting operation
-     * cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes,
-     *     d_keys_in, d_keys_out, d_values_in, d_values_out,
-     *     num_items, num_segments, d_offsets, d_offsets + 1);
-     *
-     * // d_keys_out            <-- [8, 7, 6, 9, 5, 3, 0]
-     * // d_values_out          <-- [0, 2, 1, 6, 3, 4, 5]
-     *
-     * \endcode
-     *
-     * \tparam KeyT             <b>[inferred]</b> Key type
-     * \tparam ValueT           <b>[inferred]</b> Value type
-     * \tparam BeginOffsetIteratorT  <b>[inferred]</b> Random-access input iterator type for reading segment beginning offsets \iterator
-     * \tparam EndOffsetIteratorT    <b>[inferred]</b> Random-access input iterator type for reading segment ending offsets \iterator
-     */
-    template <
-        typename            KeyT,
-        typename            ValueT,
-        typename            BeginOffsetIteratorT,
-        typename            EndOffsetIteratorT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t SortPairsDescending(
-        void                *d_temp_storage,                        ///< [in] Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        const KeyT          *d_keys_in,                             ///< [in] Device-accessible pointer to the input data of key data to sort
-        KeyT                *d_keys_out,                            ///< [out] Device-accessible pointer to the sorted output sequence of key data
-        const ValueT        *d_values_in,                           ///< [in] Device-accessible pointer to the corresponding input sequence of associated value items
-        ValueT              *d_values_out,                          ///< [out] Device-accessible pointer to the correspondingly-reordered output sequence of associated value items
-        int                 num_items,                              ///< [in] The total number of items to sort (across all segments)
-        int                 num_segments,                           ///< [in] The number of segments that comprise the sorting data
-        BeginOffsetIteratorT d_begin_offsets,                       ///< [in] Random-access input iterator to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
-        EndOffsetIteratorT  d_end_offsets,                          ///< [in] Random-access input iterator to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
-        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
-        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
-        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int OffsetT;
-
-        DoubleBuffer<KeyT>       d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
-        DoubleBuffer<ValueT>     d_values(const_cast<ValueT*>(d_values_in), d_values_out);
-
-        return DispatchSegmentedRadixSort<true, KeyT, ValueT, BeginOffsetIteratorT, EndOffsetIteratorT, OffsetT>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_keys,
-            d_values,
-            num_items,
-            num_segments,
-            d_begin_offsets,
-            d_end_offsets,
-            begin_bit,
-            end_bit,
-            false,
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Sorts segments of key-value pairs into descending order. (~<em>N </em>auxiliary storage required).
-     *
-     * \par
-     * - The sorting operation is given a pair of key buffers and a corresponding
-     *   pair of associated value buffers.  Each pair is managed by a DoubleBuffer
-     *   structure that indicates which of the two buffers is "current" (and thus
-     *   contains the input data to be sorted).
-     * - The contents of both buffers within each pair may be altered by the sorting
-     *   operation.
-     * - Upon completion, the sorting operation will update the "current" indicator
-     *   within each DoubleBuffer wrapper to reference which of the two buffers
-     *   now contains the sorted output sequence (a function of the number of key bits
-     *   specified and the targeted device architecture).
-     * - When input a contiguous sequence of segments, a single sequence
-     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
-     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
-     *   the latter is specified as <tt>segment_offsets+1</tt>).
-     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
-     * - \devicestorageP
-     * - \devicestorage
-     *
-     * \par Snippet
-     * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys
-     * with associated vector of \p int values.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_segmented_radix_sort.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for sorting data
-     * int  num_items;          // e.g., 7
-     * int  num_segments;       // e.g., 3
-     * int  *d_offsets;         // e.g., [0, 3, 3, 7]
-     * int  *d_key_buf;         // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int  *d_key_alt_buf;     // e.g., [-, -, -, -, -, -, -]
-     * int  *d_value_buf;       // e.g., [0, 1, 2, 3, 4, 5, 6]
-     * int  *d_value_alt_buf;   // e.g., [-, -, -, -, -, -, -]
-     * ...
-     *
-     * // Create a set of DoubleBuffers to wrap pairs of device pointers
-     * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
-     * cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf);
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys, d_values,
-     *     num_items, num_segments, d_offsets, d_offsets + 1);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run sorting operation
-     * cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys, d_values,
-     *     num_items, num_segments, d_offsets, d_offsets + 1);
-     *
-     * // d_keys.Current()      <-- [8, 7, 6, 9, 5, 3, 0]
-     * // d_values.Current()    <-- [0, 2, 1, 6, 3, 4, 5]
-     *
-     * \endcode
-     *
-     * \tparam KeyT             <b>[inferred]</b> Key type
-     * \tparam ValueT           <b>[inferred]</b> Value type
-     * \tparam BeginOffsetIteratorT  <b>[inferred]</b> Random-access input iterator type for reading segment beginning offsets \iterator
-     * \tparam EndOffsetIteratorT    <b>[inferred]</b> Random-access input iterator type for reading segment ending offsets \iterator
-     */
-    template <
-        typename                KeyT,
-        typename                ValueT,
-        typename                BeginOffsetIteratorT,
-        typename                EndOffsetIteratorT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t SortPairsDescending(
-        void                    *d_temp_storage,                        ///< [in] Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t                  &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        DoubleBuffer<KeyT>      &d_keys,                                ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
-        DoubleBuffer<ValueT>    &d_values,                              ///< [in,out] Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
-        int                     num_items,                              ///< [in] The total number of items to sort (across all segments)
-        int                     num_segments,                           ///< [in] The number of segments that comprise the sorting data
-        BeginOffsetIteratorT    d_begin_offsets,                        ///< [in] Random-access input iterator to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
-        EndOffsetIteratorT      d_end_offsets,                          ///< [in] Random-access input iterator to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
-        int                     begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
-        int                     end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
-        cudaStream_t            stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                    debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int OffsetT;
-
-        return DispatchSegmentedRadixSort<true, KeyT, ValueT, BeginOffsetIteratorT, EndOffsetIteratorT, OffsetT>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_keys,
-            d_values,
-            num_items,
-            num_segments,
-            d_begin_offsets,
-            d_end_offsets,
-            begin_bit,
-            end_bit,
-            true,
-            stream,
-            debug_synchronous);
-    }
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Keys-only
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Sorts segments of keys into ascending order. (~<em>2N </em>auxiliary storage required)
-     *
-     * \par
-     * - The contents of the input data are not altered by the sorting operation
-     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
-     * - When input a contiguous sequence of segments, a single sequence
-     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
-     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
-     *   the latter is specified as <tt>segment_offsets+1</tt>).
-     * - \devicestorageNP  For sorting using only <em>O</em>(<tt>P</tt>) temporary storage, see the sorting interface using DoubleBuffer wrappers below.
-     * - \devicestorage
-     *
-     * \par Snippet
-     * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_segmented_radix_sort.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for sorting data
-     * int  num_items;          // e.g., 7
-     * int  num_segments;       // e.g., 3
-     * int  *d_offsets;         // e.g., [0, 3, 3, 7]
-     * int  *d_keys_in;         // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int  *d_keys_out;        // e.g., [-, -, -, -, -, -, -]
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceSegmentedRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
-     *     num_items, num_segments, d_offsets, d_offsets + 1);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run sorting operation
-     * cub::DeviceSegmentedRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
-     *     num_items, num_segments, d_offsets, d_offsets + 1);
-     *
-     * // d_keys_out            <-- [6, 7, 8, 0, 3, 5, 9]
-     *
-     * \endcode
-     *
-     * \tparam KeyT             <b>[inferred]</b> Key type
-     * \tparam BeginOffsetIteratorT  <b>[inferred]</b> Random-access input iterator type for reading segment beginning offsets \iterator
-     * \tparam EndOffsetIteratorT    <b>[inferred]</b> Random-access input iterator type for reading segment ending offsets \iterator
-     */
-    template <
-        typename            KeyT,
-        typename            BeginOffsetIteratorT,
-        typename            EndOffsetIteratorT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t SortKeys(
-        void                *d_temp_storage,                        ///< [in] Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        const KeyT          *d_keys_in,                             ///< [in] Device-accessible pointer to the input data of key data to sort
-        KeyT                *d_keys_out,                            ///< [out] Device-accessible pointer to the sorted output sequence of key data
-        int                 num_items,                              ///< [in] The total number of items to sort (across all segments)
-        int                 num_segments,                           ///< [in] The number of segments that comprise the sorting data
-        BeginOffsetIteratorT d_begin_offsets,                        ///< [in] Random-access input iterator to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
-        EndOffsetIteratorT  d_end_offsets,                          ///< [in] Random-access input iterator to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
-        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
-        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
-        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int OffsetT;
-
-        // Null value type
-        DoubleBuffer<KeyT>      d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
-        DoubleBuffer<NullType>  d_values;
-
-        return DispatchSegmentedRadixSort<false, KeyT, NullType, BeginOffsetIteratorT, EndOffsetIteratorT, OffsetT>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_keys,
-            d_values,
-            num_items,
-            num_segments,
-            d_begin_offsets,
-            d_end_offsets,
-            begin_bit,
-            end_bit,
-            false,
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Sorts segments of keys into ascending order. (~<em>N </em>auxiliary storage required).
-     *
-     * \par
-     * - The sorting operation is given a pair of key buffers managed by a
-     *   DoubleBuffer structure that indicates which of the two buffers is
-     *   "current" (and thus contains the input data to be sorted).
-     * - The contents of both buffers may be altered by the sorting operation.
-     * - Upon completion, the sorting operation will update the "current" indicator
-     *   within the DoubleBuffer wrapper to reference which of the two buffers
-     *   now contains the sorted output sequence (a function of the number of key bits
-     *   specified and the targeted device architecture).
-     * - When input a contiguous sequence of segments, a single sequence
-     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
-     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
-     *   the latter is specified as <tt>segment_offsets+1</tt>).
-     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
-     * - \devicestorageP
-     * - \devicestorage
-     *
-     * \par Snippet
-     * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_segmented_radix_sort.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for sorting data
-     * int  num_items;          // e.g., 7
-     * int  num_segments;       // e.g., 3
-     * int  *d_offsets;         // e.g., [0, 3, 3, 7]
-     * int  *d_key_buf;         // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int  *d_key_alt_buf;     // e.g., [-, -, -, -, -, -, -]
-     * ...
-     *
-     * // Create a DoubleBuffer to wrap the pair of device pointers
-     * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceSegmentedRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys,
-     *     num_items, num_segments, d_offsets, d_offsets + 1);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run sorting operation
-     * cub::DeviceSegmentedRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys,
-     *     num_items, num_segments, d_offsets, d_offsets + 1);
-     *
-     * // d_keys.Current()      <-- [6, 7, 8, 0, 3, 5, 9]
-     *
-     * \endcode
-     *
-     * \tparam KeyT             <b>[inferred]</b> Key type
-     * \tparam BeginOffsetIteratorT  <b>[inferred]</b> Random-access input iterator type for reading segment beginning offsets \iterator
-     * \tparam EndOffsetIteratorT    <b>[inferred]</b> Random-access input iterator type for reading segment ending offsets \iterator
-     */
-    template <
-        typename            KeyT,
-        typename            BeginOffsetIteratorT,
-        typename            EndOffsetIteratorT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t SortKeys(
-        void                *d_temp_storage,                        ///< [in] Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        DoubleBuffer<KeyT>  &d_keys,                                ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
-        int                 num_items,                              ///< [in] The total number of items to sort (across all segments)
-        int                 num_segments,                           ///< [in] The number of segments that comprise the sorting data
-        BeginOffsetIteratorT d_begin_offsets,                        ///< [in] Random-access input iterator to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
-        EndOffsetIteratorT  d_end_offsets,                          ///< [in] Random-access input iterator to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
-        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
-        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
-        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int OffsetT;
-
-        // Null value type
-        DoubleBuffer<NullType> d_values;
-
-        return DispatchSegmentedRadixSort<false, KeyT, NullType, BeginOffsetIteratorT, EndOffsetIteratorT, OffsetT>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_keys,
-            d_values,
-            num_items,
-            num_segments,
-            d_begin_offsets,
-            d_end_offsets,
-            begin_bit,
-            end_bit,
-            true,
-            stream,
-            debug_synchronous);
-    }
-
-    /**
-     * \brief Sorts segments of keys into descending order. (~<em>2N</em> auxiliary storage required).
-     *
-     * \par
-     * - The contents of the input data are not altered by the sorting operation
-     * - When input a contiguous sequence of segments, a single sequence
-     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
-     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
-     *   the latter is specified as <tt>segment_offsets+1</tt>).
-     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
-     * - \devicestorageNP  For sorting using only <em>O</em>(<tt>P</tt>) temporary storage, see the sorting interface using DoubleBuffer wrappers below.
-     * - \devicestorage
-     *
-     * \par Snippet
-     * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_segmented_radix_sort.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for sorting data
-     * int  num_items;          // e.g., 7
-     * int  num_segments;       // e.g., 3
-     * int  *d_offsets;         // e.g., [0, 3, 3, 7]
-     * int  *d_keys_in;         // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int  *d_keys_out;        // e.g., [-, -, -, -, -, -, -]
-     * ...
-     *
-     * // Create a DoubleBuffer to wrap the pair of device pointers
-     * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceSegmentedRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
-     *     num_items, num_segments, d_offsets, d_offsets + 1);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run sorting operation
-     * cub::DeviceSegmentedRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
-     *     num_items, num_segments, d_offsets, d_offsets + 1);
-     *
-     * // d_keys_out            <-- [8, 7, 6, 9, 5, 3, 0]
-     *
-     * \endcode
-     *
-     * \tparam KeyT             <b>[inferred]</b> Key type
-     * \tparam BeginOffsetIteratorT  <b>[inferred]</b> Random-access input iterator type for reading segment beginning offsets \iterator
-     * \tparam EndOffsetIteratorT    <b>[inferred]</b> Random-access input iterator type for reading segment ending offsets \iterator
-     */
-    template <
-        typename            KeyT,
-        typename            BeginOffsetIteratorT,
-        typename            EndOffsetIteratorT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t SortKeysDescending(
-        void                *d_temp_storage,                        ///< [in] Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        const KeyT          *d_keys_in,                             ///< [in] Device-accessible pointer to the input data of key data to sort
-        KeyT                *d_keys_out,                            ///< [out] Device-accessible pointer to the sorted output sequence of key data
-        int                 num_items,                              ///< [in] The total number of items to sort (across all segments)
-        int                 num_segments,                           ///< [in] The number of segments that comprise the sorting data
-        BeginOffsetIteratorT d_begin_offsets,                       ///< [in] Random-access input iterator to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
-        EndOffsetIteratorT  d_end_offsets,                          ///< [in] Random-access input iterator to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
-        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
-        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
-        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int OffsetT;
-
-        DoubleBuffer<KeyT>      d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
-        DoubleBuffer<NullType>  d_values;
-
-        return DispatchSegmentedRadixSort<true, KeyT, NullType, BeginOffsetIteratorT, EndOffsetIteratorT, OffsetT>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_keys,
-            d_values,
-            num_items,
-            num_segments,
-            d_begin_offsets,
-            d_end_offsets,
-            begin_bit,
-            end_bit,
-            false,
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Sorts segments of keys into descending order. (~<em>N </em>auxiliary storage required).
-     *
-     * \par
-     * - The sorting operation is given a pair of key buffers managed by a
-     *   DoubleBuffer structure that indicates which of the two buffers is
-     *   "current" (and thus contains the input data to be sorted).
-     * - The contents of both buffers may be altered by the sorting operation.
-     * - Upon completion, the sorting operation will update the "current" indicator
-     *   within the DoubleBuffer wrapper to reference which of the two buffers
-     *   now contains the sorted output sequence (a function of the number of key bits
-     *   specified and the targeted device architecture).
-     * - When input a contiguous sequence of segments, a single sequence
-     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
-     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
-     *   the latter is specified as <tt>segment_offsets+1</tt>).
-     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
-     * - \devicestorageP
-     * - \devicestorage
-     *
-     * \par Snippet
-     * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_segmented_radix_sort.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for sorting data
-     * int  num_items;          // e.g., 7
-     * int  num_segments;       // e.g., 3
-     * int  *d_offsets;         // e.g., [0, 3, 3, 7]
-     * int  *d_key_buf;         // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int  *d_key_alt_buf;     // e.g., [-, -, -, -, -, -, -]
-     * ...
-     *
-     * // Create a DoubleBuffer to wrap the pair of device pointers
-     * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceSegmentedRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys,
-     *     num_items, num_segments, d_offsets, d_offsets + 1);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run sorting operation
-     * cub::DeviceSegmentedRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys,
-     *     num_items, num_segments, d_offsets, d_offsets + 1);
-     *
-     * // d_keys.Current()      <-- [8, 7, 6, 9, 5, 3, 0]
-     *
-     * \endcode
-     *
-     * \tparam KeyT             <b>[inferred]</b> Key type
-     * \tparam BeginOffsetIteratorT  <b>[inferred]</b> Random-access input iterator type for reading segment beginning offsets \iterator
-     * \tparam EndOffsetIteratorT    <b>[inferred]</b> Random-access input iterator type for reading segment ending offsets \iterator
-     */
-    template <
-        typename            KeyT,
-        typename            BeginOffsetIteratorT,
-        typename            EndOffsetIteratorT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t SortKeysDescending(
-        void                *d_temp_storage,                        ///< [in] Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        DoubleBuffer<KeyT>  &d_keys,                                ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
-        int                 num_items,                              ///< [in] The total number of items to sort (across all segments)
-        int                 num_segments,                           ///< [in] The number of segments that comprise the sorting data
-        BeginOffsetIteratorT d_begin_offsets,                        ///< [in] Random-access input iterator to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
-        EndOffsetIteratorT  d_end_offsets,                          ///< [in] Random-access input iterator to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
-        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
-        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
-        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int OffsetT;
-
-        // Null value type
-        DoubleBuffer<NullType> d_values;
-
-        return DispatchSegmentedRadixSort<true, KeyT, NullType, BeginOffsetIteratorT, EndOffsetIteratorT, OffsetT>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_keys,
-            d_values,
-            num_items,
-            num_segments,
-            d_begin_offsets,
-            d_end_offsets,
-            begin_bit,
-            end_bit,
-            true,
-            stream,
-            debug_synchronous);
-    }
-
-
-    //@}  end member group
+    // Null value type
+    DoubleBuffer<NullType> d_values;
 
+    return DispatchSegmentedRadixSort<true,
+                                      KeyT,
+                                      NullType,
+                                      BeginOffsetIteratorT,
+                                      EndOffsetIteratorT,
+                                      OffsetT>::Dispatch(d_temp_storage,
+                                                         temp_storage_bytes,
+                                                         d_keys,
+                                                         d_values,
+                                                         num_items,
+                                                         num_segments,
+                                                         d_begin_offsets,
+                                                         d_end_offsets,
+                                                         begin_bit,
+                                                         end_bit,
+                                                         true,
+                                                         stream,
+                                                         debug_synchronous);
+  }
 
+  //@}  end member group
 };
 
 CUB_NAMESPACE_END

From 27312acd9653b615c8598176721abbc74842d181 Mon Sep 17 00:00:00 2001
From: Georgy Evtushenko <evtushenko.georgy@gmail.com>
Date: Mon, 30 May 2022 19:55:32 +0400
Subject: [PATCH 3/4] Add in-place guarantees for seg rad sort

---
 cub/device/device_segmented_radix_sort.cuh | 40 ++++++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/cub/device/device_segmented_radix_sort.cuh b/cub/device/device_segmented_radix_sort.cuh
index 135ea6d8c5..4b610e7626 100644
--- a/cub/device/device_segmented_radix_sort.cuh
+++ b/cub/device/device_segmented_radix_sort.cuh
@@ -88,6 +88,11 @@ struct DeviceSegmentedRadixSort
    * - An optional bit subrange `[begin_bit, end_bit)` of differentiating key 
    *   bits can be specified. This can reduce overall sorting overhead and 
    *   yield a corresponding performance improvement.
+   * - Let `in` be one of `{d_keys_in, d_values_in}` and `out` be any of
+   *   `{d_keys_out, d_values_out}`. The range `[out, out + num_items)` shall 
+   *   not overlap `[in, in + num_items)`, 
+   *   `[d_begin_offsets, d_begin_offsets + num_segments)` nor
+   *   `[d_end_offsets, d_end_offsets + num_segments)` in any way.
    * - @devicestorageNP For sorting using only `O(P)` temporary storage, see 
    *   the sorting interface using DoubleBuffer wrappers below.
    * - @devicestorage
@@ -274,6 +279,12 @@ struct DeviceSegmentedRadixSort
    * - An optional bit subrange `[begin_bit, end_bit)` of differentiating key 
    *   bits can be specified. This can reduce overall sorting overhead and yield 
    *   a corresponding performance improvement.
+   * - Let `cur` be one of `{d_keys.Current(), d_values.Current()}` and `alt` 
+   *   be any of `{d_keys.Alternate(), d_values.Alternate()}`. The range 
+   *   `[cur, cur + num_items)` shall not overlap 
+   *   `[alt, alt + num_items)`. Neither range shall overlap
+   *   `[d_begin_offsets, d_begin_offsets + num_segments)` or
+   *   `[d_end_offsets, d_end_offsets + num_segments)` in any way.
    * - @devicestorageP
    * - @devicestorage
    *
@@ -443,6 +454,11 @@ struct DeviceSegmentedRadixSort
    * - An optional bit subrange `[begin_bit, end_bit)` of differentiating key 
    *   bits can be specified. This can reduce overall sorting overhead and 
    *   yield a corresponding performance improvement.
+   * - Let `in` be one of `{d_keys_in, d_values_in}` and `out` be any of
+   *   `{d_keys_out, d_values_out}`. The range `[out, out + num_items)` shall 
+   *   not overlap `[in, in + num_items)`, 
+   *   `[d_begin_offsets, d_begin_offsets + num_segments)` nor
+   *   `[d_end_offsets, d_end_offsets + num_segments)` in any way.
    * - @devicestorageNP For sorting using only `O(P)` temporary storage, see 
    *   the sorting interface using DoubleBuffer wrappers below.
    * - @devicestorage
@@ -629,6 +645,12 @@ struct DeviceSegmentedRadixSort
    * - An optional bit subrange `[begin_bit, end_bit)` of differentiating key 
    *   bits can be specified. This can reduce overall sorting overhead and 
    *   yield a corresponding performance improvement.
+   * - Let `cur` be one of `{d_keys.Current(), d_values.Current()}` and `alt` 
+   *   be any of `{d_keys.Alternate(), d_values.Alternate()}`. The range 
+   *   `[cur, cur + num_items)` shall not overlap 
+   *   `[alt, alt + num_items)`. Neither range shall overlap
+   *   `[d_begin_offsets, d_begin_offsets + num_segments)` or
+   *   `[d_end_offsets, d_end_offsets + num_segments)` in any way.
    * - @devicestorageP
    * - @devicestorage
    *
@@ -804,6 +826,10 @@ struct DeviceSegmentedRadixSort
    *   `segment_offsets` (of length `num_segments + 1`) can be aliased for both 
    *   the `d_begin_offsets` and `d_end_offsets` parameters (where the latter 
    *   is specified as `segment_offsets + 1`).
+   * - The range `[d_keys_out, d_keys_out + num_items)` shall not overlap
+   *   `[d_keys_in, d_keys_in + num_items)`, 
+   *   `[d_begin_offsets, d_begin_offsets + num_segments)` nor
+   *   `[d_end_offsets, d_end_offsets + num_segments)` in any way.
    * - @devicestorageNP For sorting using only `O(P)` temporary storage, see 
    *   the sorting interface using DoubleBuffer wrappers below.
    * - @devicestorage
@@ -966,6 +992,11 @@ struct DeviceSegmentedRadixSort
    * - An optional bit subrange `[begin_bit, end_bit)` of differentiating key 
    *   bits can be specified. This can reduce overall sorting overhead and 
    *   yield a corresponding performance improvement.
+   * - Let `cur = d_keys.Current()` and `alt = d_keys.Alternate()`.
+   *   The range `[cur, cur + num_items)` shall not overlap 
+   *   `[alt, alt + num_items)`. Neither range shall overlap
+   *   `[d_begin_offsets, d_begin_offsets + num_segments)` or
+   *   `[d_end_offsets, d_end_offsets + num_segments)` in any way.
    * - @devicestorageP
    * - @devicestorage
    *
@@ -1123,6 +1154,10 @@ struct DeviceSegmentedRadixSort
    * - An optional bit subrange `[begin_bit, end_bit)` of differentiating key 
    *   bits can be specified. This can reduce overall sorting overhead and 
    *   yield a corresponding performance improvement.
+   * - The range `[d_keys_out, d_keys_out + num_items)` shall not overlap
+   *   `[d_keys_in, d_keys_in + num_items)`, 
+   *   `[d_begin_offsets, d_begin_offsets + num_segments)` nor
+   *   `[d_end_offsets, d_end_offsets + num_segments)` in any way.
    * - @devicestorageNP For sorting using only `O(P)` temporary storage, see 
    *   the sorting interface using DoubleBuffer wrappers below.
    * - @devicestorage
@@ -1290,6 +1325,11 @@ struct DeviceSegmentedRadixSort
    * - An optional bit subrange `[begin_bit, end_bit)` of differentiating key 
    *   bits can be specified. This can reduce overall sorting overhead and 
    *   yield a corresponding performance improvement.
+   * - Let `cur = d_keys.Current()` and `alt = d_keys.Alternate()`.
+   *   The range `[cur, cur + num_items)` shall not overlap 
+   *   `[alt, alt + num_items)`. Neither range shall overlap
+   *   `[d_begin_offsets, d_begin_offsets + num_segments)` or
+   *   `[d_end_offsets, d_end_offsets + num_segments)` in any way.
    * - @devicestorageP
    * - @devicestorage
    *

From fe39cd3cca2b3eaac093dc4ee4222dade2355630 Mon Sep 17 00:00:00 2001
From: Georgy Evtushenko <evtushenko.georgy@gmail.com>
Date: Mon, 30 May 2022 20:54:10 +0400
Subject: [PATCH 4/4] Reformat radix sort docs

---
 cub/device/device_radix_sort.cuh | 2003 ++++++++++++++++++------------
 1 file changed, 1179 insertions(+), 824 deletions(-)

diff --git a/cub/device/device_radix_sort.cuh b/cub/device/device_radix_sort.cuh
index 87711d0587..68f529c9b5 100644
--- a/cub/device/device_radix_sort.cuh
+++ b/cub/device/device_radix_sort.cuh
@@ -1,7 +1,7 @@
 
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2022, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -14,10 +14,10 @@
  *       names of its contributors may be used to endorse or promote products
  *       derived from this software without specific prior written permission.
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
@@ -28,8 +28,9 @@
  ******************************************************************************/
 
 /**
- * \file
- * cub::DeviceRadixSort provides device-wide, parallel operations for computing a radix sort across a sequence of data items residing within device-accessible memory.
+ * @file
+ * cub::DeviceRadixSort provides device-wide, parallel operations for computing
+ * a radix sort across a sequence of data items in device-accessible memory.
  */
 
 #pragma once
@@ -41,31 +42,33 @@
 CUB_NAMESPACE_BEGIN
 
 /**
- * \brief DeviceRadixSort provides device-wide, parallel operations for computing a radix sort across a sequence of data items residing within device-accessible memory. ![](sorting_logo.png)
- * \ingroup SingleModule
+ * @brief DeviceRadixSort provides device-wide, parallel operations for 
+ *        computing a radix sort across a sequence of data items residing 
+ *        within device-accessible memory. ![](sorting_logo.png)
+ * @ingroup SingleModule
  *
- * \par Overview
- * The [<em>radix sorting method</em>](http://en.wikipedia.org/wiki/Radix_sort) arranges
- * items into ascending (or descending) order.  The algorithm relies upon a positional representation for
- * keys, i.e., each key is comprised of an ordered sequence of symbols (e.g., digits,
- * characters, etc.) specified from least-significant to most-significant.  For a
- * given input sequence of keys and a set of rules specifying a total ordering
- * of the symbolic alphabet, the radix sorting method produces a lexicographic
- * ordering of those keys.
+ * @par Overview
+ * The [*radix sorting method*](http://en.wikipedia.org/wiki/Radix_sort) 
+ * arranges items into ascending (or descending) order. The algorithm relies 
+ * upon a positional representation for keys, i.e., each key is comprised of an 
+ * ordered sequence of symbols (e.g., digits, characters, etc.) specified from 
+ * least-significant to most-significant. For a given input sequence of keys 
+ * and a set of rules specifying a total ordering of the symbolic alphabet, the 
+ * radix sorting method produces a lexicographic ordering of those keys.
  *
- * \par Supported Types
+ * @par Supported Types
  * DeviceRadixSort can sort all of the built-in C++ numeric primitive types
  * (`unsigned char`, `int`, `double`, etc.) as well as CUDA's `__half`
  * and `__nv_bfloat16` 16-bit floating-point types.
  *
- * \par Floating-Point Special Cases
+ * @par Floating-Point Special Cases
  *
  * - Positive and negative zeros are considered equivalent, and will be treated
  *   as such in the output.
  * - No special handling is implemented for NaN values; these are sorted
  *   according to their bit representations after any transformations.
  *
- * \par Transformations
+ * @par Transformations
  * Although the direct radix sorting method can only be applied to unsigned
  * integral types, DeviceRadixSort is able to sort signed and floating-point
  * types via simple bit-wise transformations that ensure lexicographic key
@@ -89,829 +92,1181 @@ CUB_NAMESPACE_BEGIN
  * For floating point types, positive and negative zero are a special case and
  * will be considered equivalent during sorting.
  *
- * \par Descending Sort Bitwise Transformations
+ * @par Descending Sort Bitwise Transformations
  * If descending sort is used, the keys are inverted after performing any
  * type-specific transformations, and the resulting keys are sorted in ascending
  * order.
  *
- * \par Stability
- * DeviceRadixSort is stable. For floating-point types, -0.0 and +0.0 are
+ * @par Stability
+ * DeviceRadixSort is stable. For floating-point types, `-0.0` and `+0.0` are
  * considered equal and appear in the result in the same order as they appear in
  * the input.
  *
- * \par Usage Considerations
- * \cdp_class{DeviceRadixSort}
+ * @par Usage Considerations
+ * @cdp_class{DeviceRadixSort}
  *
- * \par Performance
- * \linear_performance{radix sort} The following chart illustrates DeviceRadixSort::SortKeys
- * performance across different CUDA architectures for uniform-random \p uint32 keys.
- * \plots_below
+ * @par Performance
+ * @linear_performance{radix sort} The following chart illustrates 
+ * DeviceRadixSort::SortKeys performance across different CUDA architectures 
+ * for uniform-random `uint32` keys.
+ * @plots_below
  *
- * \image html lsb_radix_sort_int32_keys.png
+ * @image html lsb_radix_sort_int32_keys.png
  *
  */
 struct DeviceRadixSort
 {
 
-    /******************************************************************//**
-     * \name KeyT-value pairs
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief Sorts key-value pairs into ascending order. (~<em>2N </em>auxiliary storage required)
-     *
-     * \par
-     * - The contents of the input data are not altered by the sorting operation.
-     * - Pointers to contiguous memory must be used; iterators are not currently
-     *   supported.
-     * - In-place operations are not supported. There must be no overlap between
-     *   any of the provided ranges:
-     *   - `[d_keys_in,    d_keys_in    + num_items)`
-     *   - `[d_keys_out,   d_keys_out   + num_items)`
-     *   - `[d_values_in,  d_values_in  + num_items)`
-     *   - `[d_values_out, d_values_out + num_items)`
-     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
-     * - \devicestorageNP  For sorting using only <em>O</em>(<tt>P</tt>) temporary storage, see the sorting interface using DoubleBuffer wrappers below.
-     * - \devicestorage
-     *
-     * \par Performance
-     * The following charts illustrate saturated sorting performance across different
-     * CUDA architectures for uniform-random <tt>uint32,uint32</tt> and
-     * <tt>uint64,uint64</tt> pairs, respectively.
-     *
-     * \image html lsb_radix_sort_int32_pairs.png
-     * \image html lsb_radix_sort_int64_pairs.png
-     *
-     * \par Snippet
-     * The code snippet below illustrates the sorting of a device vector of \p int keys
-     * with associated vector of \p int values.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for sorting data
-     * int  num_items;          // e.g., 7
-     * int  *d_keys_in;         // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int  *d_keys_out;        // e.g., [        ...        ]
-     * int  *d_values_in;       // e.g., [0, 1, 2, 3, 4, 5, 6]
-     * int  *d_values_out;      // e.g., [        ...        ]
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
-     *     d_keys_in, d_keys_out, d_values_in, d_values_out, num_items);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run sorting operation
-     * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
-     *     d_keys_in, d_keys_out, d_values_in, d_values_out, num_items);
-     *
-     * // d_keys_out            <-- [0, 3, 5, 6, 7, 8, 9]
-     * // d_values_out          <-- [5, 4, 3, 1, 2, 0, 6]
-     *
-     * \endcode
-     *
-     * \tparam KeyT      <b>[inferred]</b> KeyT type
-     * \tparam ValueT    <b>[inferred]</b> ValueT type
-     * \tparam NumItemsT <b>[inferred]</b> Type of num_items
-     */
-    template <
-        typename            KeyT,
-        typename            ValueT,
-        typename            NumItemsT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t SortPairs(
-        void                *d_temp_storage,                        ///< [in] Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        const KeyT          *d_keys_in,                             ///< [in] Pointer to the input data of key data to sort
-        KeyT                *d_keys_out,                            ///< [out] Pointer to the sorted output sequence of key data
-        const ValueT        *d_values_in,                           ///< [in] Pointer to the corresponding input sequence of associated value items
-        ValueT              *d_values_out,                          ///< [out] Pointer to the correspondingly-reordered output sequence of associated value items
-        NumItemsT           num_items,                              ///< [in] Number of items to sort
-        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
-        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
-        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Unsigned integer type for global offsets.
-        using OffsetT = typename detail::ChooseOffsetT<NumItemsT>::Type;
-        
-        // We cast away const-ness, but will *not* write to these arrays.
-        // `DispatchRadixSort::Dispatch` will allocate temporary storage and
-        // create a new double-buffer internally when the `is_overwrite_ok` flag
-        // is not set.
-        constexpr bool is_overwrite_okay = false;
-        DoubleBuffer<KeyT>       d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
-        DoubleBuffer<ValueT>     d_values(const_cast<ValueT*>(d_values_in), d_values_out);
-
-        return DispatchRadixSort<false, KeyT, ValueT, OffsetT>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_keys,
-            d_values,
-            static_cast<OffsetT>(num_items),
-            begin_bit,
-            end_bit,
-            is_overwrite_okay,
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Sorts key-value pairs into ascending order. (~<em>N </em>auxiliary storage required)
-     *
-     * \par
-     * - The sorting operation is given a pair of key buffers and a corresponding
-     *   pair of associated value buffers.  Each pair is managed by a DoubleBuffer
-     *   structure that indicates which of the two buffers is "current" (and thus
-     *   contains the input data to be sorted).
-     * - The contents of both buffers within each pair may be altered by the sorting
-     *   operation.
-     * - In-place operations are not supported. There must be no overlap between
-     *   any of the provided ranges:
-     *   - `[d_keys.Current(),     d_keys.Current()     + num_items)`
-     *   - `[d_keys.Alternate(),   d_keys.Alternate()   + num_items)`
-     *   - `[d_values.Current(),   d_values.Current()   + num_items)`
-     *   - `[d_values.Alternate(), d_values.Alternate() + num_items)`
-     * - Upon completion, the sorting operation will update the "current" indicator
-     *   within each DoubleBuffer wrapper to reference which of the two buffers
-     *   now contains the sorted output sequence (a function of the number of key bits
-     *   specified and the targeted device architecture).
-     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
-     * - \devicestorageP
-     * - \devicestorage
-     *
-     * \par Performance
-     * The following charts illustrate saturated sorting performance across different
-     * CUDA architectures for uniform-random <tt>uint32,uint32</tt> and
-     * <tt>uint64,uint64</tt> pairs, respectively.
-     *
-     * \image html lsb_radix_sort_int32_pairs.png
-     * \image html lsb_radix_sort_int64_pairs.png
-     *
-     * \par Snippet
-     * The code snippet below illustrates the sorting of a device vector of \p int keys
-     * with associated vector of \p int values.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for sorting data
-     * int  num_items;          // e.g., 7
-     * int  *d_key_buf;         // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int  *d_key_alt_buf;     // e.g., [        ...        ]
-     * int  *d_value_buf;       // e.g., [0, 1, 2, 3, 4, 5, 6]
-     * int  *d_value_alt_buf;   // e.g., [        ...        ]
-     * ...
-     *
-     * // Create a set of DoubleBuffers to wrap pairs of device pointers
-     * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
-     * cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf);
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run sorting operation
-     * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items);
-     *
-     * // d_keys.Current()      <-- [0, 3, 5, 6, 7, 8, 9]
-     * // d_values.Current()    <-- [5, 4, 3, 1, 2, 0, 6]
-     *
-     * \endcode
-     *
-     * \tparam KeyT      <b>[inferred]</b> KeyT type
-     * \tparam ValueT    <b>[inferred]</b> ValueT type
-     * \tparam NumItemsT <b>[inferred]</b> Type of num_items
-     */
-    template <
-        typename            KeyT,
-        typename            ValueT,
-        typename            NumItemsT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t SortPairs(
-        void                    *d_temp_storage,                        ///< [in] Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t                  &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        DoubleBuffer<KeyT>      &d_keys,                                ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
-        DoubleBuffer<ValueT>    &d_values,                              ///< [in,out] Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
-        NumItemsT               num_items,                              ///< [in] Number of items to sort
-        int                     begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
-        int                     end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
-        cudaStream_t            stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                    debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Unsigned integer type for global offsets.
-        using OffsetT = typename detail::ChooseOffsetT<NumItemsT>::Type;
-
-        constexpr bool is_overwrite_okay = true;
-
-        return DispatchRadixSort<false, KeyT, ValueT, OffsetT>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_keys,
-            d_values,
-            num_items,
-            begin_bit,
-            end_bit,
-            is_overwrite_okay,
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Sorts key-value pairs into descending order. (~<em>2N</em> auxiliary storage required).
-     *
-     * \par
-     * - The contents of the input data are not altered by the sorting operation.
-     * - Pointers to contiguous memory must be used; iterators are not currently
-     *   supported.
-     * - In-place operations are not supported. There must be no overlap between
-     *   any of the provided ranges:
-     *   - `[d_keys_in,    d_keys_in    + num_items)`
-     *   - `[d_keys_out,   d_keys_out   + num_items)`
-     *   - `[d_values_in,  d_values_in  + num_items)`
-     *   - `[d_values_out, d_values_out + num_items)`
-     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
-     * - \devicestorageNP  For sorting using only <em>O</em>(<tt>P</tt>) temporary storage, see the sorting interface using DoubleBuffer wrappers below.
-     * - \devicestorage
-     *
-     * \par Performance
-     * Performance is similar to DeviceRadixSort::SortPairs.
-     *
-     * \par Snippet
-     * The code snippet below illustrates the sorting of a device vector of \p int keys
-     * with associated vector of \p int values.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for sorting data
-     * int  num_items;          // e.g., 7
-     * int  *d_keys_in;         // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int  *d_keys_out;        // e.g., [        ...        ]
-     * int  *d_values_in;       // e.g., [0, 1, 2, 3, 4, 5, 6]
-     * int  *d_values_out;      // e.g., [        ...        ]
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes,
-     *     d_keys_in, d_keys_out, d_values_in, d_values_out, num_items);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run sorting operation
-     * cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes,
-     *     d_keys_in, d_keys_out, d_values_in, d_values_out, num_items);
-     *
-     * // d_keys_out            <-- [9, 8, 7, 6, 5, 3, 0]
-     * // d_values_out          <-- [6, 0, 2, 1, 3, 4, 5]
-     *
-     * \endcode
-     *
-     * \tparam KeyT      <b>[inferred]</b> KeyT type
-     * \tparam ValueT    <b>[inferred]</b> ValueT type
-     * \tparam NumItemsT <b>[inferred]</b> Type of num_items
-     */
-    template <
-        typename            KeyT,
-        typename            ValueT,
-        typename            NumItemsT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t SortPairsDescending(
-        void                *d_temp_storage,                        ///< [in] Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        const KeyT          *d_keys_in,                             ///< [in] Pointer to the input data of key data to sort
-        KeyT                *d_keys_out,                            ///< [out] Pointer to the sorted output sequence of key data
-        const ValueT        *d_values_in,                           ///< [in] Pointer to the corresponding input sequence of associated value items
-        ValueT              *d_values_out,                          ///< [out] Pointer to the correspondingly-reordered output sequence of associated value items
-        NumItemsT           num_items,                              ///< [in] Number of items to sort
-        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
-        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
-        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Unsigned integer type for global offsets.
-        using OffsetT = typename detail::ChooseOffsetT<NumItemsT>::Type;
-
-        // We cast away const-ness, but will *not* write to these arrays.
-        // `DispatchRadixSort::Dispatch` will allocate temporary storage and
-        // create a new double-buffer internally when the `is_overwrite_ok` flag
-        // is not set.
-        constexpr bool is_overwrite_okay = false;
-        DoubleBuffer<KeyT>       d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
-        DoubleBuffer<ValueT>     d_values(const_cast<ValueT*>(d_values_in), d_values_out);
-
-        return DispatchRadixSort<true, KeyT, ValueT, OffsetT>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_keys,
-            d_values,
-            num_items,
-            begin_bit,
-            end_bit,
-            is_overwrite_okay,
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Sorts key-value pairs into descending order. (~<em>N </em>auxiliary storage required).
-     *
-     * \par
-     * - The sorting operation is given a pair of key buffers and a corresponding
-     *   pair of associated value buffers.  Each pair is managed by a DoubleBuffer
-     *   structure that indicates which of the two buffers is "current" (and thus
-     *   contains the input data to be sorted).
-     * - The contents of both buffers within each pair may be altered by the sorting
-     *   operation.
-     * - In-place operations are not supported. There must be no overlap between
-     *   any of the provided ranges:
-     *   - `[d_keys.Current(),     d_keys.Current()     + num_items)`
-     *   - `[d_keys.Alternate(),   d_keys.Alternate()   + num_items)`
-     *   - `[d_values.Current(),   d_values.Current()   + num_items)`
-     *   - `[d_values.Alternate(), d_values.Alternate() + num_items)`
-     * - Upon completion, the sorting operation will update the "current" indicator
-     *   within each DoubleBuffer wrapper to reference which of the two buffers
-     *   now contains the sorted output sequence (a function of the number of key bits
-     *   specified and the targeted device architecture).
-     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
-     * - \devicestorageP
-     * - \devicestorage
-     *
-     * \par Performance
-     * Performance is similar to DeviceRadixSort::SortPairs.
-     *
-     * \par Snippet
-     * The code snippet below illustrates the sorting of a device vector of \p int keys
-     * with associated vector of \p int values.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for sorting data
-     * int  num_items;          // e.g., 7
-     * int  *d_key_buf;         // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int  *d_key_alt_buf;     // e.g., [        ...        ]
-     * int  *d_value_buf;       // e.g., [0, 1, 2, 3, 4, 5, 6]
-     * int  *d_value_alt_buf;   // e.g., [        ...        ]
-     * ...
-     *
-     * // Create a set of DoubleBuffers to wrap pairs of device pointers
-     * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
-     * cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf);
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run sorting operation
-     * cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items);
-     *
-     * // d_keys.Current()      <-- [9, 8, 7, 6, 5, 3, 0]
-     * // d_values.Current()    <-- [6, 0, 2, 1, 3, 4, 5]
-     *
-     * \endcode
-     *
-     * \tparam KeyT      <b>[inferred]</b> KeyT type
-     * \tparam ValueT    <b>[inferred]</b> ValueT type
-     * \tparam NumItemsT <b>[inferred]</b> Type of num_items
-     */
-    template <
-        typename            KeyT,
-        typename            ValueT,
-        typename            NumItemsT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t SortPairsDescending(
-        void                    *d_temp_storage,                        ///< [in] Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t                  &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        DoubleBuffer<KeyT>      &d_keys,                                ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
-        DoubleBuffer<ValueT>    &d_values,                              ///< [in,out] Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
-        NumItemsT               num_items,                              ///< [in] Number of items to sort
-        int                     begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
-        int                     end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
-        cudaStream_t            stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                    debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Unsigned integer type for global offsets.
-        using OffsetT = typename detail::ChooseOffsetT<NumItemsT>::Type;
-
-        constexpr bool is_overwrite_okay = true;
-
-        return DispatchRadixSort<true, KeyT, ValueT, OffsetT>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_keys,
-            d_values,
-            num_items,
-            begin_bit,
-            end_bit,
-            is_overwrite_okay,
-            stream,
-            debug_synchronous);
-    }
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Keys-only
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Sorts keys into ascending order. (~<em>2N </em>auxiliary storage required)
-     *
-     * \par
-     * - The contents of the input data are not altered by the sorting operation.
-     * - Pointers to contiguous memory must be used; iterators are not currently
-     *   supported.
-     * - In-place operations are not supported. There must be no overlap between
-     *   any of the provided ranges:
-     *   - `[d_keys_in,    d_keys_in    + num_items)`
-     *   - `[d_keys_out,   d_keys_out   + num_items)`
-     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
-     * - \devicestorageNP  For sorting using only <em>O</em>(<tt>P</tt>) temporary storage, see the sorting interface using DoubleBuffer wrappers below.
-     * - \devicestorage
-     *
-     * \par Performance
-     * The following charts illustrate saturated sorting performance across different
-     * CUDA architectures for uniform-random \p uint32 and \p uint64 keys, respectively.
-     *
-     * \image html lsb_radix_sort_int32_keys.png
-     * \image html lsb_radix_sort_int64_keys.png
-     *
-     * \par Snippet
-     * The code snippet below illustrates the sorting of a device vector of \p int keys.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for sorting data
-     * int  num_items;          // e.g., 7
-     * int  *d_keys_in;         // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int  *d_keys_out;        // e.g., [        ...        ]
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run sorting operation
-     * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items);
-     *
-     * // d_keys_out            <-- [0, 3, 5, 6, 7, 8, 9]
-     *
-     * \endcode
-     *
-     * \tparam KeyT      <b>[inferred]</b> KeyT type
-     * \tparam NumItemsT <b>[inferred]</b> Type of num_items
-     * \tparam NumItemsT <b>[inferred]</b> Type of num_items
-     */
-    template <typename KeyT,
-              typename NumItemsT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t SortKeys(
-        void                *d_temp_storage,                        ///< [in] Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        const KeyT          *d_keys_in,                             ///< [in] Pointer to the input data of key data to sort
-        KeyT                *d_keys_out,                            ///< [out] Pointer to the sorted output sequence of key data
-        NumItemsT           num_items,                              ///< [in] Number of items to sort
-        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
-        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
-        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Unsigned integer type for global offsets.
-        using OffsetT = typename detail::ChooseOffsetT<NumItemsT>::Type;
-
-        // We cast away const-ness, but will *not* write to these arrays.
-        // `DispatchRadixSort::Dispatch` will allocate temporary storage and
-        // create a new double-buffer internally when the `is_overwrite_ok` flag
-        // is not set.
-        constexpr bool is_overwrite_okay = false;
-        DoubleBuffer<KeyT>      d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
-        // Null value type
-        DoubleBuffer<NullType>  d_values;
-
-        return DispatchRadixSort<false, KeyT, NullType, OffsetT>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_keys,
-            d_values,
-            static_cast<OffsetT>(num_items),
-            begin_bit,
-            end_bit,
-            is_overwrite_okay,
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Sorts keys into ascending order. (~<em>N </em>auxiliary storage required).
-     *
-     * \par
-     * - The sorting operation is given a pair of key buffers managed by a
-     *   DoubleBuffer structure that indicates which of the two buffers is
-     *   "current" (and thus contains the input data to be sorted).
-     * - The contents of both buffers may be altered by the sorting operation.
-     * - In-place operations are not supported. There must be no overlap between
-     *   any of the provided ranges:
-     *   - `[d_keys.Current(),     d_keys.Current()     + num_items)`
-     *   - `[d_keys.Alternate(),   d_keys.Alternate()   + num_items)`
-     * - Upon completion, the sorting operation will update the "current" indicator
-     *   within the DoubleBuffer wrapper to reference which of the two buffers
-     *   now contains the sorted output sequence (a function of the number of key bits
-     *   specified and the targeted device architecture).
-     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
-     * - \devicestorageP
-     * - \devicestorage
-     *
-     * \par Performance
-     * The following charts illustrate saturated sorting performance across different
-     * CUDA architectures for uniform-random \p uint32 and \p uint64 keys, respectively.
-     *
-     * \image html lsb_radix_sort_int32_keys.png
-     * \image html lsb_radix_sort_int64_keys.png
-     *
-     * \par Snippet
-     * The code snippet below illustrates the sorting of a device vector of \p int keys.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for sorting data
-     * int  num_items;          // e.g., 7
-     * int  *d_key_buf;         // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int  *d_key_alt_buf;     // e.g., [        ...        ]
-     * ...
-     *
-     * // Create a DoubleBuffer to wrap the pair of device pointers
-     * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run sorting operation
-     * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items);
-     *
-     * // d_keys.Current()      <-- [0, 3, 5, 6, 7, 8, 9]
-     *
-     * \endcode
-     *
-     * \tparam KeyT      <b>[inferred]</b> KeyT type
-     * \tparam NumItemsT <b>[inferred]</b> Type of num_items
-     */
-    template <typename KeyT,
-              typename NumItemsT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t SortKeys(
-        void                *d_temp_storage,                        ///< [in] Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        DoubleBuffer<KeyT>  &d_keys,                                ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
-        NumItemsT           num_items,                              ///< [in] Number of items to sort
-        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
-        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
-        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Unsigned integer type for global offsets.
-        using OffsetT = typename detail::ChooseOffsetT<NumItemsT>::Type;
-
-        constexpr bool is_overwrite_okay = true;
-
-        // Null value type
-        DoubleBuffer<NullType> d_values;
-
-        return DispatchRadixSort<false, KeyT, NullType, OffsetT>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_keys,
-            d_values,
-            num_items,
-            begin_bit,
-            end_bit,
-            is_overwrite_okay,
-            stream,
-            debug_synchronous);
-    }
-
-    /**
-     * \brief Sorts keys into descending order. (~<em>2N</em> auxiliary storage required).
-     *
-     * \par
-     * - The contents of the input data are not altered by the sorting operation.
-     * - Pointers to contiguous memory must be used; iterators are not currently
-     *   supported.
-     * - In-place operations are not supported. There must be no overlap between
-     *   any of the provided ranges:
-     *   - `[d_keys_in,    d_keys_in    + num_items)`
-     *   - `[d_keys_out,   d_keys_out   + num_items)`
-     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
-     * - \devicestorageNP  For sorting using only <em>O</em>(<tt>P</tt>) temporary storage, see the sorting interface using DoubleBuffer wrappers below.
-     * - \devicestorage
-     *
-     * \par Performance
-     * Performance is similar to DeviceRadixSort::SortKeys.
-     *
-     * \par Snippet
-     * The code snippet below illustrates the sorting of a device vector of \p int keys.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for sorting data
-     * int  num_items;          // e.g., 7
-     * int  *d_keys_in;         // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int  *d_keys_out;        // e.g., [        ...        ]
-     * ...
-     *
-     * // Create a DoubleBuffer to wrap the pair of device pointers
-     * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run sorting operation
-     * cub::DeviceRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items);
-     *
-     * // d_keys_out            <-- [9, 8, 7, 6, 5, 3, 0]s
-     *
-     * \endcode
-     *
-     * \tparam KeyT      <b>[inferred]</b> KeyT type
-     * \tparam NumItemsT <b>[inferred]</b> Type of num_items
-     */
-    template <typename KeyT,
-              typename NumItemsT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t SortKeysDescending(
-        void                *d_temp_storage,                        ///< [in] Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        const KeyT          *d_keys_in,                             ///< [in] Pointer to the input data of key data to sort
-        KeyT                *d_keys_out,                            ///< [out] Pointer to the sorted output sequence of key data
-        NumItemsT           num_items,                              ///< [in] Number of items to sort
-        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
-        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
-        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Unsigned integer type for global offsets.
-        using OffsetT = typename detail::ChooseOffsetT<NumItemsT>::Type;
-
-        // We cast away const-ness, but will *not* write to these arrays.
-        // `DispatchRadixSort::Dispatch` will allocate temporary storage and
-        // create a new double-buffer internally when the `is_overwrite_ok` flag
-        // is not set.
-        constexpr bool is_overwrite_okay = false;
-        DoubleBuffer<KeyT>      d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
-        DoubleBuffer<NullType>  d_values;
-
-        return DispatchRadixSort<true, KeyT, NullType, OffsetT>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_keys,
-            d_values,
-            num_items,
-            begin_bit,
-            end_bit,
-            is_overwrite_okay,
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Sorts keys into descending order. (~<em>N </em>auxiliary storage required).
-     *
-     * \par
-     * - The sorting operation is given a pair of key buffers managed by a
-     *   DoubleBuffer structure that indicates which of the two buffers is
-     *   "current" (and thus contains the input data to be sorted).
-     * - The contents of both buffers may be altered by the sorting operation.
-     * - In-place operations are not supported. There must be no overlap between
-     *   any of the provided ranges:
-     *   - `[d_keys.Current(),     d_keys.Current()     + num_items)`
-     *   - `[d_keys.Alternate(),   d_keys.Alternate()   + num_items)`
-     * - Upon completion, the sorting operation will update the "current" indicator
-     *   within the DoubleBuffer wrapper to reference which of the two buffers
-     *   now contains the sorted output sequence (a function of the number of key bits
-     *   specified and the targeted device architecture).
-     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
-     * - \devicestorageP
-     * - \devicestorage
-     *
-     * \par Performance
-     * Performance is similar to DeviceRadixSort::SortKeys.
-     *
-     * \par Snippet
-     * The code snippet below illustrates the sorting of a device vector of \p int keys.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for sorting data
-     * int  num_items;          // e.g., 7
-     * int  *d_key_buf;         // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int  *d_key_alt_buf;     // e.g., [        ...        ]
-     * ...
-     *
-     * // Create a DoubleBuffer to wrap the pair of device pointers
-     * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys, num_items);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run sorting operation
-     * cub::DeviceRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys, num_items);
-     *
-     * // d_keys.Current()      <-- [9, 8, 7, 6, 5, 3, 0]
-     *
-     * \endcode
-     *
-     * \tparam KeyT      <b>[inferred]</b> KeyT type
-     * \tparam NumItemsT <b>[inferred]</b> Type of num_items
-     */
-    template <typename KeyT,
-              typename NumItemsT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t SortKeysDescending(
-        void                *d_temp_storage,                        ///< [in] Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        DoubleBuffer<KeyT>  &d_keys,                                ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
-        NumItemsT           num_items,                              ///< [in] Number of items to sort
-        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
-        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
-        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Unsigned integer type for global offsets.
-        using OffsetT = typename detail::ChooseOffsetT<NumItemsT>::Type;
-
-        constexpr bool is_overwrite_okay = true;
-
-        // Null value type
-        DoubleBuffer<NullType> d_values;
-
-        return DispatchRadixSort<true, KeyT, NullType, OffsetT>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_keys,
-            d_values,
-            num_items,
-            begin_bit,
-            end_bit,
-            is_overwrite_okay,
-            stream,
-            debug_synchronous);
-    }
-
-
-    //@}  end member group
+  /******************************************************************//**
+   * \name Key-value pairs
+   *********************************************************************/
+  //@{
 
+  /**
+   * @brief Sorts key-value pairs into ascending order. 
+   *        (`~2N` auxiliary storage required)
+   *
+   * @par
+   * - The contents of the input data are not altered by the sorting operation.
+   * - Pointers to contiguous memory must be used; iterators are not currently
+   *   supported.
+   * - In-place operations are not supported. There must be no overlap between
+   *   any of the provided ranges:
+   *   - `[d_keys_in,    d_keys_in    + num_items)`
+   *   - `[d_keys_out,   d_keys_out   + num_items)`
+   *   - `[d_values_in,  d_values_in  + num_items)`
+   *   - `[d_values_out, d_values_out + num_items)`
+   * - An optional bit subrange `[begin_bit, end_bit)` of differentiating key 
+   *   bits can be specified. This can reduce overall sorting overhead and 
+   *   yield a corresponding performance improvement.
+   * - @devicestorageNP For sorting using only `O(P)` temporary storage, see 
+   *   the sorting interface using DoubleBuffer wrappers below.
+   * - @devicestorage
+   *
+   * @par Performance
+   * The following charts illustrate saturated sorting performance across 
+   * different CUDA architectures for uniform-random `uint32, uint32` and
+   * `uint64, uint64` pairs, respectively.
+   *
+   * @image html lsb_radix_sort_int32_pairs.png
+   * @image html lsb_radix_sort_int64_pairs.png
+   *
+   * @par Snippet
+   * The code snippet below illustrates the sorting of a device vector of `int`
+   * keys with associated vector of `int` values.
+   * @par
+   * @code
+   * #include <cub/cub.cuh>   
+   * // or equivalently <cub/device/device_radix_sort.cuh>
+   *
+   * // Declare, allocate, and initialize device-accessible pointers 
+   * // for sorting data
+   * int  num_items;          // e.g., 7
+   * int  *d_keys_in;         // e.g., [8, 6, 7, 5, 3, 0, 9]
+   * int  *d_keys_out;        // e.g., [        ...        ]
+   * int  *d_values_in;       // e.g., [0, 1, 2, 3, 4, 5, 6]
+   * int  *d_values_out;      // e.g., [        ...        ]
+   * ...
+   *
+   * // Determine temporary device storage requirements
+   * void     *d_temp_storage = NULL;
+   * size_t   temp_storage_bytes = 0;
+   * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
+   *     d_keys_in, d_keys_out, d_values_in, d_values_out, num_items);
+   *
+   * // Allocate temporary storage
+   * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+   *
+   * // Run sorting operation
+   * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
+   *     d_keys_in, d_keys_out, d_values_in, d_values_out, num_items);
+   *
+   * // d_keys_out            <-- [0, 3, 5, 6, 7, 8, 9]
+   * // d_values_out          <-- [5, 4, 3, 1, 2, 0, 6]
+   * @endcode
+   *
+   * @tparam KeyT      
+   *   **[inferred]** KeyT type
+   *
+   * @tparam ValueT    
+   *   **[inferred]** ValueT type
+   *
+   * @tparam NumItemsT 
+   *   **[inferred]** Type of num_items
+   *
+   * @param[in] d_temp_storage 
+   *   Device-accessible allocation of temporary storage. When `nullptr`, the 
+   *   required allocation size is written to `temp_storage_bytes` and no work 
+   *   is done.
+   *
+   * @param[in,out] temp_storage_bytes 
+   *   Reference to size in bytes of `d_temp_storage` allocation
+   *
+   * @param[in] d_keys_in 
+   *   Pointer to the input data of key data to sort
+   *
+   * @param[out] d_keys_out 
+   *   Pointer to the sorted output sequence of key data
+   *
+   * @param[in] d_values_in 
+   *   Pointer to the corresponding input sequence of associated value items
+   *
+   * @param[out] d_values_out 
+   *   Pointer to the correspondingly-reordered output sequence of associated 
+   *   value items
+   *
+   * @param[in] num_items 
+   *   Number of items to sort
+   *
+   * @param[in] begin_bit 
+   *   **[optional]** The least-significant bit index (inclusive) needed for 
+   *   key comparison
+   *
+   * @param[in] end_bit 
+   *   **[optional]** The most-significant bit index (exclusive) needed for key 
+   *   comparison (e.g., sizeof(unsigned int) * 8)
+   *
+   * @param[in] stream 
+   *   **[optional]** CUDA stream to launch kernels within. 
+   *   Default is stream<sub>0</sub>.
+   *
+   * @param[in] debug_synchronous 
+   *   **[optional]** Whether or not to synchronize the stream after every 
+   *   kernel launch to check for errors. Also causes launch configurations to 
+   *   be printed to the console. Default is `false`.
+   */
+  template <typename KeyT, typename ValueT, typename NumItemsT>
+  CUB_RUNTIME_FUNCTION static cudaError_t
+  SortPairs(void *d_temp_storage,
+            size_t &temp_storage_bytes,
+            const KeyT *d_keys_in,
+            KeyT *d_keys_out,
+            const ValueT *d_values_in,
+            ValueT *d_values_out,
+            NumItemsT num_items,
+            int begin_bit          = 0,
+            int end_bit            = sizeof(KeyT) * 8,
+            cudaStream_t stream    = 0,
+            bool debug_synchronous = false)
+  {
+    // Unsigned integer type for global offsets.
+    using OffsetT = typename detail::ChooseOffsetT<NumItemsT>::Type;
+
+    // The const_casts below exist only so the inputs can be wrapped in
+    // DoubleBuffers; the input arrays are *not* written to. Because
+    // `is_overwrite_okay` is false, `DispatchRadixSort::Dispatch` allocates
+    // temporary storage and creates a new double-buffer internally.
+    constexpr bool is_overwrite_okay = false;
+    DoubleBuffer<KeyT> d_keys(const_cast<KeyT *>(d_keys_in), d_keys_out);
+    DoubleBuffer<ValueT> d_values(const_cast<ValueT *>(d_values_in),
+                                  d_values_out);
+
+    return DispatchRadixSort<false, KeyT, ValueT, OffsetT>::Dispatch(
+      d_temp_storage,
+      temp_storage_bytes,
+      d_keys,
+      d_values,
+      static_cast<OffsetT>(num_items),
+      begin_bit,
+      end_bit,
+      is_overwrite_okay,
+      stream,
+      debug_synchronous);
+  }
+
+  /**
+   * @brief Sorts key-value pairs into ascending order. 
+   *        (`~N` auxiliary storage required)
+   *
+   * @par
+   * - The sorting operation is given a pair of key buffers and a corresponding
+   *   pair of associated value buffers.  Each pair is managed by a DoubleBuffer
+   *   structure that indicates which of the two buffers is "current" (and thus
+   *   contains the input data to be sorted).
+   * - The contents of both buffers within each pair may be altered by the 
+   *   sorting operation.
+   * - In-place operations are not supported. There must be no overlap between
+   *   any of the provided ranges:
+   *   - `[d_keys.Current(),     d_keys.Current()     + num_items)`
+   *   - `[d_keys.Alternate(),   d_keys.Alternate()   + num_items)`
+   *   - `[d_values.Current(),   d_values.Current()   + num_items)`
+   *   - `[d_values.Alternate(), d_values.Alternate() + num_items)`
+   * - Upon completion, the sorting operation will update the "current" 
+   *   indicator within each DoubleBuffer wrapper to reference which of the two 
+   *   buffers now contains the sorted output sequence (a function of the 
+   *   number of key bits specified and the targeted device architecture).
+   * - An optional bit subrange `[begin_bit, end_bit)` of differentiating key 
+   *   bits can be specified. This can reduce overall sorting overhead and 
+   *   yield a corresponding performance improvement.
+   * - @devicestorageP
+   * - @devicestorage
+   *
+   * @par Performance
+   * The following charts illustrate saturated sorting performance across 
+   * different CUDA architectures for uniform-random `uint32, uint32` and
+   * `uint64, uint64` pairs, respectively.
+   *
+   * @image html lsb_radix_sort_int32_pairs.png
+   * @image html lsb_radix_sort_int64_pairs.png
+   *
+   * @par Snippet
+   * The code snippet below illustrates the sorting of a device vector of `int` 
+   * keys with associated vector of `int` values.
+   * @par
+   * @code
+   * #include <cub/cub.cuh>   
+   * // or equivalently <cub/device/device_radix_sort.cuh>
+   *
+   * // Declare, allocate, and initialize device-accessible pointers for 
+   * // sorting data
+   * int  num_items;          // e.g., 7
+   * int  *d_key_buf;         // e.g., [8, 6, 7, 5, 3, 0, 9]
+   * int  *d_key_alt_buf;     // e.g., [        ...        ]
+   * int  *d_value_buf;       // e.g., [0, 1, 2, 3, 4, 5, 6]
+   * int  *d_value_alt_buf;   // e.g., [        ...        ]
+   * ...
+   *
+   * // Create a set of DoubleBuffers to wrap pairs of device pointers
+   * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
+   * cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf);
+   *
+   * // Determine temporary device storage requirements
+   * void     *d_temp_storage = NULL;
+   * size_t   temp_storage_bytes = 0;
+   * cub::DeviceRadixSort::SortPairs(
+   *   d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items);
+   *
+   * // Allocate temporary storage
+   * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+   *
+   * // Run sorting operation
+   * cub::DeviceRadixSort::SortPairs(
+   *   d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items);
+   *
+   * // d_keys.Current()      <-- [0, 3, 5, 6, 7, 8, 9]
+   * // d_values.Current()    <-- [5, 4, 3, 1, 2, 0, 6]
+   *
+   * @endcode
+   *
+   * @tparam KeyT      
+   *   **[inferred]** KeyT type
+   *
+   * @tparam ValueT    
+   *   **[inferred]** ValueT type
+   *
+   * @tparam NumItemsT 
+   *   **[inferred]** Type of num_items
+   *
+   * @param[in] d_temp_storage 
+   *   Device-accessible allocation of temporary storage. When `nullptr`, the 
+   *   required allocation size is written to `temp_storage_bytes` and no work is done.
+   *
+   * @param[in,out] temp_storage_bytes 
+   *   Reference to size in bytes of `d_temp_storage` allocation
+   *
+   * @param[in,out] d_keys 
+   *   Reference to the double-buffer of keys whose "current" device-accessible 
+   *   buffer contains the unsorted input keys and, upon return, is updated to 
+   *   point to the sorted output keys
+   *
+   * @param[in,out] d_values 
+   *   Double-buffer of values whose "current" device-accessible buffer 
+   *   contains the unsorted input values and, upon return, is updated to point 
+   *   to the sorted output values
+   *
+   * @param[in] num_items 
+   *   Number of items to sort
+   *
+   * @param[in] begin_bit 
+   *   **[optional]** The least-significant bit index (inclusive) needed for 
+   *   key comparison
+   *
+   * @param[in] end_bit 
+   *   **[optional]** The most-significant bit index (exclusive) needed for key 
+   *   comparison (e.g., `sizeof(unsigned int) * 8`)
+   *
+   * @param[in] stream 
+   *   **[optional]** CUDA stream to launch kernels within. 
+   *   Default is stream<sub>0</sub>.
+   *
+   * @param[in] debug_synchronous 
+   *   **[optional]** Whether or not to synchronize the stream after every 
+   *   kernel launch to check for errors. Also causes launch configurations to 
+   *   be printed to the console. Default is `false`.
+   */
+  template <typename KeyT, typename ValueT, typename NumItemsT>
+  CUB_RUNTIME_FUNCTION static cudaError_t
+  SortPairs(void *d_temp_storage,
+            size_t &temp_storage_bytes,
+            DoubleBuffer<KeyT> &d_keys,
+            DoubleBuffer<ValueT> &d_values,
+            NumItemsT num_items,
+            int begin_bit          = 0,
+            int end_bit            = sizeof(KeyT) * 8,
+            cudaStream_t stream    = 0,
+            bool debug_synchronous = false)
+  {
+    // Unsigned integer type for global offsets, chosen from `NumItemsT`.
+    using OffsetT   = typename detail::ChooseOffsetT<NumItemsT>::Type;
+    using DispatchT = DispatchRadixSort<false, KeyT, ValueT, OffsetT>;
+
+    // The caller provides both halves of each DoubleBuffer, so the dispatch
+    // is told that it may overwrite them.
+    constexpr bool is_overwrite_okay = true;
+
+    // The item count is converted explicitly, matching the pointer-based
+    // `SortPairs` overload, instead of relying on an implicit conversion.
+    return DispatchT::Dispatch(d_temp_storage, temp_storage_bytes,
+                               d_keys, d_values,
+                               static_cast<OffsetT>(num_items),
+                               begin_bit, end_bit,
+                               is_overwrite_okay,
+                               stream, debug_synchronous);
+  }
+
+  /**
+   * @brief Sorts key-value pairs into descending order. 
+   *        (`~2N` auxiliary storage required).
+   *
+   * @par
+   * - The contents of the input data are not altered by the sorting operation.
+   * - Pointers to contiguous memory must be used; iterators are not currently
+   *   supported.
+   * - In-place operations are not supported. There must be no overlap between
+   *   any of the provided ranges:
+   *   - `[d_keys_in,    d_keys_in    + num_items)`
+   *   - `[d_keys_out,   d_keys_out   + num_items)`
+   *   - `[d_values_in,  d_values_in  + num_items)`
+   *   - `[d_values_out, d_values_out + num_items)`
+   * - An optional bit subrange `[begin_bit, end_bit)` of differentiating key 
+   *   bits can be specified. This can reduce overall sorting overhead and 
+   *   yield a corresponding performance improvement.
+   * - @devicestorageNP  For sorting using only `O(P)` temporary storage, see 
+   *   the sorting interface using DoubleBuffer wrappers below.
+   * - @devicestorage
+   *
+   * @par Performance
+   * Performance is similar to DeviceRadixSort::SortPairs.
+   *
+   * @par Snippet
+   * The code snippet below illustrates the sorting of a device vector of `int` 
+   * keys with associated vector of `int` values.
+   * @par
+   * @code
+   * #include <cub/cub.cuh>   
+   * // or equivalently <cub/device/device_radix_sort.cuh>
+   *
+   * // Declare, allocate, and initialize device-accessible pointers 
+   * // for sorting data
+   * int  num_items;          // e.g., 7
+   * int  *d_keys_in;         // e.g., [8, 6, 7, 5, 3, 0, 9]
+   * int  *d_keys_out;        // e.g., [        ...        ]
+   * int  *d_values_in;       // e.g., [0, 1, 2, 3, 4, 5, 6]
+   * int  *d_values_out;      // e.g., [        ...        ]
+   * ...
+   *
+   * // Determine temporary device storage requirements
+   * void     *d_temp_storage = NULL;
+   * size_t   temp_storage_bytes = 0;
+   * cub::DeviceRadixSort::SortPairsDescending(
+   *     d_temp_storage, temp_storage_bytes,
+   *     d_keys_in, d_keys_out, d_values_in, d_values_out, num_items);
+   *
+   * // Allocate temporary storage
+   * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+   *
+   * // Run sorting operation
+   * cub::DeviceRadixSort::SortPairsDescending(
+   *     d_temp_storage, temp_storage_bytes,
+   *     d_keys_in, d_keys_out, d_values_in, d_values_out, num_items);
+   *
+   * // d_keys_out            <-- [9, 8, 7, 6, 5, 3, 0]
+   * // d_values_out          <-- [6, 0, 2, 1, 3, 4, 5]
+   * @endcode
+   *
+   * @tparam KeyT      
+   *   **[inferred]** KeyT type
+   *
+   * @tparam ValueT    
+   *   **[inferred]** ValueT type
+   *
+   * @tparam NumItemsT 
+   *   **[inferred]** Type of num_items
+   *
+   * @param[in] d_temp_storage 
+   *   Device-accessible allocation of temporary storage. When `nullptr`, the 
+   *   required allocation size is written to `temp_storage_bytes` and no work 
+   *   is done.
+   *
+   * @param[in,out] temp_storage_bytes 
+   *   Reference to size in bytes of `d_temp_storage` allocation
+   *
+   * @param[in] d_keys_in 
+   *   Pointer to the input sequence of key data to sort
+   *
+   * @param[out] d_keys_out 
+   *   Pointer to the sorted output sequence of key data
+   *
+   * @param[in] d_values_in 
+   *   Pointer to the corresponding input sequence of associated value items
+   *
+   * @param[out] d_values_out 
+   *   Pointer to the correspondingly-reordered output sequence of associated 
+   *   value items
+   *
+   * @param[in] num_items 
+   *   Number of items to sort
+   *
+   * @param[in] begin_bit 
+   *   **[optional]** The least-significant bit index (inclusive) needed for 
+   *   key comparison
+   *
+   * @param[in] end_bit 
+   *   **[optional]** The most-significant bit index (exclusive) needed for key 
+   *   comparison (e.g., `sizeof(unsigned int) * 8`)
+   *
+   * @param[in] stream 
+   *   **[optional]** CUDA stream to launch kernels within. 
+   *   Default is stream<sub>0</sub>.
+   *
+   * @param[in] debug_synchronous 
+   *   **[optional]** Whether or not to synchronize the stream after every 
+   *   kernel launch to check for errors. Also causes launch configurations to 
+   *   be printed to the console. Default is `false`.
+   */
+  template <typename KeyT, typename ValueT, typename NumItemsT>
+  CUB_RUNTIME_FUNCTION static cudaError_t
+  SortPairsDescending(void *d_temp_storage,
+                      size_t &temp_storage_bytes,
+                      const KeyT *d_keys_in,
+                      KeyT *d_keys_out,
+                      const ValueT *d_values_in,
+                      ValueT *d_values_out,
+                      NumItemsT num_items,
+                      int begin_bit          = 0,
+                      int end_bit            = sizeof(KeyT) * 8,
+                      cudaStream_t stream    = 0,
+                      bool debug_synchronous = false)
+  {
+    // Unsigned integer type for global offsets, chosen from `NumItemsT`.
+    using OffsetT   = typename detail::ChooseOffsetT<NumItemsT>::Type;
+    using DispatchT = DispatchRadixSort<true, KeyT, ValueT, OffsetT>;
+
+    // The inputs are const for the caller. Wrapping them in DoubleBuffers
+    // drops the qualifier, but the arrays are not written to: with
+    // `is_overwrite_okay` unset, `DispatchRadixSort::Dispatch` will
+    // allocate temporary storage and create a new double-buffer internally.
+    constexpr bool is_overwrite_okay = false;
+
+    KeyT *const keys_in     = const_cast<KeyT *>(d_keys_in);
+    ValueT *const values_in = const_cast<ValueT *>(d_values_in);
+    DoubleBuffer<KeyT> d_keys(keys_in, d_keys_out);
+    DoubleBuffer<ValueT> d_values(values_in, d_values_out);
+
+    // The item count is converted explicitly, matching the pointer-based
+    // `SortPairs` overload, instead of relying on an implicit conversion.
+    return DispatchT::Dispatch(d_temp_storage, temp_storage_bytes,
+                               d_keys, d_values,
+                               static_cast<OffsetT>(num_items),
+                               begin_bit, end_bit,
+                               is_overwrite_okay,
+                               stream, debug_synchronous);
+  }
+
+  /**
+   * @brief Sorts key-value pairs into descending order. 
+   *        (`~N` auxiliary storage required).
+   *
+   * @par
+   * - The sorting operation is given a pair of key buffers and a corresponding
+   *   pair of associated value buffers.  Each pair is managed by a DoubleBuffer
+   *   structure that indicates which of the two buffers is "current" (and thus
+   *   contains the input data to be sorted).
+   * - The contents of both buffers within each pair may be altered by the 
+   *   sorting operation.
+   * - In-place operations are not supported. There must be no overlap between
+   *   any of the provided ranges:
+   *   - `[d_keys.Current(),     d_keys.Current()     + num_items)`
+   *   - `[d_keys.Alternate(),   d_keys.Alternate()   + num_items)`
+   *   - `[d_values.Current(),   d_values.Current()   + num_items)`
+   *   - `[d_values.Alternate(), d_values.Alternate() + num_items)`
+   * - Upon completion, the sorting operation will update the "current" 
+   *   indicator within each DoubleBuffer wrapper to reference which of the two 
+   *   buffers now contains the sorted output sequence (a function of the number 
+   *   of key bits specified and the targeted device architecture).
+   * - An optional bit subrange `[begin_bit, end_bit)` of differentiating key 
+   *   bits can be specified. This can reduce overall sorting overhead and 
+   *   yield a corresponding performance improvement.
+   * - @devicestorageP
+   * - @devicestorage
+   *
+   * @par Performance
+   * Performance is similar to DeviceRadixSort::SortPairs.
+   *
+   * @par Snippet
+   * The code snippet below illustrates the sorting of a device vector of `int` 
+   * keys with associated vector of `int` values.
+   * @par
+   * @code
+   * #include <cub/cub.cuh>   
+   * // or equivalently <cub/device/device_radix_sort.cuh>
+   *
+   * // Declare, allocate, and initialize device-accessible pointers 
+   * // for sorting data
+   * int  num_items;          // e.g., 7
+   * int  *d_key_buf;         // e.g., [8, 6, 7, 5, 3, 0, 9]
+   * int  *d_key_alt_buf;     // e.g., [        ...        ]
+   * int  *d_value_buf;       // e.g., [0, 1, 2, 3, 4, 5, 6]
+   * int  *d_value_alt_buf;   // e.g., [        ...        ]
+   * ...
+   *
+   * // Create a set of DoubleBuffers to wrap pairs of device pointers
+   * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
+   * cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf);
+   *
+   * // Determine temporary device storage requirements
+   * void     *d_temp_storage = NULL;
+   * size_t   temp_storage_bytes = 0;
+   * cub::DeviceRadixSort::SortPairsDescending(
+   *   d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items);
+   *
+   * // Allocate temporary storage
+   * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+   *
+   * // Run sorting operation
+   * cub::DeviceRadixSort::SortPairsDescending(
+   *   d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items);
+   *
+   * // d_keys.Current()      <-- [9, 8, 7, 6, 5, 3, 0]
+   * // d_values.Current()    <-- [6, 0, 2, 1, 3, 4, 5]
+   * @endcode
+   *
+   * @tparam KeyT      
+   *   **[inferred]** KeyT type
+   *
+   * @tparam ValueT    
+   *   **[inferred]** ValueT type
+   *
+   * @tparam NumItemsT 
+   *   **[inferred]** Type of num_items
+   *
+   * @param[in] d_temp_storage 
+   *   Device-accessible allocation of temporary storage. When `nullptr`, the 
+   *   required allocation size is written to `temp_storage_bytes` and no work 
+   *   is done.
+   *
+   * @param[in,out] temp_storage_bytes 
+   *   Reference to size in bytes of `d_temp_storage` allocation
+   *
+   * @param[in,out] d_keys 
+   *   Reference to the double-buffer of keys whose "current" device-accessible 
+   *   buffer contains the unsorted input keys and, upon return, is updated to 
+   *   point to the sorted output keys
+   *
+   * @param[in,out] d_values 
+   *   Double-buffer of values whose "current" device-accessible buffer 
+   *   contains the unsorted input values and, upon return, is updated to point 
+   *   to the sorted output values
+   *
+   * @param[in] num_items 
+   *   Number of items to sort
+   *
+   * @param[in] begin_bit 
+   *   **[optional]** The least-significant bit index (inclusive) needed for 
+   *   key comparison
+   *
+   * @param[in] end_bit 
+   *   **[optional]** The most-significant bit index (exclusive) needed for key 
+   *   comparison (e.g., `sizeof(unsigned int) * 8`)
+   *
+   * @param[in] stream 
+   *   **[optional]** CUDA stream to launch kernels within.  
+   *   Default is stream<sub>0</sub>.
+   *
+   * @param[in] debug_synchronous 
+   *   **[optional]** Whether or not to synchronize the stream after every 
+   *   kernel launch to check for errors. Also causes launch configurations to 
+   *   be printed to the console. Default is `false`.
+   */
+  template <typename KeyT, typename ValueT, typename NumItemsT>
+  CUB_RUNTIME_FUNCTION static cudaError_t
+  SortPairsDescending(void *d_temp_storage,
+                      size_t &temp_storage_bytes,
+                      DoubleBuffer<KeyT> &d_keys,
+                      DoubleBuffer<ValueT> &d_values,
+                      NumItemsT num_items,
+                      int begin_bit          = 0,
+                      int end_bit            = sizeof(KeyT) * 8,
+                      cudaStream_t stream    = 0,
+                      bool debug_synchronous = false)
+  {
+    // Unsigned integer type for global offsets, chosen from `NumItemsT`.
+    using OffsetT   = typename detail::ChooseOffsetT<NumItemsT>::Type;
+    using DispatchT = DispatchRadixSort<true, KeyT, ValueT, OffsetT>;
+
+    // The caller provides both halves of each DoubleBuffer, so the dispatch
+    // is told that it may overwrite them.
+    constexpr bool is_overwrite_okay = true;
+
+    // The item count is converted explicitly, matching the pointer-based
+    // `SortPairs` overload, instead of relying on an implicit conversion.
+    return DispatchT::Dispatch(d_temp_storage, temp_storage_bytes,
+                               d_keys, d_values,
+                               static_cast<OffsetT>(num_items),
+                               begin_bit, end_bit,
+                               is_overwrite_okay,
+                               stream, debug_synchronous);
+  }
+
+  //@}  end member group
+  /******************************************************************//**
+   * @name Keys-only
+   *********************************************************************/
+  //@{
+
+
+  /**
+   * @brief Sorts keys into ascending order. 
+   *        (`~2N` auxiliary storage required)
+   *
+   * @par
+   * - The contents of the input data are not altered by the sorting operation.
+   * - Pointers to contiguous memory must be used; iterators are not currently
+   *   supported.
+   * - In-place operations are not supported. There must be no overlap between
+   *   any of the provided ranges:
+   *   - `[d_keys_in,    d_keys_in    + num_items)`
+   *   - `[d_keys_out,   d_keys_out   + num_items)`
+   * - An optional bit subrange `[begin_bit, end_bit)` of differentiating key 
+   *   bits can be specified. This can reduce overall sorting overhead and 
+   *   yield a corresponding performance improvement.
+   * - @devicestorageNP  For sorting using only `O(P)` temporary storage, see 
+   *   the sorting interface using DoubleBuffer wrappers below.
+   * - @devicestorage
+   *
+   * @par Performance
+   * The following charts illustrate saturated sorting performance across 
+   * different CUDA architectures for uniform-random `uint32` and `uint64` 
+   * keys, respectively.
+   *
+   * @image html lsb_radix_sort_int32_keys.png
+   * @image html lsb_radix_sort_int64_keys.png
+   *
+   * @par Snippet
+   * The code snippet below illustrates the sorting of a device vector of 
+   * `int` keys.
+   * @par
+   * @code
+   * #include <cub/cub.cuh>   
+   * // or equivalently <cub/device/device_radix_sort.cuh>
+   *
+   * // Declare, allocate, and initialize device-accessible pointers 
+   * // for sorting data
+   * int  num_items;          // e.g., 7
+   * int  *d_keys_in;         // e.g., [8, 6, 7, 5, 3, 0, 9]
+   * int  *d_keys_out;        // e.g., [        ...        ]
+   * ...
+   *
+   * // Determine temporary device storage requirements
+   * void     *d_temp_storage = NULL;
+   * size_t   temp_storage_bytes = 0;
+   * cub::DeviceRadixSort::SortKeys(
+   *   d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items);
+   *
+   * // Allocate temporary storage
+   * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+   *
+   * // Run sorting operation
+   * cub::DeviceRadixSort::SortKeys(
+   *   d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items);
+   *
+   * // d_keys_out            <-- [0, 3, 5, 6, 7, 8, 9]
+   * @endcode
+   *
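+   * @tparam KeyT
+   *   **[inferred]** Type of the keys being sorted, i.e. the element type of
+   *   `d_keys_in` and `d_keys_out`
+   *
+   * @tparam NumItemsT
+   *   **[inferred]** Type of `num_items`. The offset type used for the
+   *   dispatch is derived from it (see `detail::ChooseOffsetT` in the
+   *   function body)
+   *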
+   * @param[in] d_temp_storage 
+   *   Device-accessible allocation of temporary storage. When `nullptr`, the 
+   *   required allocation size is written to `temp_storage_bytes` and no work 
+   *   is done.
+   *
+   * @param[in,out] temp_storage_bytes 
+   *   Reference to size in bytes of `d_temp_storage` allocation
+   *
+   * @param[in] d_keys_in 
+   *   Pointer to the input sequence of key data to sort
+   *
+   * @param[out] d_keys_out 
+   *   Pointer to the sorted output sequence of key data
+   *
+   * @param[in] num_items 
+   *   Number of items to sort
+   *
+   * @param[in] begin_bit 
+   *   **[optional]** The least-significant bit index (inclusive) needed for 
+   *   key comparison
+   *
+   * @param[in] end_bit 
+   *   **[optional]** The most-significant bit index (exclusive) needed for key 
+   *   comparison (e.g., `sizeof(unsigned int) * 8`)
+   *
+   * @param[in] stream 
+   *   **[optional]** CUDA stream to launch kernels within. 
+   *   Default is stream<sub>0</sub>.
+   *
+   * @param[in] debug_synchronous 
+   *   **[optional]** Whether or not to synchronize the stream after every 
+   *   kernel launch to check for errors. Also causes launch configurations to 
+   *   be printed to the console. Default is `false`.
+   */
+  template <typename KeyT, typename NumItemsT>
+  CUB_RUNTIME_FUNCTION static cudaError_t
+  SortKeys(void *d_temp_storage,
+           size_t &temp_storage_bytes,
+           const KeyT *d_keys_in,
+           KeyT *d_keys_out,
+           NumItemsT num_items,
+           int begin_bit          = 0,
+           int end_bit            = sizeof(KeyT) * 8,
+           cudaStream_t stream    = 0,
+           bool debug_synchronous = false)
+  {
+    // Unsigned integer type for global offsets, chosen from `NumItemsT`.
+    using OffsetT   = typename detail::ChooseOffsetT<NumItemsT>::Type;
+    using DispatchT = DispatchRadixSort<false, KeyT, NullType, OffsetT>;
+
+    // The input keys are const for the caller. Wrapping them in a
+    // DoubleBuffer drops the qualifier, but the array is not written to:
+    // with `is_overwrite_okay` unset, `DispatchRadixSort::Dispatch` will
+    // allocate temporary storage and create a new double-buffer internally.
+    constexpr bool is_overwrite_okay = false;
+
+    KeyT *const keys_in = const_cast<KeyT *>(d_keys_in);
+    DoubleBuffer<KeyT> d_keys(keys_in, d_keys_out);
+
+    // Keys-only sort: values are a default-constructed NullType buffer.
+    DoubleBuffer<NullType> d_values;
+
+    // `num_items` is converted to the dispatch offset type explicitly.
+    return DispatchT::Dispatch(d_temp_storage, temp_storage_bytes,
+                               d_keys, d_values,
+                               static_cast<OffsetT>(num_items),
+                               begin_bit, end_bit,
+                               is_overwrite_okay,
+                               stream, debug_synchronous);
+  }
+
+  /**
+   * @brief Sorts keys into ascending order. (`~N` auxiliary storage required).
+   *
+   * @par
+   * - The sorting operation is given a pair of key buffers managed by a
+   *   DoubleBuffer structure that indicates which of the two buffers is
+   *   "current" (and thus contains the input data to be sorted).
+   * - The contents of both buffers may be altered by the sorting operation.
+   * - In-place operations are not supported. There must be no overlap between
+   *   any of the provided ranges:
+   *   - `[d_keys.Current(),     d_keys.Current()     + num_items)`
+   *   - `[d_keys.Alternate(),   d_keys.Alternate()   + num_items)`
+   * - Upon completion, the sorting operation will update the "current" 
+   *   indicator within the DoubleBuffer wrapper to reference which of the two 
+   *   buffers now contains the sorted output sequence (a function of the 
+   *   number of key bits specified and the targeted device architecture).
+   * - An optional bit subrange `[begin_bit, end_bit)` of differentiating key 
+   *   bits can be specified. This can reduce overall sorting overhead and 
+   *   yield a corresponding performance improvement.
+   * - @devicestorageP
+   * - @devicestorage
+   *
+   * @par Performance
+   * The following charts illustrate saturated sorting performance across 
+   * different CUDA architectures for uniform-random `uint32` and `uint64` 
+   * keys, respectively.
+   *
+   * @image html lsb_radix_sort_int32_keys.png
+   * @image html lsb_radix_sort_int64_keys.png
+   *
+   * @par Snippet
+   * The code snippet below illustrates the sorting of a device vector of 
+   * `int` keys.
+   * @par
+   * @code
+   * #include <cub/cub.cuh>   
+   * // or equivalently <cub/device/device_radix_sort.cuh>
+   *
+   * // Declare, allocate, and initialize device-accessible pointers 
+   * // for sorting data
+   * int  num_items;          // e.g., 7
+   * int  *d_key_buf;         // e.g., [8, 6, 7, 5, 3, 0, 9]
+   * int  *d_key_alt_buf;     // e.g., [        ...        ]
+   * ...
+   *
+   * // Create a DoubleBuffer to wrap the pair of device pointers
+   * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
+   *
+   * // Determine temporary device storage requirements
+   * void     *d_temp_storage = NULL;
+   * size_t   temp_storage_bytes = 0;
+   * cub::DeviceRadixSort::SortKeys(
+   *   d_temp_storage, temp_storage_bytes, d_keys, num_items);
+   *
+   * // Allocate temporary storage
+   * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+   *
+   * // Run sorting operation
+   * cub::DeviceRadixSort::SortKeys(
+   *   d_temp_storage, temp_storage_bytes, d_keys, num_items);
+   *
+   * // d_keys.Current()      <-- [0, 3, 5, 6, 7, 8, 9]
+   * @endcode
+   *
+   * @tparam KeyT      
+   *   **[inferred]** KeyT type
+   *
+   * @tparam NumItemsT 
+   *   **[inferred]** Type of num_items
+   *
+   * @param[in] d_temp_storage 
+   *   Device-accessible allocation of temporary storage. When `nullptr`, the 
+   *   required allocation size is written to `temp_storage_bytes` and no work 
+   *   is done.
+   *
+   * @param[in,out] temp_storage_bytes 
+   *   Reference to size in bytes of `d_temp_storage` allocation
+   *
+   * @param[in,out] d_keys 
+   *   Reference to the double-buffer of keys whose "current" device-accessible 
+   *   buffer contains the unsorted input keys and, upon return, is updated to 
+   *   point to the sorted output keys
+   *
+   * @param[in] num_items 
+   *   Number of items to sort
+   *
+   * @param[in] begin_bit 
+   *   **[optional]** The least-significant bit index (inclusive) needed for 
+   *   key comparison
+   *
+   * @param[in] end_bit 
+   *   **[optional]** The most-significant bit index (exclusive) needed for key 
+   *   comparison (e.g., `sizeof(unsigned int) * 8`)
+   *
+   * @param[in] stream 
+   *   **[optional]** CUDA stream to launch kernels within. 
+   *   Default is stream<sub>0</sub>.
+   *
+   * @param[in] debug_synchronous 
+   *   **[optional]** Whether or not to synchronize the stream after every 
+   *   kernel launch to check for errors. Also causes launch configurations to 
+   *   be printed to the console. Default is `false`.
+   */
+  template <typename KeyT, typename NumItemsT>
+  CUB_RUNTIME_FUNCTION static cudaError_t
+  SortKeys(void *d_temp_storage,
+           size_t &temp_storage_bytes,
+           DoubleBuffer<KeyT> &d_keys,
+           NumItemsT num_items,
+           int begin_bit          = 0,
+           int end_bit            = sizeof(KeyT) * 8,
+           cudaStream_t stream    = 0,
+           bool debug_synchronous = false)
+  {
+    // Unsigned integer type for global offsets, chosen from `NumItemsT`.
+    using OffsetT   = typename detail::ChooseOffsetT<NumItemsT>::Type;
+    using DispatchT = DispatchRadixSort<false, KeyT, NullType, OffsetT>;
+
+    // The caller provides both key buffers, so the dispatch is told that it
+    // may overwrite them.
+    constexpr bool is_overwrite_okay = true;
+
+    // Keys-only sort: values are a default-constructed NullType buffer.
+    DoubleBuffer<NullType> d_values;
+
+    // The item count is converted explicitly, matching the pointer-based
+    // `SortKeys` overload, instead of relying on an implicit conversion.
+    return DispatchT::Dispatch(d_temp_storage, temp_storage_bytes,
+                               d_keys, d_values,
+                               static_cast<OffsetT>(num_items),
+                               begin_bit, end_bit,
+                               is_overwrite_okay,
+                               stream, debug_synchronous);
+  }
+
+  /**
+   * @brief Sorts keys into descending order. 
+   *        (`~2N` auxiliary storage required).
+   *
+   * @par
+   * - The contents of the input data are not altered by the sorting operation.
+   * - Pointers to contiguous memory must be used; iterators are not currently
+   *   supported.
+   * - In-place operations are not supported. There must be no overlap between
+   *   any of the provided ranges:
+   *   - `[d_keys_in,    d_keys_in    + num_items)`
+   *   - `[d_keys_out,   d_keys_out   + num_items)`
+   * - An optional bit subrange `[begin_bit, end_bit)` of differentiating key 
+   *   bits can be specified. This can reduce overall sorting overhead and 
+   *   yield a corresponding performance improvement.
+   * - @devicestorageNP For sorting using only `O(P)` temporary storage, see 
+   *   the sorting interface using DoubleBuffer wrappers below.
+   * - @devicestorage
+   *
+   * @par Performance
+   * Performance is similar to DeviceRadixSort::SortKeys.
+   *
+   * @par Snippet
+   * The code snippet below illustrates the sorting of a device vector of 
+   * `int` keys.
+   * @par
+   * @code
+   * #include <cub/cub.cuh>   
+   * // or equivalently <cub/device/device_radix_sort.cuh>
+   *
+   * // Declare, allocate, and initialize device-accessible pointers 
+   * // for sorting data
+   * int  num_items;          // e.g., 7
+   * int  *d_keys_in;         // e.g., [8, 6, 7, 5, 3, 0, 9]
+   * int  *d_keys_out;        // e.g., [        ...        ]
+   * ...
+   *
+   * // This overload takes the input and output pointers directly, so no
+   * // cub::DoubleBuffer wrapper is needed
+   *
+   * // Determine temporary device storage requirements
+   * void     *d_temp_storage = NULL;
+   * size_t   temp_storage_bytes = 0;
+   * cub::DeviceRadixSort::SortKeysDescending( 
+   *   d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items);
+   *
+   * // Allocate temporary storage
+   * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+   *
+   * // Run sorting operation
+   * cub::DeviceRadixSort::SortKeysDescending(
+   *   d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items);
+   *
+   * // d_keys_out            <-- [9, 8, 7, 6, 5, 3, 0]
+   *
+   * @endcode
+   *
+   * @tparam KeyT      
+   *   **[inferred]** KeyT type
+   *
+   * @tparam NumItemsT 
+   *   **[inferred]** Type of num_items
+   *
+   * @param[in] d_temp_storage 
+   *   Device-accessible allocation of temporary storage. When `nullptr`, the 
+   *   required allocation size is written to `temp_storage_bytes` and no work 
+   *   is done.
+   *
+   * @param[in,out] temp_storage_bytes 
+   *   Reference to size in bytes of `d_temp_storage` allocation
+   *
+   * @param[in] d_keys_in 
+   *   Pointer to the input sequence of key data to sort
+   *
+   * @param[out] d_keys_out 
+   *   Pointer to the sorted output sequence of key data
+   *
+   * @param[in] num_items 
+   *   Number of items to sort
+   *
+   * @param[in] begin_bit 
+   *   **[optional]** The least-significant bit index (inclusive) needed for 
+   *   key comparison
+   *
+   * @param[in] end_bit 
+   *   **[optional]** The most-significant bit index (exclusive) needed for key 
+   *   comparison (e.g., `sizeof(unsigned int) * 8`)
+   *
+   * @param[in] stream 
+   *   **[optional]** CUDA stream to launch kernels within.  
+   *   Default is stream<sub>0</sub>.
+   *
+   * @param[in] debug_synchronous 
+   *   **[optional]** Whether or not to synchronize the stream after every 
+   *   kernel launch to check for errors. Also causes launch configurations to 
+   *   be printed to the console. Default is `false`.
+   */
+  template <typename KeyT, typename NumItemsT>
+  CUB_RUNTIME_FUNCTION static cudaError_t
+  SortKeysDescending(void *d_temp_storage,
+                     size_t &temp_storage_bytes,
+                     const KeyT *d_keys_in,
+                     KeyT *d_keys_out,
+                     NumItemsT num_items,
+                     int begin_bit          = 0,
+                     int end_bit            = sizeof(KeyT) * 8,
+                     cudaStream_t stream    = 0,
+                     bool debug_synchronous = false)
+  {
+    // Unsigned integer type for global offsets, chosen from `NumItemsT`.
+    using OffsetT   = typename detail::ChooseOffsetT<NumItemsT>::Type;
+    using DispatchT = DispatchRadixSort<true, KeyT, NullType, OffsetT>;
+
+    // The input keys are const for the caller. Wrapping them in a
+    // DoubleBuffer drops the qualifier, but the array is not written to:
+    // with `is_overwrite_okay` unset, `DispatchRadixSort::Dispatch` will
+    // allocate temporary storage and create a new double-buffer internally.
+    constexpr bool is_overwrite_okay = false;
+    DoubleBuffer<KeyT> d_keys(const_cast<KeyT *>(d_keys_in), d_keys_out);
+
+    // Keys-only sort: values are a default-constructed NullType buffer.
+    DoubleBuffer<NullType> d_values;
+
+    // The item count is converted explicitly, matching the pointer-based
+    // `SortKeys` overload, instead of relying on an implicit conversion.
+    return DispatchT::Dispatch(d_temp_storage, temp_storage_bytes,
+                               d_keys, d_values,
+                               static_cast<OffsetT>(num_items),
+                               begin_bit, end_bit,
+                               is_overwrite_okay,
+                               stream, debug_synchronous);
+  }
+
+  /**
+   * @brief Sorts keys into descending order. 
+   *        (`~N` auxiliary storage required).
+   *
+   * @par
+   * - The sorting operation is given a pair of key buffers managed by a
+   *   DoubleBuffer structure that indicates which of the two buffers is
+   *   "current" (and thus contains the input data to be sorted).
+   * - The contents of both buffers may be altered by the sorting operation.
+   * - In-place operations are not supported. There must be no overlap between
+   *   any of the provided ranges:
+   *   - `[d_keys.Current(),     d_keys.Current()     + num_items)`
+   *   - `[d_keys.Alternate(),   d_keys.Alternate()   + num_items)`
+   * - Upon completion, the sorting operation will update the "current" 
+   *   indicator within the DoubleBuffer wrapper to reference which of the two 
+   *   buffers now contains the sorted output sequence (a function of the 
+   *   number of key bits specified and the targeted device architecture).
+   * - An optional bit subrange `[begin_bit, end_bit)` of differentiating key 
+   *   bits can be specified. This can reduce overall sorting overhead and 
+   *   yield a corresponding performance improvement.
+   * - @devicestorageP
+   * - @devicestorage
+   *
+   * @par Performance
+   * Performance is similar to DeviceRadixSort::SortKeys.
+   *
+   * @par Snippet
+   * The code snippet below illustrates the sorting of a device vector of @p int keys.
+   * @par
+   * @code
+   * #include <cub/cub.cuh>   
+   * // or equivalently <cub/device/device_radix_sort.cuh>
+   *
+   * // Declare, allocate, and initialize device-accessible pointers 
+   * // for sorting data
+   * int  num_items;          // e.g., 7
+   * int  *d_key_buf;         // e.g., [8, 6, 7, 5, 3, 0, 9]
+   * int  *d_key_alt_buf;     // e.g., [        ...        ]
+   * ...
+   *
+   * // Create a DoubleBuffer to wrap the pair of device pointers
+   * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
+   *
+   * // Determine temporary device storage requirements
+   * void     *d_temp_storage = NULL;
+   * size_t   temp_storage_bytes = 0;
+   * cub::DeviceRadixSort::SortKeysDescending(
+   *   d_temp_storage, temp_storage_bytes, d_keys, num_items);
+   *
+   * // Allocate temporary storage
+   * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+   *
+   * // Run sorting operation
+   * cub::DeviceRadixSort::SortKeysDescending(
+   *   d_temp_storage, temp_storage_bytes, d_keys, num_items);
+   *
+   * // d_keys.Current()      <-- [9, 8, 7, 6, 5, 3, 0]
+   * @endcode
+   *
+   * @tparam KeyT      
+   *   **[inferred]** Key type
+   *
+   * @tparam NumItemsT 
+   *   **[inferred]** Type of num_items
+   *
+   * @param[in] d_temp_storage 
+   *   Device-accessible allocation of temporary storage. When `nullptr`, the 
+   *   required allocation size is written to `temp_storage_bytes` and no work 
+   *   is done.
+   *
+   * @param[in,out] temp_storage_bytes 
+   *   Reference to size in bytes of `d_temp_storage` allocation
+   *
+   * @param[in,out] d_keys 
+   *   Reference to the double-buffer of keys whose "current" device-accessible 
+   *   buffer contains the unsorted input keys and, upon return, is updated to 
+   *   point to the sorted output keys
+   *
+   * @param[in] num_items 
+   *   Number of items to sort
+   *
+   * @param[in] begin_bit 
+   *   **[optional]** The least-significant bit index (inclusive) needed for 
+   *   key comparison
+   *
+   * @param[in] end_bit 
+   *   **[optional]** The most-significant bit index (exclusive) needed for key 
+   *   comparison (e.g., `sizeof(unsigned int) * 8`)
+   *
+   * @param[in] stream 
+   *   **[optional]** CUDA stream to launch kernels within. 
+   *   Default is stream<sub>0</sub>.
+   *
+   * @param[in] debug_synchronous 
+   *   **[optional]** Whether or not to synchronize the stream after every 
+   *   kernel launch to check for errors. Also causes launch configurations to 
+   *   be printed to the console. Default is `false`.
+   */
+  template <typename KeyT, typename NumItemsT>
+  CUB_RUNTIME_FUNCTION static cudaError_t
+  SortKeysDescending(void *d_temp_storage,
+                     size_t &temp_storage_bytes,
+                     DoubleBuffer<KeyT> &d_keys,
+                     NumItemsT num_items,
+                     int begin_bit          = 0,
+                     int end_bit            = sizeof(KeyT) * 8,
+                     cudaStream_t stream    = 0,
+                     bool debug_synchronous = false)
+  {
+    // Global offset type and the descending, keys-only dispatcher built on it.
+    using offset_t   = typename detail::ChooseOffsetT<NumItemsT>::Type;
+    using dispatch_t = DispatchRadixSort<true, KeyT, NullType, offset_t>;
+
+    // Keys-only sort: a default-constructed value buffer is handed over, and
+    // the dispatcher is allowed to overwrite the caller's double-buffer.
+    DoubleBuffer<NullType> no_values;
+    constexpr bool may_overwrite_buffers = true;
+
+    return dispatch_t::Dispatch(d_temp_storage,
+                                temp_storage_bytes,
+                                d_keys,
+                                no_values,
+                                num_items,
+                                begin_bit,
+                                end_bit,
+                                may_overwrite_buffers,
+                                stream,
+                                debug_synchronous);
+  }
+
+  //@}  end member group
 
 };
 
 /**
- * \example example_device_radix_sort.cu
+ * @example example_device_radix_sort.cu
  */
 
 CUB_NAMESPACE_END