Skip to content
This repository has been archived by the owner on Mar 21, 2024. It is now read-only.

Commit

Permalink
Merge pull request #466 from senior-zero/enh-main/github/adj_diff_size_t
Browse files Browse the repository at this point in the history
Alternative approach to 64-bit indexing in adjacent difference
  • Loading branch information
gevtushenko authored May 5, 2022
2 parents 0b4e9eb + 91d7d6b commit c7aca46
Show file tree
Hide file tree
Showing 4 changed files with 162 additions and 112 deletions.
62 changes: 62 additions & 0 deletions cub/detail/choose_offset.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
/******************************************************************************
* Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/

#pragma once

#include <cub/config.cuh>

#include <cstdint>
#include <type_traits>

CUB_NAMESPACE_BEGIN

namespace detail
{

/**
 * Metafunction mapping the user-supplied `num_items` type (NumItemsT) to the
 * unsigned offset type used internally for global indexing.
 *
 * Rationale: inputs described by a type of at most 32 bits can always be
 * indexed with `std::uint32_t`; anything wider requires a 64-bit offset.
 * `unsigned long long` (rather than `std::uint64_t`) is used for the wide
 * case, matching the type accepted by CUDA device atomics.
 */
template <typename NumItemsT>
struct ChooseOffsetT
{
  // Reject non-integral types and bool (bool would be a meaningless count
  // and its 1-byte size could silently select a 32-bit offset).
  static_assert(
    std::is_integral<NumItemsT>::value &&
      !std::is_same<typename std::remove_cv<NumItemsT>::type, bool>::value,
    "NumItemsT must be an integral type, but not bool");

  // Wide (> 4 byte) counts get a 64-bit unsigned offset; everything else
  // fits in 32 bits.
  using Type = typename std::conditional<(sizeof(NumItemsT) > 4),
                                         unsigned long long,
                                         std::uint32_t>::type;
};

} // namespace detail

CUB_NAMESPACE_END

94 changes: 41 additions & 53 deletions cub/device/device_adjacent_difference.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,10 @@

#pragma once

#include "../config.cuh"
#include "../util_namespace.cuh"
#include "dispatch/dispatch_adjacent_difference.cuh"
#include <cub/config.cuh>
#include <cub/detail/choose_offset.cuh>
#include <cub/device/dispatch/dispatch_adjacent_difference.cuh>
#include <cub/util_namespace.cuh>

#include <thrust/detail/integer_traits.h>
#include <thrust/detail/cstdint.h>
Expand Down Expand Up @@ -98,9 +99,9 @@ CUB_NAMESPACE_BEGIN
struct DeviceAdjacentDifference
{
private:

template <bool in_place,
bool read_left,
typename NumItemsT,
typename InputIteratorT,
typename OutputIteratorT,
typename DifferenceOpT>
Expand All @@ -109,53 +110,28 @@ private:
std::size_t &temp_storage_bytes,
InputIteratorT d_input,
OutputIteratorT d_output,
std::size_t num_items,
NumItemsT num_items,
DifferenceOpT difference_op,
cudaStream_t stream,
bool debug_synchronous)
{
const auto uint64_threshold = static_cast<std::size_t>(
THRUST_NS_QUALIFIER::detail::integer_traits<
THRUST_NS_QUALIFIER::detail::int32_t>::const_max);

if (num_items <= uint64_threshold)
{
using OffsetT = std::uint32_t;
using DispatchT = DispatchAdjacentDifference<InputIteratorT,
OutputIteratorT,
DifferenceOpT,
OffsetT,
in_place,
read_left>;
using OffsetT = typename detail::ChooseOffsetT<NumItemsT>::Type;

return DispatchT::Dispatch(d_temp_storage,
temp_storage_bytes,
d_input,
d_output,
static_cast<OffsetT>(num_items),
difference_op,
stream,
debug_synchronous);
}
else
{
using OffsetT = std::uint64_t;
using DispatchT = DispatchAdjacentDifference<InputIteratorT,
OutputIteratorT,
DifferenceOpT,
OffsetT,
in_place,
read_left>;
using DispatchT = DispatchAdjacentDifference<InputIteratorT,
OutputIteratorT,
DifferenceOpT,
OffsetT,
in_place,
read_left>;

return DispatchT::Dispatch(d_temp_storage,
temp_storage_bytes,
d_input,
d_output,
static_cast<OffsetT>(num_items),
difference_op,
stream,
debug_synchronous);
}
return DispatchT::Dispatch(d_temp_storage,
temp_storage_bytes,
d_input,
d_output,
static_cast<OffsetT>(num_items),
difference_op,
stream,
debug_synchronous);
}

public:
Expand Down Expand Up @@ -234,6 +210,8 @@ public:
* Its `result_type` is convertible to a type in `OutputIteratorT`'s set of
* `value_types`.
*
* @tparam NumItemsT **[inferred]** Type of num_items
*
* @param[in] d_temp_storage
* Device-accessible allocation of temporary storage. When `nullptr`, the
* required allocation size is written to `temp_storage_bytes` and no work
Expand Down Expand Up @@ -265,13 +243,14 @@ public:
*/
template <typename InputIteratorT,
typename OutputIteratorT,
typename DifferenceOpT = cub::Difference>
typename DifferenceOpT = cub::Difference,
typename NumItemsT = std::uint32_t>
static CUB_RUNTIME_FUNCTION cudaError_t
SubtractLeftCopy(void *d_temp_storage,
std::size_t &temp_storage_bytes,
InputIteratorT d_input,
OutputIteratorT d_output,
std::size_t num_items,
NumItemsT num_items,
DifferenceOpT difference_op = {},
cudaStream_t stream = 0,
bool debug_synchronous = false)
Expand Down Expand Up @@ -353,6 +332,8 @@ public:
* Its `result_type` is convertible to a type in `RandomAccessIteratorT`'s
* set of `value_types`.
*
* @tparam NumItemsT **[inferred]** Type of num_items
*
* @param[in] d_temp_storage
* Device-accessible allocation of temporary storage. When `nullptr`, the
* required allocation size is written to `temp_storage_bytes` and no work
Expand Down Expand Up @@ -380,12 +361,13 @@ public:
* be printed to the console. Default is `false`.
*/
template <typename RandomAccessIteratorT,
typename DifferenceOpT = cub::Difference>
typename DifferenceOpT = cub::Difference,
typename NumItemsT = std::uint32_t>
static CUB_RUNTIME_FUNCTION cudaError_t
SubtractLeft(void *d_temp_storage,
std::size_t &temp_storage_bytes,
RandomAccessIteratorT d_input,
std::size_t num_items,
NumItemsT num_items,
DifferenceOpT difference_op = {},
cudaStream_t stream = 0,
bool debug_synchronous = false)
Expand Down Expand Up @@ -477,6 +459,8 @@ public:
* Its `result_type` is convertible to a type in `RandomAccessIteratorT`'s
* set of `value_types`.
*
* @tparam NumItemsT **[inferred]** Type of num_items
*
* @param[in] d_temp_storage
* Device-accessible allocation of temporary storage. When `nullptr`, the
* required allocation size is written to `temp_storage_bytes` and no work
Expand Down Expand Up @@ -508,13 +492,14 @@ public:
*/
template <typename InputIteratorT,
typename OutputIteratorT,
typename DifferenceOpT = cub::Difference>
typename DifferenceOpT = cub::Difference,
typename NumItemsT = std::uint32_t>
static CUB_RUNTIME_FUNCTION cudaError_t
SubtractRightCopy(void *d_temp_storage,
std::size_t &temp_storage_bytes,
InputIteratorT d_input,
OutputIteratorT d_output,
std::size_t num_items,
NumItemsT num_items,
DifferenceOpT difference_op = {},
cudaStream_t stream = 0,
bool debug_synchronous = false)
Expand Down Expand Up @@ -586,6 +571,8 @@ public:
* Its `result_type` is convertible to a type in `RandomAccessIteratorT`'s
* set of `value_types`.
*
* @tparam NumItemsT **[inferred]** Type of num_items
*
* @param[in] d_temp_storage
* Device-accessible allocation of temporary storage. When `nullptr`, the
* required allocation size is written to `temp_storage_bytes` and no work
Expand Down Expand Up @@ -613,12 +600,13 @@ public:
* printed to the console. Default is `false`.
*/
template <typename RandomAccessIteratorT,
typename DifferenceOpT = cub::Difference>
typename DifferenceOpT = cub::Difference,
typename NumItemsT = std::uint32_t>
static CUB_RUNTIME_FUNCTION cudaError_t
SubtractRight(void *d_temp_storage,
std::size_t &temp_storage_bytes,
RandomAccessIteratorT d_input,
std::size_t num_items,
NumItemsT num_items,
DifferenceOpT difference_op = {},
cudaStream_t stream = 0,
bool debug_synchronous = false)
Expand Down
26 changes: 3 additions & 23 deletions cub/device/device_radix_sort.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -34,32 +34,12 @@

#pragma once

#include <stdio.h>
#include <iterator>
#include <type_traits>

#include "dispatch/dispatch_radix_sort.cuh"
#include "../config.cuh"
#include <cub/config.cuh>
#include <cub/detail/choose_offset.cuh>
#include <cub/device/dispatch/dispatch_radix_sort.cuh>

CUB_NAMESPACE_BEGIN

namespace detail {
/** ChooseOffsetT checks NumItemsT, the type of the num_items parameter, and
* selects the offset type based on it. */
template <typename NumItemsT>
struct ChooseOffsetT
{
// NumItemsT must be an integral type (but not bool).
static_assert(std::is_integral<NumItemsT>::value &&
!std::is_same<typename std::remove_cv<NumItemsT>::type, bool>::value,
"NumItemsT must be an integral type, but not bool");

// Unsigned integer type for global offsets.
using Type = typename std::conditional<sizeof(NumItemsT) <= 4, uint32_t, unsigned long long>::type;
};

} // namespace detail

/**
* \brief DeviceRadixSort provides device-wide, parallel operations for computing a radix sort across a sequence of data items residing within device-accessible memory. ![](sorting_logo.png)
* \ingroup SingleModule
Expand Down
Loading

0 comments on commit c7aca46

Please sign in to comment.