From fe9a3efbd1063be679833986e516c53cb0661a1c Mon Sep 17 00:00:00 2001 From: Georgy Evtushenko Date: Wed, 27 Apr 2022 20:17:53 +0400 Subject: [PATCH 1/2] Alternative approach to 64-bit indexing in adj diff --- cub/device/device_adjacent_difference.cuh | 92 +++++++++++------------ test/test_device_adjacent_difference.cu | 92 ++++++++++++++--------- 2 files changed, 99 insertions(+), 85 deletions(-) diff --git a/cub/device/device_adjacent_difference.cuh b/cub/device/device_adjacent_difference.cuh index d6efb73f09..56df25329b 100644 --- a/cub/device/device_adjacent_difference.cuh +++ b/cub/device/device_adjacent_difference.cuh @@ -98,9 +98,9 @@ CUB_NAMESPACE_BEGIN struct DeviceAdjacentDifference { private: - template @@ -109,53 +109,35 @@ private: std::size_t &temp_storage_bytes, InputIteratorT d_input, OutputIteratorT d_output, - std::size_t num_items, + NumItemsT num_items, DifferenceOpT difference_op, cudaStream_t stream, bool debug_synchronous) { - const auto uint64_threshold = static_cast( - THRUST_NS_QUALIFIER::detail::integer_traits< - THRUST_NS_QUALIFIER::detail::int32_t>::const_max); + static_assert( + std::is_integral::value && + !std::is_same::type, bool>::value, + "NumItemsT must be an integral type, but not bool"); - if (num_items <= uint64_threshold) - { - using OffsetT = std::uint32_t; - using DispatchT = DispatchAdjacentDifference; + using OffsetT = std::conditional_t; - return DispatchT::Dispatch(d_temp_storage, - temp_storage_bytes, - d_input, - d_output, - static_cast(num_items), - difference_op, - stream, - debug_synchronous); - } - else - { - using OffsetT = std::uint64_t; - using DispatchT = DispatchAdjacentDifference; + using DispatchT = DispatchAdjacentDifference; - return DispatchT::Dispatch(d_temp_storage, - temp_storage_bytes, - d_input, - d_output, - static_cast(num_items), - difference_op, - stream, - debug_synchronous); - } + return DispatchT::Dispatch(d_temp_storage, + temp_storage_bytes, + d_input, + d_output, + static_cast(num_items), + difference_op, + stream, + debug_synchronous); } public: @@ -234,6 +216,8 @@ public: * Its `result_type` is convertible to a type in `OutputIteratorT`'s set of * `value_types`. * + * @tparam NumItemsT **[inferred]** Type of num_items + * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no work @@ -265,13 +249,14 @@ public: */ template + typename DifferenceOpT = cub::Difference, + typename NumItemsT = std::uint32_t> static CUB_RUNTIME_FUNCTION cudaError_t SubtractLeftCopy(void *d_temp_storage, std::size_t &temp_storage_bytes, InputIteratorT d_input, OutputIteratorT d_output, - std::size_t num_items, + NumItemsT num_items, DifferenceOpT difference_op = {}, cudaStream_t stream = 0, bool debug_synchronous = false) @@ -353,6 +338,8 @@ public: * Its `result_type` is convertible to a type in `RandomAccessIteratorT`'s * set of `value_types`. * + * @tparam NumItemsT **[inferred]** Type of num_items + * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no work @@ -380,12 +367,13 @@ public: * be printed to the console. Default is `false`. */ template + typename DifferenceOpT = cub::Difference, + typename NumItemsT = std::uint32_t> static CUB_RUNTIME_FUNCTION cudaError_t SubtractLeft(void *d_temp_storage, std::size_t &temp_storage_bytes, RandomAccessIteratorT d_input, - std::size_t num_items, + NumItemsT num_items, DifferenceOpT difference_op = {}, cudaStream_t stream = 0, bool debug_synchronous = false) @@ -477,6 +465,8 @@ public: * Its `result_type` is convertible to a type in `RandomAccessIteratorT`'s * set of `value_types`. * + * @tparam NumItemsT **[inferred]** Type of num_items + * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no work @@ -508,13 +498,14 @@ public: */ template + typename DifferenceOpT = cub::Difference, + typename NumItemsT = std::uint32_t> static CUB_RUNTIME_FUNCTION cudaError_t SubtractRightCopy(void *d_temp_storage, std::size_t &temp_storage_bytes, InputIteratorT d_input, OutputIteratorT d_output, - std::size_t num_items, + NumItemsT num_items, DifferenceOpT difference_op = {}, cudaStream_t stream = 0, bool debug_synchronous = false) @@ -586,6 +577,8 @@ public: * Its `result_type` is convertible to a type in `RandomAccessIteratorT`'s * set of `value_types`. * + * @tparam NumItemsT **[inferred]** Type of num_items + * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no work @@ -613,12 +606,13 @@ public: * printed to the console. Default is `false`. */ template + typename DifferenceOpT = cub::Difference, + typename NumItemsT = std::uint32_t> static CUB_RUNTIME_FUNCTION cudaError_t SubtractRight(void *d_temp_storage, std::size_t &temp_storage_bytes, RandomAccessIteratorT d_input, - std::size_t num_items, + NumItemsT num_items, DifferenceOpT difference_op = {}, cudaStream_t stream = 0, bool debug_synchronous = false) diff --git a/test/test_device_adjacent_difference.cu b/test/test_device_adjacent_difference.cu index 8e949bb115..ee26bc9ed8 100644 --- a/test/test_device_adjacent_difference.cu +++ b/test/test_device_adjacent_difference.cu @@ -91,12 +91,13 @@ struct CustomDifference template + typename DifferenceOpT, + typename NumItemsT> void AdjacentDifference(void *temp_storage, std::size_t &temp_storage_bytes, IteratorT it, DifferenceOpT difference_op, - std::size_t num_items) + NumItemsT num_items) { const bool is_default_op_in_use = std::is_same::value; @@ -151,13 +152,14 @@ void AdjacentDifference(void *temp_storage, template + typename DifferenceOpT, + typename NumItemsT> void AdjacentDifferenceCopy(void *temp_storage, std::size_t &temp_storage_bytes, InputIteratorT input, OutputIteratorT output, DifferenceOpT difference_op, - std::size_t num_items) + NumItemsT num_items) { const bool is_default_op_in_use = std::is_same::value; @@ -214,10 +216,11 @@ void AdjacentDifferenceCopy(void *temp_storage, template + typename DifferenceOpT, + typename NumItemsT> void AdjacentDifference(IteratorT it, DifferenceOpT difference_op, - std::size_t num_items) + NumItemsT num_items) { std::size_t temp_storage_bytes {}; @@ -239,11 +242,12 @@ void AdjacentDifference(IteratorT it, template + typename DifferenceOpT, + typename NumItemsT> void AdjacentDifferenceCopy(InputIteratorT input, OutputIteratorT output, DifferenceOpT difference_op, - std::size_t num_items) + NumItemsT num_items) { std::size_t temp_storage_bytes{}; @@ -283,8 +287,9 @@ bool CheckResult(FirstIteratorT first_begin, template -void TestCopy(std::size_t elements, DifferenceOpT difference_op) + typename DifferenceOpT, + typename NumItemsT> +void TestCopy(NumItemsT elements, DifferenceOpT difference_op) { thrust::device_vector input(elements); thrust::tabulate(input.begin(), @@ -331,8 +336,9 @@ void TestCopy(std::size_t elements, DifferenceOpT difference_op) template -void TestIteratorCopy(std::size_t elements, DifferenceOpT difference_op) + typename DifferenceOpT, + typename NumItemsT> +void TestIteratorCopy(NumItemsT elements, DifferenceOpT difference_op) { thrust::device_vector input(elements); thrust::tabulate(input.begin(), @@ -375,8 +381,9 @@ void TestIteratorCopy(std::size_t elements, DifferenceOpT difference_op) template -void TestCopy(std::size_t elements) + typename OutputT, + typename NumItemsT> +void TestCopy(NumItemsT elements) { TestCopy(elements, cub::Difference{}); TestCopy(elements, CustomDifference{}); @@ -386,7 +393,8 @@ void TestCopy(std::size_t elements) } -void TestCopy(std::size_t elements) +template +void TestCopy(NumItemsT elements) { TestCopy(elements); TestCopy(elements); @@ -394,8 +402,9 @@ void TestCopy(std::size_t elements) template -void Test(std::size_t elements, DifferenceOpT difference_op) + typename DifferenceOpT, + typename NumItemsT> +void Test(NumItemsT elements, DifferenceOpT difference_op) { thrust::device_vector data(elements); thrust::tabulate(data.begin(), @@ -439,8 +448,9 @@ void Test(std::size_t elements, DifferenceOpT difference_op) template -void TestIterators(std::size_t elements, DifferenceOpT difference_op) + typename DifferenceOpT, + typename NumItemsT> +void TestIterators(NumItemsT elements, DifferenceOpT difference_op) { thrust::device_vector data(elements); thrust::tabulate(data.begin(), @@ -480,8 +490,9 @@ void TestIterators(std::size_t elements, DifferenceOpT difference_op) } -template -void Test(std::size_t elements) +template +void Test(NumItemsT elements) { Test(elements, cub::Difference{}); Test(elements, CustomDifference{}); @@ -491,17 +502,24 @@ void Test(std::size_t elements) } -void Test(std::size_t elements) +template +void Test(NumItemsT elements) { - Test(elements); - Test(elements); - Test(elements); + Test(elements); + Test(elements); + Test(elements); } -template -void TestFancyIterators(std::size_t elements) +template +void TestFancyIterators(NumItemsT elements) { + if (elements == 0) + { + return; + } + thrust::counting_iterator count_iter(ValueT{1}); thrust::device_vector output(elements, ValueT{42}); @@ -510,7 +528,7 @@ void TestFancyIterators(std::size_t elements) cub::Difference{}, elements); AssertEquals(elements, - static_cast( + static_cast( thrust::count(output.begin(), output.end(), ValueT(1)))); thrust::fill(output.begin(), output.end(), ValueT{}); @@ -519,7 +537,7 @@ void TestFancyIterators(std::size_t elements) cub::Difference{}, elements); AssertEquals(elements - 1, - static_cast( + static_cast( thrust::count(output.begin(), output.end() - 1, static_cast(-1)))); @@ -532,7 +550,7 @@ void TestFancyIterators(std::size_t elements) cub::Difference{}, elements); AssertEquals(elements, - static_cast( + static_cast( thrust::count(output.begin(), output.end(), ValueT{}))); thrust::fill(output.begin(), output.end(), ValueT{}); @@ -541,7 +559,7 @@ void TestFancyIterators(std::size_t elements) cub::Difference{}, elements); AssertEquals(elements, - static_cast( + static_cast( thrust::count(output.begin(), output.end(), ValueT{}))); AdjacentDifferenceCopy(const_iter, @@ -556,13 +574,15 @@ void TestFancyIterators(std::size_t elements) } -void TestFancyIterators(std::size_t elements) +template +void TestFancyIterators(NumItemsT elements) { - TestFancyIterators(elements); + TestFancyIterators(elements); } -void TestSize(std::size_t elements) +template +void TestSize(NumItemsT elements) { Test(elements); TestCopy(elements); @@ -634,10 +654,10 @@ int main(int argc, char** argv) // Initialize device CubDebugExit(args.DeviceInit()); - Test(0); + TestSize(0); for (std::size_t power_of_two = 2; power_of_two < 20; power_of_two += 2) { - Test(1ull << power_of_two); + TestSize(1ull << power_of_two); } TestAdjacentDifferenceWithBigIndexes(); From 91d7d6b8738c2c37158d4ed56c16c47fa5cf7181 Mon Sep 17 00:00:00 2001 From: Georgy Evtushenko Date: Thu, 5 May 2022 00:36:09 +0400 Subject: [PATCH 2/2] Extract ChooseOffset --- cub/detail/choose_offset.cuh | 62 +++++++++++++++++++++++ cub/device/device_adjacent_difference.cuh | 16 ++---- cub/device/device_radix_sort.cuh | 26 ++-------- 3 files changed, 70 insertions(+), 34 deletions(-) create mode 100644 cub/detail/choose_offset.cuh diff --git a/cub/detail/choose_offset.cuh b/cub/detail/choose_offset.cuh new file mode 100644 index 0000000000..ed703e26c9 --- /dev/null +++ b/cub/detail/choose_offset.cuh @@ -0,0 +1,62 @@ +/****************************************************************************** + * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +#pragma once + +#include + +#include +#include + +CUB_NAMESPACE_BEGIN + +namespace detail +{ + +/** + * ChooseOffsetT checks NumItemsT, the type of the num_items parameter, and + * selects the offset type based on it. + */ +template +struct ChooseOffsetT +{ + // NumItemsT must be an integral type (but not bool). + static_assert( + std::is_integral::value && + !std::is_same::type, bool>::value, + "NumItemsT must be an integral type, but not bool"); + + // Unsigned integer type for global offsets. + using Type = typename std::conditional::type; +}; + +} // namespace detail + +CUB_NAMESPACE_END + diff --git a/cub/device/device_adjacent_difference.cuh b/cub/device/device_adjacent_difference.cuh index 56df25329b..c030e6fe43 100644 --- a/cub/device/device_adjacent_difference.cuh +++ b/cub/device/device_adjacent_difference.cuh @@ -27,9 +27,10 @@ #pragma once -#include "../config.cuh" -#include "../util_namespace.cuh" -#include "dispatch/dispatch_adjacent_difference.cuh" +#include +#include +#include +#include #include #include @@ -114,14 +115,7 @@ private: cudaStream_t stream, bool debug_synchronous) { - static_assert( - std::is_integral::value && - !std::is_same::type, bool>::value, - "NumItemsT must be an integral type, but not bool"); - - using OffsetT = std::conditional_t; + using OffsetT = typename detail::ChooseOffsetT::Type; using DispatchT = DispatchAdjacentDifference -#include -#include - -#include "dispatch/dispatch_radix_sort.cuh" -#include "../config.cuh" +#include +#include +#include CUB_NAMESPACE_BEGIN -namespace detail { -/** ChooseOffsetT checks NumItemsT, the type of the num_items parameter, and - * selects the offset type based on it. */ -template -struct ChooseOffsetT -{ - // NumItemsT must be an integral type (but not bool). - static_assert(std::is_integral::value && - !std::is_same::type, bool>::value, - "NumItemsT must be an integral type, but not bool"); - - // Unsigned integer type for global offsets. - using Type = typename std::conditional::type; -}; - -} // namespace detail - /** * \brief DeviceRadixSort provides device-wide, parallel operations for computing a radix sort across a sequence of data items residing within device-accessible memory. ![](sorting_logo.png) * \ingroup SingleModule