Skip to content
This repository has been archived by the owner on Mar 21, 2024. It is now read-only.

Alternative approach to 64-bit indexing in adjacent difference #466

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 62 additions & 0 deletions cub/detail/choose_offset.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
/******************************************************************************
* Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/

#pragma once

#include <cub/config.cuh>

#include <cstdint>
#include <type_traits>

CUB_NAMESPACE_BEGIN

namespace detail
{

/**
* ChooseOffsetT checks NumItemsT, the type of the num_items parameter, and
* selects the offset type based on it.
*/
template <typename NumItemsT>
struct ChooseOffsetT
{
// NumItemsT must be an integral type (but not bool).
static_assert(
std::is_integral<NumItemsT>::value &&
!std::is_same<typename std::remove_cv<NumItemsT>::type, bool>::value,
"NumItemsT must be an integral type, but not bool");

// Unsigned integer type for global offsets.
using Type = typename std::conditional<sizeof(NumItemsT) <= 4,
std::uint32_t,
unsigned long long>::type;
};

} // namespace detail

CUB_NAMESPACE_END

94 changes: 41 additions & 53 deletions cub/device/device_adjacent_difference.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,10 @@

#pragma once

#include "../config.cuh"
#include "../util_namespace.cuh"
#include "dispatch/dispatch_adjacent_difference.cuh"
#include <cub/config.cuh>
#include <cub/detail/choose_offset.cuh>
#include <cub/device/dispatch/dispatch_adjacent_difference.cuh>
#include <cub/util_namespace.cuh>

#include <thrust/detail/integer_traits.h>
#include <thrust/detail/cstdint.h>
Expand Down Expand Up @@ -98,9 +99,9 @@ CUB_NAMESPACE_BEGIN
struct DeviceAdjacentDifference
{
private:

template <bool in_place,
bool read_left,
typename NumItemsT,
typename InputIteratorT,
typename OutputIteratorT,
typename DifferenceOpT>
Expand All @@ -109,53 +110,28 @@ private:
std::size_t &temp_storage_bytes,
InputIteratorT d_input,
OutputIteratorT d_output,
std::size_t num_items,
NumItemsT num_items,
DifferenceOpT difference_op,
cudaStream_t stream,
bool debug_synchronous)
{
const auto uint64_threshold = static_cast<std::size_t>(
THRUST_NS_QUALIFIER::detail::integer_traits<
THRUST_NS_QUALIFIER::detail::int32_t>::const_max);

if (num_items <= uint64_threshold)
{
using OffsetT = std::uint32_t;
using DispatchT = DispatchAdjacentDifference<InputIteratorT,
OutputIteratorT,
DifferenceOpT,
OffsetT,
in_place,
read_left>;
using OffsetT = typename detail::ChooseOffsetT<NumItemsT>::Type;

return DispatchT::Dispatch(d_temp_storage,
temp_storage_bytes,
d_input,
d_output,
static_cast<OffsetT>(num_items),
difference_op,
stream,
debug_synchronous);
}
else
{
using OffsetT = std::uint64_t;
using DispatchT = DispatchAdjacentDifference<InputIteratorT,
OutputIteratorT,
DifferenceOpT,
OffsetT,
in_place,
read_left>;
using DispatchT = DispatchAdjacentDifference<InputIteratorT,
OutputIteratorT,
DifferenceOpT,
OffsetT,
in_place,
read_left>;

return DispatchT::Dispatch(d_temp_storage,
temp_storage_bytes,
d_input,
d_output,
static_cast<OffsetT>(num_items),
difference_op,
stream,
debug_synchronous);
}
return DispatchT::Dispatch(d_temp_storage,
temp_storage_bytes,
d_input,
d_output,
static_cast<OffsetT>(num_items),
difference_op,
stream,
debug_synchronous);
}

public:
Expand Down Expand Up @@ -234,6 +210,8 @@ public:
* Its `result_type` is convertible to a type in `OutputIteratorT`'s set of
* `value_types`.
*
* @tparam NumItemsT **[inferred]** Type of num_items
*
* @param[in] d_temp_storage
* Device-accessible allocation of temporary storage. When `nullptr`, the
* required allocation size is written to `temp_storage_bytes` and no work
Expand Down Expand Up @@ -265,13 +243,14 @@ public:
*/
template <typename InputIteratorT,
typename OutputIteratorT,
typename DifferenceOpT = cub::Difference>
typename DifferenceOpT = cub::Difference,
typename NumItemsT = std::uint32_t>
static CUB_RUNTIME_FUNCTION cudaError_t
SubtractLeftCopy(void *d_temp_storage,
std::size_t &temp_storage_bytes,
InputIteratorT d_input,
OutputIteratorT d_output,
std::size_t num_items,
NumItemsT num_items,
DifferenceOpT difference_op = {},
cudaStream_t stream = 0,
bool debug_synchronous = false)
Expand Down Expand Up @@ -353,6 +332,8 @@ public:
* Its `result_type` is convertible to a type in `RandomAccessIteratorT`'s
* set of `value_types`.
*
* @tparam NumItemsT **[inferred]** Type of num_items
*
* @param[in] d_temp_storage
* Device-accessible allocation of temporary storage. When `nullptr`, the
* required allocation size is written to `temp_storage_bytes` and no work
Expand Down Expand Up @@ -380,12 +361,13 @@ public:
* be printed to the console. Default is `false`.
*/
template <typename RandomAccessIteratorT,
typename DifferenceOpT = cub::Difference>
typename DifferenceOpT = cub::Difference,
typename NumItemsT = std::uint32_t>
static CUB_RUNTIME_FUNCTION cudaError_t
SubtractLeft(void *d_temp_storage,
std::size_t &temp_storage_bytes,
RandomAccessIteratorT d_input,
std::size_t num_items,
NumItemsT num_items,
DifferenceOpT difference_op = {},
cudaStream_t stream = 0,
bool debug_synchronous = false)
Expand Down Expand Up @@ -477,6 +459,8 @@ public:
* Its `result_type` is convertible to a type in `RandomAccessIteratorT`'s
* set of `value_types`.
*
* @tparam NumItemsT **[inferred]** Type of num_items
*
* @param[in] d_temp_storage
* Device-accessible allocation of temporary storage. When `nullptr`, the
* required allocation size is written to `temp_storage_bytes` and no work
Expand Down Expand Up @@ -508,13 +492,14 @@ public:
*/
template <typename InputIteratorT,
typename OutputIteratorT,
typename DifferenceOpT = cub::Difference>
typename DifferenceOpT = cub::Difference,
typename NumItemsT = std::uint32_t>
static CUB_RUNTIME_FUNCTION cudaError_t
SubtractRightCopy(void *d_temp_storage,
std::size_t &temp_storage_bytes,
InputIteratorT d_input,
OutputIteratorT d_output,
std::size_t num_items,
NumItemsT num_items,
DifferenceOpT difference_op = {},
cudaStream_t stream = 0,
bool debug_synchronous = false)
Expand Down Expand Up @@ -586,6 +571,8 @@ public:
* Its `result_type` is convertible to a type in `RandomAccessIteratorT`'s
* set of `value_types`.
*
* @tparam NumItemsT **[inferred]** Type of num_items
*
* @param[in] d_temp_storage
* Device-accessible allocation of temporary storage. When `nullptr`, the
* required allocation size is written to `temp_storage_bytes` and no work
Expand Down Expand Up @@ -613,12 +600,13 @@ public:
* printed to the console. Default is `false`.
*/
template <typename RandomAccessIteratorT,
typename DifferenceOpT = cub::Difference>
typename DifferenceOpT = cub::Difference,
typename NumItemsT = std::uint32_t>
static CUB_RUNTIME_FUNCTION cudaError_t
SubtractRight(void *d_temp_storage,
std::size_t &temp_storage_bytes,
RandomAccessIteratorT d_input,
std::size_t num_items,
NumItemsT num_items,
DifferenceOpT difference_op = {},
cudaStream_t stream = 0,
bool debug_synchronous = false)
Expand Down
26 changes: 3 additions & 23 deletions cub/device/device_radix_sort.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -34,32 +34,12 @@

#pragma once

#include <stdio.h>
#include <iterator>
#include <type_traits>

#include "dispatch/dispatch_radix_sort.cuh"
#include "../config.cuh"
#include <cub/config.cuh>
#include <cub/detail/choose_offset.cuh>
#include <cub/device/dispatch/dispatch_radix_sort.cuh>

CUB_NAMESPACE_BEGIN

namespace detail {
/** ChooseOffsetT checks NumItemsT, the type of the num_items parameter, and
* selects the offset type based on it. */
template <typename NumItemsT>
struct ChooseOffsetT
{
// NumItemsT must be an integral type (but not bool).
static_assert(std::is_integral<NumItemsT>::value &&
!std::is_same<typename std::remove_cv<NumItemsT>::type, bool>::value,
"NumItemsT must be an integral type, but not bool");

// Unsigned integer type for global offsets.
using Type = typename std::conditional<sizeof(NumItemsT) <= 4, uint32_t, unsigned long long>::type;
};

} // namespace detail

/**
* \brief DeviceRadixSort provides device-wide, parallel operations for computing a radix sort across a sequence of data items residing within device-accessible memory. ![](sorting_logo.png)
* \ingroup SingleModule
Expand Down
Loading