Skip to content
This repository has been archived by the owner on Mar 21, 2024. It is now read-only.

Add libcu++ dependency; initial round of NV_IF_TARGET ports. #448

Merged
merged 7 commits into from
May 17, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 1 addition & 6 deletions cub/agent/agent_histogram.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ template <
typename PrivatizedDecodeOpT, ///< The transform operator type for determining privatized counter indices from samples, one for each channel
typename OutputDecodeOpT, ///< The transform operator type for determining output bin-ids from privatized counter indices, one for each channel
typename OffsetT, ///< Signed integer type for global offsets
int PTX_ARCH = CUB_PTX_ARCH> ///< PTX compute capability
int LEGACY_PTX_ARCH = 0> ///< PTX compute capability (unused)
struct AgentHistogram
{
//---------------------------------------------------------------------
Expand Down Expand Up @@ -562,15 +562,10 @@ struct AgentHistogram
is_valid[PIXEL] = IS_FULL_TILE || (((threadIdx.x * PIXELS_PER_THREAD + PIXEL) * NUM_CHANNELS) < valid_samples);

// Accumulate samples
#if CUB_PTX_ARCH >= 120
if (prefer_smem)
AccumulateSmemPixels(samples, is_valid);
else
AccumulateGmemPixels(samples, is_valid);
#else
AccumulateGmemPixels(samples, is_valid);
#endif

}


Expand Down
2 changes: 1 addition & 1 deletion cub/agent/agent_rle.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ struct AgentRle
// Constants
enum
{
WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH),
WARP_THREADS = CUB_WARP_THREADS(0),
BLOCK_THREADS = AgentRlePolicyT::BLOCK_THREADS,
ITEMS_PER_THREAD = AgentRlePolicyT::ITEMS_PER_THREAD,
WARP_ITEMS = WARP_THREADS * ITEMS_PER_THREAD,
Expand Down
3 changes: 1 addition & 2 deletions cub/agent/agent_segment_fixup.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -111,8 +111,7 @@ struct AgentSegmentFixup
TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,

// Whether or not do fixup using RLE + global atomics
USE_ATOMIC_FIXUP = (CUB_PTX_ARCH >= 350) &&
(std::is_same<ValueT, float>::value ||
USE_ATOMIC_FIXUP = (std::is_same<ValueT, float>::value ||
std::is_same<ValueT, int>::value ||
std::is_same<ValueT, unsigned int>::value ||
std::is_same<ValueT, unsigned long long>::value),
Expand Down
2 changes: 1 addition & 1 deletion cub/agent/agent_spmv_orig.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ template <
typename OffsetT, ///< Signed integer type for sequence offsets
bool HAS_ALPHA, ///< Whether the input parameter \p alpha is 1
bool HAS_BETA, ///< Whether the input parameter \p beta is 0
int PTX_ARCH = CUB_PTX_ARCH> ///< PTX compute capability
int LEGACY_PTX_ARCH = 0> ///< PTX compute capability (unused)
struct AgentSpmv
{
//---------------------------------------------------------------------
Expand Down
33 changes: 24 additions & 9 deletions cub/agent/agent_sub_warp_merge_sort.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@
#include <cub/warp/warp_merge_sort.cuh>
#include <cub/warp/warp_store.cuh>

#include <nv/target>

#include <thrust/system/cuda/detail/core/util.h>


Expand Down Expand Up @@ -108,6 +110,23 @@ class AgentSubWarpSort
{
template <typename T>
__device__ bool operator()(T lhs, T rhs)
{
return this->impl(lhs, rhs);
}

#if defined(__CUDA_FP16_TYPES_EXIST__)
__device__ bool operator()(__half lhs, __half rhs)
{
// Need to explicitly cast to float for SM <= 52.
NV_IF_TARGET(NV_PROVIDES_SM_53,
(return this->impl(lhs, rhs);),
(return this->impl(__half2float(lhs), __half2float(rhs));));
}
#endif

private:
template <typename T>
__device__ bool impl(T lhs, T rhs)
{
if (IS_DESCENDING)
{
Expand All @@ -118,19 +137,15 @@ class AgentSubWarpSort
return lhs < rhs;
}
}

#if defined(__CUDA_FP16_TYPES_EXIST__) && (CUB_PTX_ARCH < 530)
__device__ bool operator()(__half lhs, __half rhs)
{
return (*this)(__half2float(lhs), __half2float(rhs));
}
#endif
};

#if defined(__CUDA_FP16_TYPES_EXIST__) && (CUB_PTX_ARCH < 530)
#if defined(__CUDA_FP16_TYPES_EXIST__)
__device__ static bool equal(__half lhs, __half rhs)
{
return __half2float(lhs) == __half2float(rhs);
// Need to explicitly cast to float for SM <= 52.
NV_IF_TARGET(NV_PROVIDES_SM_53,
(return lhs == rhs;),
(return __half2float(lhs) == __half2float(rhs);));
}
#endif

Expand Down
4 changes: 2 additions & 2 deletions cub/agent/single_pass_scan_operators.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -666,11 +666,11 @@ template <
typename T,
typename ScanOpT,
typename ScanTileStateT,
int PTX_ARCH = CUB_PTX_ARCH>
int LEGACY_PTX_ARCH = 0>
struct TilePrefixCallbackOp
{
// Parameterized warp reduce
typedef WarpReduce<T, CUB_PTX_WARP_THREADS, PTX_ARCH> WarpReduceT;
typedef WarpReduce<T, CUB_PTX_WARP_THREADS> WarpReduceT;

// Temporary storage type
struct _TempStorage
Expand Down
7 changes: 3 additions & 4 deletions cub/block/block_adjacent_difference.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,6 @@

CUB_NAMESPACE_BEGIN


/**
* @brief BlockAdjacentDifference provides
* [<em>collective</em>](index.html#sec0) methods for computing the
Expand Down Expand Up @@ -125,9 +124,9 @@ CUB_NAMESPACE_BEGIN
*/
template <typename T,
int BLOCK_DIM_X,
int BLOCK_DIM_Y = 1,
int BLOCK_DIM_Z = 1,
int PTX_ARCH = CUB_PTX_ARCH>
int BLOCK_DIM_Y = 1,
int BLOCK_DIM_Z = 1,
int LEGACY_PTX_ARCH = 0>
class BlockAdjacentDifference
{
private:
Expand Down
4 changes: 2 additions & 2 deletions cub/block/block_discontinuity.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ CUB_NAMESPACE_BEGIN
* \tparam BLOCK_DIM_X The thread block length in threads along the X dimension
* \tparam BLOCK_DIM_Y <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
* \tparam BLOCK_DIM_Z <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
* \tparam PTX_ARCH <b>[optional]</b> \ptxversion
* \tparam LEGACY_PTX_ARCH <b>[optional]</b> Unused.
*
* \par Overview
* - A set of "head flags" (or "tail flags") is often used to indicate corresponding items
Expand Down Expand Up @@ -107,7 +107,7 @@ template <
int BLOCK_DIM_X,
int BLOCK_DIM_Y = 1,
int BLOCK_DIM_Z = 1,
int PTX_ARCH = CUB_PTX_ARCH>
int LEGACY_PTX_ARCH = 0>
class BlockDiscontinuity
{
private:
Expand Down
9 changes: 4 additions & 5 deletions cub/block/block_exchange.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ CUB_NAMESPACE_BEGIN
* \tparam WARP_TIME_SLICING <b>[optional]</b> When \p true, only use enough shared memory for a single warp's worth of tile data, time-slicing the block-wide exchange over multiple synchronized rounds. Yields a smaller memory footprint at the expense of decreased parallelism. (Default: false)
* \tparam BLOCK_DIM_Y <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
* \tparam BLOCK_DIM_Z <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
* \tparam PTX_ARCH <b>[optional]</b> \ptxversion
* \tparam LEGACY_PTX_ARCH <b>[optional]</b> Unused.
*
* \par Overview
* - It is commonplace for blocks of threads to rearrange data items between
Expand Down Expand Up @@ -114,7 +114,7 @@ template <
bool WARP_TIME_SLICING = false,
int BLOCK_DIM_Y = 1,
int BLOCK_DIM_Z = 1,
int PTX_ARCH = CUB_PTX_ARCH>
int LEGACY_PTX_ARCH = 0>
class BlockExchange
{
private:
Expand All @@ -129,11 +129,11 @@ private:
/// The thread block size in threads
BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,

LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(PTX_ARCH),
LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(0),
WARP_THREADS = 1 << LOG_WARP_THREADS,
WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,

LOG_SMEM_BANKS = CUB_LOG_SMEM_BANKS(PTX_ARCH),
LOG_SMEM_BANKS = CUB_LOG_SMEM_BANKS(0),
SMEM_BANKS = 1 << LOG_SMEM_BANKS,

TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
Expand Down Expand Up @@ -1126,4 +1126,3 @@ public:


CUB_NAMESPACE_END

20 changes: 4 additions & 16 deletions cub/block/block_histogram.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ enum BlockHistogramAlgorithm
* \tparam ALGORITHM <b>[optional]</b> cub::BlockHistogramAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_HISTO_SORT)
* \tparam BLOCK_DIM_Y <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
* \tparam BLOCK_DIM_Z <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
* \tparam PTX_ARCH <b>[optional]</b> \ptxversion
* \tparam LEGACY_PTX_ARCH <b>[optional]</b> Unused.
*
* \par Overview
* - A <a href="http://en.wikipedia.org/wiki/Histogram"><em>histogram</em></a>
Expand Down Expand Up @@ -160,7 +160,7 @@ template <
BlockHistogramAlgorithm ALGORITHM = BLOCK_HISTO_SORT,
int BLOCK_DIM_Y = 1,
int BLOCK_DIM_Z = 1,
int PTX_ARCH = CUB_PTX_ARCH>
int LEGACY_PTX_ARCH = 0>
class BlockHistogram
{
private:
Expand All @@ -176,27 +176,15 @@ private:
BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
};

/**
* Ensure the template parameterization meets the requirements of the
* targeted device architecture. BLOCK_HISTO_ATOMIC can only be used
* on version SM120 or later. Otherwise BLOCK_HISTO_SORT is used
* regardless.
*/
static const BlockHistogramAlgorithm SAFE_ALGORITHM =
((ALGORITHM == BLOCK_HISTO_ATOMIC) && (PTX_ARCH < 120)) ?
BLOCK_HISTO_SORT :
ALGORITHM;

/// Internal specialization.
using InternalBlockHistogram =
cub::detail::conditional_t<SAFE_ALGORITHM == BLOCK_HISTO_SORT,
cub::detail::conditional_t<ALGORITHM == BLOCK_HISTO_SORT,
BlockHistogramSort<T,
BLOCK_DIM_X,
ITEMS_PER_THREAD,
BINS,
BLOCK_DIM_Y,
BLOCK_DIM_Z,
PTX_ARCH>,
BLOCK_DIM_Z>,
BlockHistogramAtomic<BINS>>;

/// Shared memory storage layout type for BlockHistogram
Expand Down
14 changes: 7 additions & 7 deletions cub/block/block_load.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -568,7 +568,7 @@ enum BlockLoadAlgorithm
* \tparam WARP_TIME_SLICING <b>[optional]</b> Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any load-related data transpositions (versus each warp having its own storage). (default: false)
* \tparam BLOCK_DIM_Y <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
* \tparam BLOCK_DIM_Z <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
* \tparam PTX_ARCH <b>[optional]</b> \ptxversion
* \tparam LEGACY_PTX_ARCH <b>[optional]</b> Unused.
*
* \par Overview
* - The BlockLoad class provides a single data movement abstraction that can be specialized
Expand Down Expand Up @@ -638,7 +638,7 @@ template <
BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT,
int BLOCK_DIM_Y = 1,
int BLOCK_DIM_Z = 1,
int PTX_ARCH = CUB_PTX_ARCH>
int LEGACY_PTX_ARCH = 0>
class BlockLoad
{
private:
Expand Down Expand Up @@ -860,7 +860,7 @@ private:
struct LoadInternal<BLOCK_LOAD_TRANSPOSE, DUMMY>
{
// BlockExchange utility type for keys
typedef BlockExchange<InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;
typedef BlockExchange<InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z> BlockExchange;

/// Shared memory storage layout type
struct _TempStorage : BlockExchange::TempStorage
Expand Down Expand Up @@ -928,14 +928,14 @@ private:
{
enum
{
WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH)
WARP_THREADS = CUB_WARP_THREADS(0)
};

// Assert BLOCK_THREADS must be a multiple of WARP_THREADS
CUB_STATIC_ASSERT((int(BLOCK_THREADS) % int(WARP_THREADS) == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS");

// BlockExchange utility type for keys
typedef BlockExchange<InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;
typedef BlockExchange<InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z> BlockExchange;

/// Shared memory storage layout type
struct _TempStorage : BlockExchange::TempStorage
Expand Down Expand Up @@ -1003,14 +1003,14 @@ private:
{
enum
{
WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH)
WARP_THREADS = CUB_WARP_THREADS(0)
};

// Assert BLOCK_THREADS must be a multiple of WARP_THREADS
CUB_STATIC_ASSERT((int(BLOCK_THREADS) % int(WARP_THREADS) == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS");

// BlockExchange utility type for keys
typedef BlockExchange<InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, true, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;
typedef BlockExchange<InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, true, BLOCK_DIM_Y, BLOCK_DIM_Z> BlockExchange;

/// Shared memory storage layout type
struct _TempStorage : BlockExchange::TempStorage
Expand Down
18 changes: 8 additions & 10 deletions cub/block/block_radix_rank.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ struct BlockRadixRankEmptyCallback
* \tparam SMEM_CONFIG <b>[optional]</b> Shared memory bank mode (default: \p cudaSharedMemBankSizeFourByte)
* \tparam BLOCK_DIM_Y <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
* \tparam BLOCK_DIM_Z <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
* \tparam PTX_ARCH <b>[optional]</b> \ptxversion
* \tparam LEGACY_PTX_ARCH <b>[optional]</b> Unused.
*
* \par Overview
* Blah...
Expand Down Expand Up @@ -138,12 +138,12 @@ template <
int BLOCK_DIM_X,
int RADIX_BITS,
bool IS_DESCENDING,
bool MEMOIZE_OUTER_SCAN = (CUB_PTX_ARCH >= 350) ? true : false,
bool MEMOIZE_OUTER_SCAN = true,
BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS,
cudaSharedMemConfig SMEM_CONFIG = cudaSharedMemBankSizeFourByte,
int BLOCK_DIM_Y = 1,
int BLOCK_DIM_Z = 1,
int PTX_ARCH = CUB_PTX_ARCH>
int LEGACY_PTX_ARCH = 0>
class BlockRadixRank
{
private:
Expand All @@ -168,7 +168,7 @@ private:

RADIX_DIGITS = 1 << RADIX_BITS,

LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(PTX_ARCH),
LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(0),
WARP_THREADS = 1 << LOG_WARP_THREADS,
WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,

Expand Down Expand Up @@ -203,8 +203,7 @@ private:
BLOCK_DIM_X,
INNER_SCAN_ALGORITHM,
BLOCK_DIM_Y,
BLOCK_DIM_Z,
PTX_ARCH>
BLOCK_DIM_Z>
BlockScan;


Expand Down Expand Up @@ -508,7 +507,7 @@ template <
BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS,
int BLOCK_DIM_Y = 1,
int BLOCK_DIM_Z = 1,
int PTX_ARCH = CUB_PTX_ARCH>
int LEGACY_PTX_ARCH = 0>
class BlockRadixRankMatch
{
private:
Expand All @@ -527,7 +526,7 @@ private:

RADIX_DIGITS = 1 << RADIX_BITS,

LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(PTX_ARCH),
LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(0),
WARP_THREADS = 1 << LOG_WARP_THREADS,
WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,

Expand Down Expand Up @@ -558,8 +557,7 @@ private:
BLOCK_THREADS,
INNER_SCAN_ALGORITHM,
BLOCK_DIM_Y,
BLOCK_DIM_Z,
PTX_ARCH>
BLOCK_DIM_Z>
BlockScanT;


Expand Down
Loading