1.0.1 snap
- Updated radix sort test
- Updated dox

Former-commit-id: 41fb04c
dumerrill committed Aug 9, 2013
1 parent 1504496 commit 215f377
Showing 166 changed files with 400 additions and 286 deletions.
27 changes: 20 additions & 7 deletions CHANGE_LOG.TXT
@@ -1,10 +1,23 @@
//-----------------------------------------------------------------------------

-1.0.1 ...
+1.0.1 08/08/2013
+- New collective interface idiom (specialize::construct::invoke).
+- Added best-in-class DeviceRadixSort. Implements short-circuiting for homogeneous digit passes.
+- Added best-in-class DeviceScan. Implements single-pass "adaptive-lookback" strategy.
+- Significantly improved documentation (with example code snippets)
+- More extensive regression test suite for aggressively testing collective variants
+- Allow non-trivially-constructed types (previously unions had prevented aliasing temporary storage of those types)
+- Improved support for Kepler SHFL (collective ops now use SHFL for types larger than 32b)
+- Better code generation for 64-bit addressing within BlockLoad/BlockStore
+- DeviceHistogram now supports histograms with arbitrary numbers of bins
+- Misc. fixes
+- Workarounds for SM10 codegen issues in uncommonly-used WarpScan/Reduce specializations
+- Updates to accommodate CUDA 5.5 dynamic parallelism


//-----------------------------------------------------------------------------

0.9.4 05/07/2013

- Fixed compilation errors for SM10-SM13
- Fixed compilation errors for some WarpScan entrypoints on SM30+
@@ -26,7 +39,7 @@

//-----------------------------------------------------------------------------

0.9.2 04/04/2013

- Added WarpReduce. WarpReduce uses the SHFL instruction when applicable.
BlockReduce now uses this WarpReduce instead of implementing its own.
@@ -35,7 +48,7 @@

//-----------------------------------------------------------------------------

0.9.1 03/09/2013

- Fix for ambiguity in BlockScan::Reduce() between generic reduction and
summation. Summation entrypoints are now called ::Sum(), similar to the
@@ -44,9 +57,9 @@

//-----------------------------------------------------------------------------

0.9.0 03/07/2013

- Initial "preview" release. CUB is the first durable, high-performance library
of cooperative block-level, warp-level, and thread-level primitives for CUDA
kernel programming. More primitives and examples coming soon!
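
Both device-wide primitives named above follow CUB's two-phase temporary-storage idiom: a first call with a NULL d_temp_storage pointer only reports the required allocation size, and a second call does the work. A minimal host-side sketch for the new DeviceScan (buffer names and the omitted error handling are illustrative assumptions, not code from this commit):

```C++
#include <cub/cub.cuh>

// Exclusive prefix sum over num_items ints already resident on the device
void ScanExample(int *d_in, int *d_out, int num_items)
{
    void   *d_temp_storage     = NULL;
    size_t  temp_storage_bytes = 0;

    // First call: no scanning is performed; temp_storage_bytes is written
    // with the size of the temporary allocation the scan requires
    cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);

    cudaMalloc(&d_temp_storage, temp_storage_bytes);

    // Second call: runs the single-pass "adaptive-lookback" scan
    cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);

    cudaFree(d_temp_storage);
}
```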
68 changes: 39 additions & 29 deletions README.md
@@ -7,48 +7,58 @@
CUB is a library of cooperative threadblock primitives and other high performance
utilities for CUDA kernel programming. CUB enhances productivity, performance, and portability by
providing an abstraction layer over complex threadblock, warp, and thread-level operations.

-![SIMT abstraction layer](http://nvlabs.github.com/cub/simt_abstraction.png)
+![Orientation of collective primitives within the CUDA software stack](http://nvlabs.github.com/cub/cub_overview.png)

<br><hr>
-<h3>Recent news</h3>
+<h3>Releases</h3>

-| Date | Topic | Description |
-| ---- | ------- | ----------- |
-| 04/04/2013 | [CUB v0.9.2 Update Release](https://github.com/NVlabs/cub/archive/0.9.2.zip) | Minor cosmetic, feature, and compilation updates. See the [change-log](https://github.com/NVlabs/cub/blob/master/CHANGE_LOG.TXT) for further details. |
-| 03/06/2013 | [CUB v0.9 Preview Release](https://github.com/NVlabs/cub/archive/0.9.zip) | CUB is the first durable, high-performance library of cooperative threadblock, warp, and thread primitives for CUDA kernel programming. More primitives and examples coming soon! |
+See [CUB Project Website](http://nvlabs.github.com/cub) for more information.

+| Date | Version |
+| ---- | ------- |
+| 08/08/2013 | [CUB v1.0.1 Primary Release](https://github.com/NVlabs/cub/archive/1.0.1.zip) |
+| 05/07/2013 | [CUB v0.9.4 Update Release](https://github.com/NVlabs/cub/archive/0.9.4.zip) |
+| 04/04/2013 | [CUB v0.9.2 Update Release](https://github.com/NVlabs/cub/archive/0.9.2.zip) |
+| 03/09/2013 | [CUB v0.9.1 Update Release](https://github.com/NVlabs/cub/archive/0.9.1.zip) |
+| 03/07/2013 | [CUB v0.9.0 Preview Release](https://github.com/NVlabs/cub/archive/0.9.zip) |

<br><hr>
<h3>A Simple Example</h3>

```C++
#include <cub/cub.cuh>

-// An exclusive prefix sum CUDA kernel (for a single-threadblock grid)
-template <
-    int BLOCK_THREADS,              // Threads per threadblock
-    int ITEMS_PER_THREAD,           // Items per thread
-    typename T>                     // Data type
-__global__ void PrefixSumKernel(T *d_in, T *d_out)
+// Block-sorting CUDA kernel
+__global__ void BlockSortKernel(int *d_in, int *d_out)
{
-    using namespace cub;
-
-    // Parameterize BlockScan for the current execution context
-    typedef BlockScan<T, BLOCK_THREADS> BlockScan;
-
-    // The shared memory needed by BlockScan
-    __shared__ typename BlockScan::TempStorage temp_storage;
-
-    // A segment of data items per thread
-    T data[ITEMS_PER_THREAD];
-
-    // Load a tile of data using vector-load instructions
-    LoadBlockedVectorized(data, d_in, 0);
-
-    // Perform an exclusive prefix sum across the tile of data
-    BlockScan::ExclusiveSum(temp_storage, data, data);
-
-    // Store a tile of data using vector-store instructions
-    StoreBlockedVectorized(data, d_out, 0);
+    using namespace cub;
+
+    // Specialize BlockRadixSort, BlockLoad, and BlockStore for 128 threads
+    // owning 16 integer items each
+    typedef BlockRadixSort<int, 128, 16>                     BlockRadixSort;
+    typedef BlockLoad<int*, 128, 16, BLOCK_LOAD_TRANSPOSE>   BlockLoad;
+    typedef BlockStore<int*, 128, 16, BLOCK_STORE_TRANSPOSE> BlockStore;
+
+    // Allocate shared memory
+    __shared__ union {
+        typename BlockRadixSort::TempStorage  sort;
+        typename BlockLoad::TempStorage       load;
+        typename BlockStore::TempStorage      store;
+    } temp_storage;
+
+    int block_offset = blockIdx.x * (128 * 16);    // Offset for this block's segment
+
+    // Obtain a segment of 2048 consecutive keys that are blocked across threads
+    int thread_keys[16];
+    BlockLoad(temp_storage.load).Load(d_in + block_offset, thread_keys);
+    __syncthreads();
+
+    // Collectively sort the keys
+    BlockRadixSort(temp_storage.sort).Sort(thread_keys);
+    __syncthreads();
+
+    // Store the sorted segment
+    BlockStore(temp_storage.store).Store(d_out + block_offset, thread_keys);
}
```
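
A host-side sketch of how the new kernel might be launched (grid sizing, buffer names, and the multiple-of-2048 size are illustrative assumptions, not part of the README):

```C++
// Launch one 128-thread block per 2048-key tile. Assumes d_in/d_out are device
// allocations holding num_items ints and num_items is a multiple of 2048.
void BlockSortExample(int *d_in, int *d_out, int num_items)
{
    const int TILE_SIZE = 128 * 16;            // keys sorted per block
    int grid_size = num_items / TILE_SIZE;     // one block per tile
    BlockSortKernel<<<grid_size, 128>>>(d_in, d_out);
    cudaDeviceSynchronize();                   // wait, surfacing any runtime errors
}
```

Note that each block sorts only its own 2048-key segment; a single device-wide sort is what the new DeviceRadixSort primitive provides.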
@@ -28,7 +28,7 @@

/**
* \file
- * BlockRadixSortScatterTiles implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort downsweep.
+ * BlockRadixSortDownsweepTiles implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort downsweep.
*/


@@ -66,7 +66,7 @@ enum RadixSortScatterAlgorithm


/**
- * Tuning policy for BlockRadixSortScatterTiles
+ * Tuning policy for BlockRadixSortDownsweepTiles
*/
template <
int _BLOCK_THREADS, ///< The number of threads per CTA
Expand All @@ -79,7 +79,7 @@ template <
RadixSortScatterAlgorithm _SCATTER_ALGORITHM, ///< The scattering strategy to use
cudaSharedMemConfig _SMEM_CONFIG, ///< Shared memory bank mode (default: \p cudaSharedMemBankSizeFourByte)
int _RADIX_BITS> ///< The number of radix bits, i.e., log2(bins)
-struct BlockRadixSortScatterTilesPolicy
+struct BlockRadixSortDownsweepTilesPolicy
{
enum
{
Expand All @@ -97,7 +97,7 @@ struct BlockRadixSortScatterTilesPolicy
static const RadixSortScatterAlgorithm SCATTER_ALGORITHM = _SCATTER_ALGORITHM;
static const cudaSharedMemConfig SMEM_CONFIG = _SMEM_CONFIG;

-    typedef BlockRadixSortScatterTilesPolicy<
+    typedef BlockRadixSortDownsweepTilesPolicy<
BLOCK_THREADS,
ITEMS_PER_THREAD,
LOAD_ALGORITHM,
Expand All @@ -120,11 +120,11 @@ struct BlockRadixSortScatterTilesPolicy
* a range of input tiles.
*/
template <
-    typename BlockRadixSortScatterTilesPolicy,
+    typename BlockRadixSortDownsweepTilesPolicy,
typename Key,
typename Value,
typename SizeT>
-struct BlockRadixSortScatterTiles
+struct BlockRadixSortDownsweepTiles
{
//---------------------------------------------------------------------
// Type definitions and constants
Expand All @@ -136,19 +136,19 @@ struct BlockRadixSortScatterTiles
static const UnsignedBits MIN_KEY = Traits<Key>::MIN_KEY;
static const UnsignedBits MAX_KEY = Traits<Key>::MAX_KEY;

-    static const BlockLoadAlgorithm LOAD_ALGORITHM = BlockRadixSortScatterTilesPolicy::LOAD_ALGORITHM;
-    static const PtxLoadModifier LOAD_MODIFIER = BlockRadixSortScatterTilesPolicy::LOAD_MODIFIER;
-    static const BlockScanAlgorithm INNER_SCAN_ALGORITHM = BlockRadixSortScatterTilesPolicy::INNER_SCAN_ALGORITHM;
-    static const RadixSortScatterAlgorithm SCATTER_ALGORITHM = BlockRadixSortScatterTilesPolicy::SCATTER_ALGORITHM;
-    static const cudaSharedMemConfig SMEM_CONFIG = BlockRadixSortScatterTilesPolicy::SMEM_CONFIG;
+    static const BlockLoadAlgorithm LOAD_ALGORITHM = BlockRadixSortDownsweepTilesPolicy::LOAD_ALGORITHM;
+    static const PtxLoadModifier LOAD_MODIFIER = BlockRadixSortDownsweepTilesPolicy::LOAD_MODIFIER;
+    static const BlockScanAlgorithm INNER_SCAN_ALGORITHM = BlockRadixSortDownsweepTilesPolicy::INNER_SCAN_ALGORITHM;
+    static const RadixSortScatterAlgorithm SCATTER_ALGORITHM = BlockRadixSortDownsweepTilesPolicy::SCATTER_ALGORITHM;
+    static const cudaSharedMemConfig SMEM_CONFIG = BlockRadixSortDownsweepTilesPolicy::SMEM_CONFIG;

enum
{
-        BLOCK_THREADS = BlockRadixSortScatterTilesPolicy::BLOCK_THREADS,
-        ITEMS_PER_THREAD = BlockRadixSortScatterTilesPolicy::ITEMS_PER_THREAD,
-        EXCHANGE_TIME_SLICING = BlockRadixSortScatterTilesPolicy::EXCHANGE_TIME_SLICING,
-        RADIX_BITS = BlockRadixSortScatterTilesPolicy::RADIX_BITS,
-        MEMOIZE_OUTER_SCAN = BlockRadixSortScatterTilesPolicy::MEMOIZE_OUTER_SCAN,
+        BLOCK_THREADS = BlockRadixSortDownsweepTilesPolicy::BLOCK_THREADS,
+        ITEMS_PER_THREAD = BlockRadixSortDownsweepTilesPolicy::ITEMS_PER_THREAD,
+        EXCHANGE_TIME_SLICING = BlockRadixSortDownsweepTilesPolicy::EXCHANGE_TIME_SLICING,
+        RADIX_BITS = BlockRadixSortDownsweepTilesPolicy::RADIX_BITS,
+        MEMOIZE_OUTER_SCAN = BlockRadixSortDownsweepTilesPolicy::MEMOIZE_OUTER_SCAN,
TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,

RADIX_DIGITS = 1 << RADIX_BITS,
@@ -617,7 +617,7 @@ struct BlockRadixSortScatterTiles
/**
* Constructor
*/
-    __device__ __forceinline__ BlockRadixSortScatterTiles(
+    __device__ __forceinline__ BlockRadixSortDownsweepTiles(
TempStorage &temp_storage,
SizeT bin_offset,
Key *d_keys_in,
Expand All @@ -640,7 +640,7 @@ struct BlockRadixSortScatterTiles
/**
* Constructor
*/
-    __device__ __forceinline__ BlockRadixSortScatterTiles(
+    __device__ __forceinline__ BlockRadixSortDownsweepTiles(
TempStorage &temp_storage,
SizeT num_items,
SizeT *d_spine,
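
To make the tuning-policy pattern above concrete, here is a hypothetical specialization; parameter order and any parameters or enumerators not visible in this diff are assumptions for illustration:

```C++
// Hypothetical downsweep tuning policy: 64 threads, 17 keys per thread,
// 5 radix bits (2^5 = 32 bins). Enumerator choices are illustrative only.
typedef BlockRadixSortDownsweepTilesPolicy<
    64,                             // _BLOCK_THREADS: threads per CTA
    17,                             // _ITEMS_PER_THREAD: keys per thread per tile
    BLOCK_LOAD_DIRECT,              // _LOAD_ALGORITHM
    LOAD_DEFAULT,                   // _LOAD_MODIFIER
    false,                          // _EXCHANGE_TIME_SLICING (assumed parameter)
    false,                          // _MEMOIZE_OUTER_SCAN (assumed parameter)
    BLOCK_SCAN_WARP_SCANS,          // _INNER_SCAN_ALGORITHM
    RADIX_SORT_SCATTER_TWO_PHASE,   // _SCATTER_ALGORITHM (assumed enumerator)
    cudaSharedMemBankSizeFourByte,  // _SMEM_CONFIG
    5>                              // _RADIX_BITS: log2(bins)
    DownsweepPolicy;
```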
@@ -28,7 +28,7 @@

/**
* \file
- * BlockRadixSortHistoTiles implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort upsweep.
+ * BlockRadixSortUpsweepTiles implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort upsweep.
*/

#pragma once
Expand All @@ -49,14 +49,14 @@ namespace cub {
******************************************************************************/

/**
- * Tuning policy for BlockRadixSortHistoTiles
+ * Tuning policy for BlockRadixSortUpsweepTiles
*/
template <
int _BLOCK_THREADS, ///< The number of threads per CTA
int _ITEMS_PER_THREAD, ///< The number of items to load per thread per tile
PtxLoadModifier _LOAD_MODIFIER, ///< Load cache-modifier
int _RADIX_BITS> ///< The number of radix bits, i.e., log2(bins)
-struct BlockRadixSortHistoTilesPolicy
+struct BlockRadixSortUpsweepTilesPolicy
{
enum
{
Expand All @@ -68,7 +68,7 @@ struct BlockRadixSortHistoTilesPolicy

static const PtxLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER;

-    typedef BlockRadixSortHistoTilesPolicy<
+    typedef BlockRadixSortUpsweepTilesPolicy<
BLOCK_THREADS,
ITEMS_PER_THREAD,
LOAD_MODIFIER,
Expand All @@ -81,15 +81,15 @@ struct BlockRadixSortHistoTilesPolicy
******************************************************************************/

/**
- * \brief BlockRadixSortHistoTiles implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort upsweep.
+ * \brief BlockRadixSortUpsweepTiles implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort upsweep.
*
* Computes radix digit histograms over a range of input tiles.
*/
template <
-    typename BlockRadixSortHistoTilesPolicy,
+    typename BlockRadixSortUpsweepTilesPolicy,
typename Key,
typename SizeT>
-struct BlockRadixSortHistoTiles
+struct BlockRadixSortUpsweepTiles
{

//---------------------------------------------------------------------
Expand All @@ -104,13 +104,13 @@ struct BlockRadixSortHistoTiles
// Integer type for packing DigitCounters into columns of shared memory banks
typedef unsigned int PackedCounter;

-    static const PtxLoadModifier LOAD_MODIFIER = BlockRadixSortHistoTilesPolicy::LOAD_MODIFIER;
+    static const PtxLoadModifier LOAD_MODIFIER = BlockRadixSortUpsweepTilesPolicy::LOAD_MODIFIER;

enum
{
-        RADIX_BITS = BlockRadixSortHistoTilesPolicy::RADIX_BITS,
-        BLOCK_THREADS = BlockRadixSortHistoTilesPolicy::BLOCK_THREADS,
-        KEYS_PER_THREAD = BlockRadixSortHistoTilesPolicy::ITEMS_PER_THREAD,
+        RADIX_BITS = BlockRadixSortUpsweepTilesPolicy::RADIX_BITS,
+        BLOCK_THREADS = BlockRadixSortUpsweepTilesPolicy::BLOCK_THREADS,
+        KEYS_PER_THREAD = BlockRadixSortUpsweepTilesPolicy::ITEMS_PER_THREAD,

RADIX_DIGITS = 1 << RADIX_BITS,

@@ -192,7 +192,7 @@

// BucketKeys
static __device__ __forceinline__ void BucketKeys(
-        BlockRadixSortHistoTiles &cta,
+        BlockRadixSortUpsweepTiles &cta,
UnsignedBits keys[KEYS_PER_THREAD])
{
cta.Bucket(keys[COUNT]);
Expand All @@ -202,7 +202,7 @@ struct BlockRadixSortHistoTiles
}

// ProcessTiles
-    static __device__ __forceinline__ void ProcessTiles(BlockRadixSortHistoTiles &cta, SizeT block_offset)
+    static __device__ __forceinline__ void ProcessTiles(BlockRadixSortUpsweepTiles &cta, SizeT block_offset)
{
// Next
Iterate<1, HALF>::ProcessTiles(cta, block_offset);
Expand All @@ -215,10 +215,10 @@ struct BlockRadixSortHistoTiles
struct Iterate<MAX, MAX>
{
// BucketKeys
-        static __device__ __forceinline__ void BucketKeys(BlockRadixSortHistoTiles &cta, UnsignedBits keys[KEYS_PER_THREAD]) {}
+        static __device__ __forceinline__ void BucketKeys(BlockRadixSortUpsweepTiles &cta, UnsignedBits keys[KEYS_PER_THREAD]) {}

// ProcessTiles
-        static __device__ __forceinline__ void ProcessTiles(BlockRadixSortHistoTiles &cta, SizeT block_offset)
+        static __device__ __forceinline__ void ProcessTiles(BlockRadixSortUpsweepTiles &cta, SizeT block_offset)
{
cta.ProcessFullTile(block_offset);
}
@@ -392,7 +392,7 @@ struct BlockRadixSortHistoTiles
/**
* Constructor
*/
-    __device__ __forceinline__ BlockRadixSortHistoTiles(
+    __device__ __forceinline__ BlockRadixSortUpsweepTiles(
TempStorage &temp_storage,
Key *d_keys_in,
int current_bit)
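
The Iterate<COUNT, MAX> machinery in the upsweep excerpt above is a standard compile-time unrolling idiom: a recursive template peels one iteration per instantiation, and the Iterate<MAX, MAX> specialization terminates the recursion. A simplified standalone sketch of the same pattern (not CUB's actual code):

```C++
// Unroll a fixed-trip loop at compile time via recursive template specialization
template <int COUNT, int MAX>
struct Unroll
{
    static __device__ __forceinline__ void Sum(const int (&keys)[MAX], int &total)
    {
        total += keys[COUNT];                       // body for iteration COUNT
        Unroll<COUNT + 1, MAX>::Sum(keys, total);   // peel the next iteration
    }
};

// Terminal specialization: recursion ends when COUNT reaches MAX
template <int MAX>
struct Unroll<MAX, MAX>
{
    static __device__ __forceinline__ void Sum(const int (&keys)[MAX], int &total) {}
};

// Inside a kernel, Unroll<0, 16>::Sum(thread_keys, total) expands to sixteen
// straight-line additions with no loop counter or branch overhead.
```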
