1.0.1 snap
- Updated radix sort test
- Updated dox

Former-commit-id: 41fb04c
dumerrill committed Aug 9, 2013
1 parent 1504496 commit 215f377
Showing 166 changed files with 400 additions and 286 deletions.
27 changes: 20 additions & 7 deletions CHANGE_LOG.TXT
@@ -1,10 +1,23 @@
//-----------------------------------------------------------------------------

-1.0.1 ...
+1.0.1 08/08/2013
+- New collective interface idiom (specialize::construct::invoke).
+- Added best-in-class DeviceRadixSort. Implements short-circuiting for homogeneous digit passes.
+- Added best-in-class DeviceScan. Implements single-pass "adaptive-lookback" strategy.
+- Significantly improved documentation (with example code snippets)
+- More extensive regression test suite for aggressively testing collective variants
+- Allow non-trivially-constructed types (previously unions had prevented aliasing temporary storage of those types)
+- Improved support for Kepler SHFL (collective ops now use SHFL for types larger than 32b)
+- Better code generation for 64-bit addressing within BlockLoad/BlockStore
+- DeviceHistogram now supports histograms with arbitrary numbers of bins
+- Misc. fixes
+- Workarounds for SM10 codegen issues in uncommonly-used WarpScan/Reduce specializations
+- Updates to accommodate CUDA 5.5 dynamic parallelism


//-----------------------------------------------------------------------------

0.9.4 05/07/2013

- Fixed compilation errors for SM10-SM13
- Fixed compilation errors for some WarpScan entrypoints on SM30+
@@ -26,7 +39,7 @@

//-----------------------------------------------------------------------------

0.9.2 04/04/2013

- Added WarpReduce. WarpReduce uses the SHFL instruction when applicable.
BlockReduce now uses this WarpReduce instead of implementing its own.
@@ -35,7 +48,7 @@

//-----------------------------------------------------------------------------

0.9.1 03/09/2013

- Fix for ambiguity in BlockScan::Reduce() between generic reduction and
summation. Summation entrypoints are now called ::Sum(), similar to the
@@ -44,9 +57,9 @@

//-----------------------------------------------------------------------------

0.9.0 03/07/2013

- Initial "preview" release. CUB is the first durable, high-performance library
of cooperative block-level, warp-level, and thread-level primitives for CUDA
kernel programming. More primitives and examples coming soon!
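
Both device-wide primitives named above follow CUB's two-phase temporary-storage idiom: a first call with a NULL d_temp_storage pointer only reports the required allocation size, and a second call does the work. A minimal host-side sketch for the new DeviceScan (buffer names and the omitted error handling are illustrative assumptions, not code from this commit):

```C++
#include <cub/cub.cuh>

// Exclusive prefix sum over num_items ints already resident on the device
void ScanExample(int *d_in, int *d_out, int num_items)
{
    void   *d_temp_storage     = NULL;
    size_t  temp_storage_bytes = 0;

    // First call: no scanning is performed; temp_storage_bytes is written
    // with the size of the temporary allocation the scan requires
    cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);

    cudaMalloc(&d_temp_storage, temp_storage_bytes);

    // Second call: runs the single-pass "adaptive-lookback" scan
    cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);

    cudaFree(d_temp_storage);
}
```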
68 changes: 39 additions & 29 deletions README.md
@@ -7,48 +7,58 @@
CUB is a library of cooperative threadblock primitives and other high performance
utilities for CUDA kernel programming. CUB enhances productivity, performance, and portability by
providing an abstraction layer over complex threadblock, warp, and thread-level operations.

-![SIMT abstraction layer](http://nvlabs.github.com/cub/simt_abstraction.png)
+![Orientation of collective primitives within the CUDA software stack](http://nvlabs.github.com/cub/cub_overview.png)

<br><hr>
-<h3>Recent news</h3>
+<h3>Releases</h3>

-| Date | Topic | Description |
-| ---- | ------- | ----------- |
-| 04/04/2013 | [CUB v0.9.2 Update Release](https://github.com/NVlabs/cub/archive/0.9.2.zip) | Minor cosmetic, feature, and compilation updates. See the [change-log](https://github.com/NVlabs/cub/blob/master/CHANGE_LOG.TXT) for further details. |
-| 03/06/2013 | [CUB v0.9 Preview Release](https://github.com/NVlabs/cub/archive/0.9.zip) | CUB is the first durable, high-performance library of cooperative threadblock, warp, and thread primitives for CUDA kernel programming. More primitives and examples coming soon! |
+See [CUB Project Website](http://nvlabs.github.com/cub) for more information.

+| Date | Version |
+| ---- | ------- |
+| 08/08/2013 | [CUB v1.0.1 Primary Release](https://github.com/NVlabs/cub/archive/1.0.1.zip) |
+| 05/07/2013 | [CUB v0.9.4 Update Release](https://github.com/NVlabs/cub/archive/0.9.4.zip) |
+| 04/04/2013 | [CUB v0.9.2 Update Release](https://github.com/NVlabs/cub/archive/0.9.2.zip) |
+| 03/09/2013 | [CUB v0.9.1 Update Release](https://github.com/NVlabs/cub/archive/0.9.1.zip) |
+| 03/07/2013 | [CUB v0.9.0 Preview Release](https://github.com/NVlabs/cub/archive/0.9.zip) |

<br><hr>
<h3>A Simple Example</h3>

```C++
#include <cub/cub.cuh>

-// An exclusive prefix sum CUDA kernel (for a single-threadblock grid)
-template <
-    int BLOCK_THREADS,              // Threads per threadblock
-    int ITEMS_PER_THREAD,           // Items per thread
-    typename T>                     // Data type
-__global__ void PrefixSumKernel(T *d_in, T *d_out)
+// Block-sorting CUDA kernel
+__global__ void BlockSortKernel(int *d_in, int *d_out)
{
-    using namespace cub;
-
-    // Parameterize BlockScan for the current execution context
-    typedef BlockScan<T, BLOCK_THREADS> BlockScan;
-
-    // The shared memory needed by BlockScan
-    __shared__ typename BlockScan::TempStorage temp_storage;
-
-    // A segment of data items per thread
-    T data[ITEMS_PER_THREAD];
-
-    // Load a tile of data using vector-load instructions
-    LoadBlockedVectorized(data, d_in, 0);
-
-    // Perform an exclusive prefix sum across the tile of data
-    BlockScan::ExclusiveSum(temp_storage, data, data);
-
-    // Store a tile of data using vector-store instructions
-    StoreBlockedVectorized(data, d_out, 0);
+    using namespace cub;
+
+    // Specialize BlockRadixSort, BlockLoad, and BlockStore for 128 threads
+    // owning 16 integer items each
+    typedef BlockRadixSort<int, 128, 16>                     BlockRadixSort;
+    typedef BlockLoad<int*, 128, 16, BLOCK_LOAD_TRANSPOSE>   BlockLoad;
+    typedef BlockStore<int*, 128, 16, BLOCK_STORE_TRANSPOSE> BlockStore;
+
+    // Allocate shared memory
+    __shared__ union {
+        typename BlockRadixSort::TempStorage  sort;
+        typename BlockLoad::TempStorage       load;
+        typename BlockStore::TempStorage      store;
+    } temp_storage;
+
+    int block_offset = blockIdx.x * (128 * 16);    // Offset for this block's segment
+
+    // Obtain a segment of 2048 consecutive keys that are blocked across threads
+    int thread_keys[16];
+    BlockLoad(temp_storage.load).Load(d_in + block_offset, thread_keys);
+    __syncthreads();
+
+    // Collectively sort the keys
+    BlockRadixSort(temp_storage.sort).Sort(thread_keys);
+    __syncthreads();
+
+    // Store the sorted segment
+    BlockStore(temp_storage.store).Store(d_out + block_offset, thread_keys);
}
```
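
A host-side sketch of how the new kernel might be launched (grid sizing, buffer names, and the multiple-of-2048 size are illustrative assumptions, not part of the README):

```C++
// Launch one 128-thread block per 2048-key tile. Assumes d_in/d_out are device
// allocations holding num_items ints and num_items is a multiple of 2048.
void BlockSortExample(int *d_in, int *d_out, int num_items)
{
    const int TILE_SIZE = 128 * 16;            // keys sorted per block
    int grid_size = num_items / TILE_SIZE;     // one block per tile
    BlockSortKernel<<<grid_size, 128>>>(d_in, d_out);
    cudaDeviceSynchronize();                   // wait, surfacing any runtime errors
}
```

Note that each block sorts only its own 2048-key segment; a single device-wide sort is what the new DeviceRadixSort primitive provides.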
@@ -28,7 +28,7 @@

/**
* \file
- * BlockRadixSortScatterTiles implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort downsweep.
+ * BlockRadixSortDownsweepTiles implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort downsweep.
*/


@@ -66,7 +66,7 @@ enum RadixSortScatterAlgorithm


/**
- * Tuning policy for BlockRadixSortScatterTiles
+ * Tuning policy for BlockRadixSortDownsweepTiles
*/
template <
int _BLOCK_THREADS, ///< The number of threads per CTA
Expand All @@ -79,7 +79,7 @@ template <
RadixSortScatterAlgorithm _SCATTER_ALGORITHM, ///< The scattering strategy to use
cudaSharedMemConfig _SMEM_CONFIG, ///< Shared memory bank mode (default: \p cudaSharedMemBankSizeFourByte)
int _RADIX_BITS> ///< The number of radix bits, i.e., log2(bins)
-struct BlockRadixSortScatterTilesPolicy
+struct BlockRadixSortDownsweepTilesPolicy
{
enum
{
Expand All @@ -97,7 +97,7 @@ struct BlockRadixSortScatterTilesPolicy
static const RadixSortScatterAlgorithm SCATTER_ALGORITHM = _SCATTER_ALGORITHM;
static const cudaSharedMemConfig SMEM_CONFIG = _SMEM_CONFIG;

-    typedef BlockRadixSortScatterTilesPolicy<
+    typedef BlockRadixSortDownsweepTilesPolicy<
BLOCK_THREADS,
ITEMS_PER_THREAD,
LOAD_ALGORITHM,
Expand All @@ -120,11 +120,11 @@ struct BlockRadixSortScatterTilesPolicy
* a range of input tiles.
*/
template <
-    typename BlockRadixSortScatterTilesPolicy,
+    typename BlockRadixSortDownsweepTilesPolicy,
typename Key,
typename Value,
typename SizeT>
-struct BlockRadixSortScatterTiles
+struct BlockRadixSortDownsweepTiles
{
//---------------------------------------------------------------------
// Type definitions and constants
Expand All @@ -136,19 +136,19 @@ struct BlockRadixSortScatterTiles
static const UnsignedBits MIN_KEY = Traits<Key>::MIN_KEY;
static const UnsignedBits MAX_KEY = Traits<Key>::MAX_KEY;

-    static const BlockLoadAlgorithm LOAD_ALGORITHM = BlockRadixSortScatterTilesPolicy::LOAD_ALGORITHM;
-    static const PtxLoadModifier LOAD_MODIFIER = BlockRadixSortScatterTilesPolicy::LOAD_MODIFIER;
-    static const BlockScanAlgorithm INNER_SCAN_ALGORITHM = BlockRadixSortScatterTilesPolicy::INNER_SCAN_ALGORITHM;
-    static const RadixSortScatterAlgorithm SCATTER_ALGORITHM = BlockRadixSortScatterTilesPolicy::SCATTER_ALGORITHM;
-    static const cudaSharedMemConfig SMEM_CONFIG = BlockRadixSortScatterTilesPolicy::SMEM_CONFIG;
+    static const BlockLoadAlgorithm LOAD_ALGORITHM = BlockRadixSortDownsweepTilesPolicy::LOAD_ALGORITHM;
+    static const PtxLoadModifier LOAD_MODIFIER = BlockRadixSortDownsweepTilesPolicy::LOAD_MODIFIER;
+    static const BlockScanAlgorithm INNER_SCAN_ALGORITHM = BlockRadixSortDownsweepTilesPolicy::INNER_SCAN_ALGORITHM;
+    static const RadixSortScatterAlgorithm SCATTER_ALGORITHM = BlockRadixSortDownsweepTilesPolicy::SCATTER_ALGORITHM;
+    static const cudaSharedMemConfig SMEM_CONFIG = BlockRadixSortDownsweepTilesPolicy::SMEM_CONFIG;

enum
{
-        BLOCK_THREADS = BlockRadixSortScatterTilesPolicy::BLOCK_THREADS,
-        ITEMS_PER_THREAD = BlockRadixSortScatterTilesPolicy::ITEMS_PER_THREAD,
-        EXCHANGE_TIME_SLICING = BlockRadixSortScatterTilesPolicy::EXCHANGE_TIME_SLICING,
-        RADIX_BITS = BlockRadixSortScatterTilesPolicy::RADIX_BITS,
-        MEMOIZE_OUTER_SCAN = BlockRadixSortScatterTilesPolicy::MEMOIZE_OUTER_SCAN,
+        BLOCK_THREADS = BlockRadixSortDownsweepTilesPolicy::BLOCK_THREADS,
+        ITEMS_PER_THREAD = BlockRadixSortDownsweepTilesPolicy::ITEMS_PER_THREAD,
+        EXCHANGE_TIME_SLICING = BlockRadixSortDownsweepTilesPolicy::EXCHANGE_TIME_SLICING,
+        RADIX_BITS = BlockRadixSortDownsweepTilesPolicy::RADIX_BITS,
+        MEMOIZE_OUTER_SCAN = BlockRadixSortDownsweepTilesPolicy::MEMOIZE_OUTER_SCAN,
TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,

RADIX_DIGITS = 1 << RADIX_BITS,
@@ -617,7 +617,7 @@ struct BlockRadixSortScatterTiles
/**
* Constructor
*/
-    __device__ __forceinline__ BlockRadixSortScatterTiles(
+    __device__ __forceinline__ BlockRadixSortDownsweepTiles(
TempStorage &temp_storage,
SizeT bin_offset,
Key *d_keys_in,
Expand All @@ -640,7 +640,7 @@ struct BlockRadixSortScatterTiles
/**
* Constructor
*/
-    __device__ __forceinline__ BlockRadixSortScatterTiles(
+    __device__ __forceinline__ BlockRadixSortDownsweepTiles(
TempStorage &temp_storage,
SizeT num_items,
SizeT *d_spine,
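
To make the tuning-policy pattern above concrete, here is a hypothetical specialization; parameter order and any parameters or enumerators not visible in this diff are assumptions for illustration:

```C++
// Hypothetical downsweep tuning policy: 64 threads, 17 keys per thread,
// 5 radix bits (2^5 = 32 bins). Enumerator choices are illustrative only.
typedef BlockRadixSortDownsweepTilesPolicy<
    64,                             // _BLOCK_THREADS: threads per CTA
    17,                             // _ITEMS_PER_THREAD: keys per thread per tile
    BLOCK_LOAD_DIRECT,              // _LOAD_ALGORITHM
    LOAD_DEFAULT,                   // _LOAD_MODIFIER
    false,                          // _EXCHANGE_TIME_SLICING (assumed parameter)
    false,                          // _MEMOIZE_OUTER_SCAN (assumed parameter)
    BLOCK_SCAN_WARP_SCANS,          // _INNER_SCAN_ALGORITHM
    RADIX_SORT_SCATTER_TWO_PHASE,   // _SCATTER_ALGORITHM (assumed enumerator)
    cudaSharedMemBankSizeFourByte,  // _SMEM_CONFIG
    5>                              // _RADIX_BITS: log2(bins)
    DownsweepPolicy;
```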
@@ -28,7 +28,7 @@

/**
* \file
- * BlockRadixSortHistoTiles implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort upsweep.
+ * BlockRadixSortUpsweepTiles implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort upsweep.
*/

#pragma once
Expand All @@ -49,14 +49,14 @@ namespace cub {
******************************************************************************/

/**
- * Tuning policy for BlockRadixSortHistoTiles
+ * Tuning policy for BlockRadixSortUpsweepTiles
*/
template <
int _BLOCK_THREADS, ///< The number of threads per CTA
int _ITEMS_PER_THREAD, ///< The number of items to load per thread per tile
PtxLoadModifier _LOAD_MODIFIER, ///< Load cache-modifier
int _RADIX_BITS> ///< The number of radix bits, i.e., log2(bins)
-struct BlockRadixSortHistoTilesPolicy
+struct BlockRadixSortUpsweepTilesPolicy
{
enum
{
Expand All @@ -68,7 +68,7 @@ struct BlockRadixSortHistoTilesPolicy

static const PtxLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER;

-    typedef BlockRadixSortHistoTilesPolicy<
+    typedef BlockRadixSortUpsweepTilesPolicy<
BLOCK_THREADS,
ITEMS_PER_THREAD,
LOAD_MODIFIER,
Expand All @@ -81,15 +81,15 @@ struct BlockRadixSortHistoTilesPolicy
******************************************************************************/

/**
- * \brief BlockRadixSortHistoTiles implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort upsweep.
+ * \brief BlockRadixSortUpsweepTiles implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort upsweep.
*
* Computes radix digit histograms over a range of input tiles.
*/
template <
-    typename BlockRadixSortHistoTilesPolicy,
+    typename BlockRadixSortUpsweepTilesPolicy,
typename Key,
typename SizeT>
-struct BlockRadixSortHistoTiles
+struct BlockRadixSortUpsweepTiles
{

//---------------------------------------------------------------------
Expand All @@ -104,13 +104,13 @@ struct BlockRadixSortHistoTiles
// Integer type for packing DigitCounters into columns of shared memory banks
typedef unsigned int PackedCounter;

-    static const PtxLoadModifier LOAD_MODIFIER = BlockRadixSortHistoTilesPolicy::LOAD_MODIFIER;
+    static const PtxLoadModifier LOAD_MODIFIER = BlockRadixSortUpsweepTilesPolicy::LOAD_MODIFIER;

enum
{
-        RADIX_BITS = BlockRadixSortHistoTilesPolicy::RADIX_BITS,
-        BLOCK_THREADS = BlockRadixSortHistoTilesPolicy::BLOCK_THREADS,
-        KEYS_PER_THREAD = BlockRadixSortHistoTilesPolicy::ITEMS_PER_THREAD,
+        RADIX_BITS = BlockRadixSortUpsweepTilesPolicy::RADIX_BITS,
+        BLOCK_THREADS = BlockRadixSortUpsweepTilesPolicy::BLOCK_THREADS,
+        KEYS_PER_THREAD = BlockRadixSortUpsweepTilesPolicy::ITEMS_PER_THREAD,

RADIX_DIGITS = 1 << RADIX_BITS,

@@ -192,7 +192,7 @@

// BucketKeys
static __device__ __forceinline__ void BucketKeys(
-        BlockRadixSortHistoTiles &cta,
+        BlockRadixSortUpsweepTiles &cta,
UnsignedBits keys[KEYS_PER_THREAD])
{
cta.Bucket(keys[COUNT]);
Expand All @@ -202,7 +202,7 @@ struct BlockRadixSortHistoTiles
}

// ProcessTiles
-    static __device__ __forceinline__ void ProcessTiles(BlockRadixSortHistoTiles &cta, SizeT block_offset)
+    static __device__ __forceinline__ void ProcessTiles(BlockRadixSortUpsweepTiles &cta, SizeT block_offset)
{
// Next
Iterate<1, HALF>::ProcessTiles(cta, block_offset);
Expand All @@ -215,10 +215,10 @@ struct BlockRadixSortHistoTiles
struct Iterate<MAX, MAX>
{
// BucketKeys
-        static __device__ __forceinline__ void BucketKeys(BlockRadixSortHistoTiles &cta, UnsignedBits keys[KEYS_PER_THREAD]) {}
+        static __device__ __forceinline__ void BucketKeys(BlockRadixSortUpsweepTiles &cta, UnsignedBits keys[KEYS_PER_THREAD]) {}

// ProcessTiles
-        static __device__ __forceinline__ void ProcessTiles(BlockRadixSortHistoTiles &cta, SizeT block_offset)
+        static __device__ __forceinline__ void ProcessTiles(BlockRadixSortUpsweepTiles &cta, SizeT block_offset)
{
cta.ProcessFullTile(block_offset);
}
@@ -392,7 +392,7 @@ struct BlockRadixSortHistoTiles
/**
* Constructor
*/
-    __device__ __forceinline__ BlockRadixSortHistoTiles(
+    __device__ __forceinline__ BlockRadixSortUpsweepTiles(
TempStorage &temp_storage,
Key *d_keys_in,
int current_bit)
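
The Iterate<COUNT, MAX> machinery in the upsweep excerpt above is a standard compile-time unrolling idiom: a recursive template peels one iteration per instantiation, and the Iterate<MAX, MAX> specialization terminates the recursion. A simplified standalone sketch of the same pattern (not CUB's actual code):

```C++
// Unroll a fixed-trip loop at compile time via recursive template specialization
template <int COUNT, int MAX>
struct Unroll
{
    static __device__ __forceinline__ void Sum(const int (&keys)[MAX], int &total)
    {
        total += keys[COUNT];                       // body for iteration COUNT
        Unroll<COUNT + 1, MAX>::Sum(keys, total);   // peel the next iteration
    }
};

// Terminal specialization: recursion ends when COUNT reaches MAX
template <int MAX>
struct Unroll<MAX, MAX>
{
    static __device__ __forceinline__ void Sum(const int (&keys)[MAX], int &total) {}
};

// Inside a kernel, Unroll<0, 16>::Sum(thread_keys, total) expands to sixteen
// straight-line additions with no loop counter or branch overhead.
```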
