From ce261ed64b89be0748e302592e092363d8b08565 Mon Sep 17 00:00:00 2001
From: dumerrill
Date: Mon, 23 Feb 2015 17:37:57 -0500
Subject: [PATCH] - Added sm52 scan tuning
 - Added fast in-core processing for very small sorting problems

Former-commit-id: 7702e4dd652a9cd43e93f168d3522dc9a06d1ebe
---
 cub/agent/agent_radix_sort_downsweep.cuh    |   4 +
 cub/agent/agent_scan.cuh                    |  39 --
 cub/block/block_radix_sort.cuh              |   8 +-
 cub/device/dispatch/dispatch_radix_sort.cuh | 428 ++++++++++++++------
 cub/device/dispatch/dispatch_scan.cuh       |  54 ++-
 test/test_device_scan.cu                    |  10 +-
 6 files changed, 346 insertions(+), 197 deletions(-)

diff --git a/cub/agent/agent_radix_sort_downsweep.cuh b/cub/agent/agent_radix_sort_downsweep.cuh
index 316ca955ed..425e25edea 100644
--- a/cub/agent/agent_radix_sort_downsweep.cuh
+++ b/cub/agent/agent_radix_sort_downsweep.cuh
@@ -571,6 +571,9 @@ struct AgentRadixSortDownsweep
         GatherScatterValues(values, relative_bin_offsets, ranks, block_offset, valid_items);
     }
 
+    //---------------------------------------------------------------------
+    // Copy shortcut
+    //---------------------------------------------------------------------
 
     /**
      * Copy tiles within the range of input
@@ -728,6 +731,7 @@ struct AgentRadixSortDownsweep
             }
         }
     }
+
 };


diff --git a/cub/agent/agent_scan.cuh b/cub/agent/agent_scan.cuh
index ba07574547..48edd7db00 100644
--- a/cub/agent/agent_scan.cuh
+++ b/cub/agent/agent_scan.cuh
@@ -367,7 +367,6 @@ struct AgentScan
         GridQueue<OffsetT>  queue,          ///< Queue descriptor for assigning tiles of work to thread blocks
         ScanTileState<T>    &tile_status)   ///< Global list of tile status
     {
-#if (CUB_PTX_ARCH <= 130)
         // Blocks are launched in increasing order, so just assign one tile per block
         int tile_idx = (blockIdx.y * gridDim.x) + blockIdx.x;   // Current tile index
@@ -378,44 +377,6 @@ struct AgentScan
             ConsumeTile<true>(num_items, num_remaining, tile_idx, block_offset, tile_status);
         else if (num_remaining > 0)
             ConsumeTile<false>(num_items, num_remaining, tile_idx, block_offset, tile_status);
-
-#else
-        // Blocks may not be launched in increasing order, so work-steal tiles
-
-        // Get first tile index
-        if (threadIdx.x == 0)
-            temp_storage.tile_idx = queue.Drain(1);
-
-        __syncthreads();
-
-        int tile_idx = temp_storage.tile_idx;
-        OffsetT block_offset = TILE_ITEMS * tile_idx;
-        OffsetT num_remaining = num_items - block_offset;
-
-        while (num_remaining >= TILE_ITEMS)
-        {
-            // Consume full tile
-            ConsumeTile<true>(num_items, num_remaining, tile_idx, block_offset, tile_status);
-
-            // Get next tile
-            if (threadIdx.x == 0)
-                temp_storage.tile_idx = queue.Drain(1);
-
-            __syncthreads();
-
-            tile_idx = temp_storage.tile_idx;
-            block_offset = TILE_ITEMS * tile_idx;
-            num_remaining = num_items - block_offset;
-        }
-
-        // Consume the last (and potentially partially-full) tile
-        if (num_remaining > 0)
-        {
-            ConsumeTile<false>(num_items, num_remaining, tile_idx, block_offset, tile_status);
-        }
-
-#endif
-
     }


diff --git a/cub/block/block_radix_sort.cuh b/cub/block/block_radix_sort.cuh
index 032f367838..4adab36820 100644
--- a/cub/block/block_radix_sort.cuh
+++ b/cub/block/block_radix_sort.cuh
@@ -332,6 +332,10 @@ private:
         }
     }
 
+public:
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
     /// Sort blocked -> striped arrangement
     template <int DESCENDING, int KEYS_ONLY>
     __device__ __forceinline__ void SortBlockedToStriped(
@@ -394,9 +398,7 @@ private:
         }
     }
 
-
-
-public:
+#endif // DOXYGEN_SHOULD_SKIP_THIS
 
     /// \smemstorage{BlockScan}
     struct TempStorage : Uninitialized<_TempStorage> {};
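The SortBlockedToStriped entry point exposed above (under the Doxygen guard) is the building block for the single-block sorting kernel introduced in the next file. For illustration only, here is a minimal sketch of the same idea using the long-standing public keys-only overload of cub::BlockRadixSort; the kernel name and the 128-thread, 4-items-per-thread tile shape are assumptions for this example, not part of the patch:

    #include <cub/block/block_radix_sort.cuh>

    // Illustrative only: one 128-thread block sorts a 512-key tile and leaves
    // it in striped order across threads.
    __global__ void ExampleSortTileKernel(int *d_in, int *d_out)
    {
        typedef cub::BlockRadixSort<int, 128, 4> BlockRadixSortT;
        __shared__ typename BlockRadixSortT::TempStorage temp_storage;

        // Blocked arrangement: each thread owns four consecutive keys
        int keys[4];
        for (int i = 0; i < 4; ++i)
            keys[i] = d_in[(threadIdx.x * 4) + i];

        // Collective sort; result is returned in striped arrangement
        BlockRadixSortT(temp_storage).SortBlockedToStriped(keys);

        // Striped store: item i of each thread lands at (i * 128) + threadIdx.x
        for (int i = 0; i < 4; ++i)
            d_out[(i * 128) + threadIdx.x] = keys[i];
    }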
diff --git a/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/device/dispatch/dispatch_radix_sort.cuh
index 7b227529ae..03c27aa2f7 100644
--- a/cub/device/dispatch/dispatch_radix_sort.cuh
+++ b/cub/device/dispatch/dispatch_radix_sort.cuh
@@ -40,7 +40,9 @@
 #include "../../agent/agent_radix_sort_upsweep.cuh"
 #include "../../agent/agent_radix_sort_downsweep.cuh"
 #include "../../agent/agent_scan.cuh"
+#include "../../block/block_radix_sort.cuh"
 #include "../../grid/grid_even_share.cuh"
+#include "../../util_type.cuh"
 #include "../../util_debug.cuh"
 #include "../../util_device.cuh"
 #include "../../util_namespace.cuh"
@@ -169,6 +171,120 @@ __global__ void DeviceRadixSortDownsweepKernel(
 }
 
 
+/**
+ * Single pass kernel entry point (single-block).  Fully sorts a tile of input.
+ */
+template <
+    typename    AgentRadixSortDownsweepPolicy,  ///< Parameterizable tuning policy type for the cub::AgentRadixSortDownsweep abstraction
+    bool        DESCENDING,                     ///< Whether or not the sorted-order is high-to-low
+    typename    KeyT,                           ///< Key type
+    typename    ValueT,                         ///< Value type
+    typename    OffsetT>                        ///< Signed integer type for global offsets
+__launch_bounds__ (int(AgentRadixSortDownsweepPolicy::BLOCK_THREADS))
+__global__ void DeviceRadixSortSingleKernel(
+    KeyT        *d_keys_in,     ///< [in] Input keys ping buffer
+    KeyT        *d_keys_out,    ///< [out] Output keys pong buffer
+    ValueT      *d_values_in,   ///< [in] Input values ping buffer
+    ValueT      *d_values_out,  ///< [out] Output values pong buffer
+    OffsetT     num_items,      ///< [in] Total number of input data items
+    int         current_bit,    ///< [in] Bit position of current radix digit
+    int         end_bit)        ///< [in] The past-the-end (most-significant) bit index needed for key comparison
+{
+    // Appropriate unsigned-bits representation of KeyT
+    typedef typename Traits<KeyT>::UnsignedBits UnsignedBits;
+
+    // Min and max key value
+    const UnsignedBits MIN_KEY = Traits<KeyT>::MIN_KEY;
+    const UnsignedBits MAX_KEY = Traits<KeyT>::MAX_KEY;
+
+    // Constants
+    enum
+    {
+        BLOCK_THREADS       = AgentRadixSortDownsweepPolicy::BLOCK_THREADS,
+        ITEMS_PER_THREAD    = AgentRadixSortDownsweepPolicy::ITEMS_PER_THREAD,
+        KEYS_ONLY           = Equals<ValueT, NullType>::VALUE,
+    };
+
+    // BlockRadixSort type
+    typedef BlockRadixSort<
+            KeyT,
+            BLOCK_THREADS,
+            ITEMS_PER_THREAD,
+            ValueT,
+            AgentRadixSortDownsweepPolicy::RADIX_BITS,
+            AgentRadixSortDownsweepPolicy::MEMOIZE_OUTER_SCAN,
+            AgentRadixSortDownsweepPolicy::INNER_SCAN_ALGORITHM,
+            AgentRadixSortDownsweepPolicy::SMEM_CONFIG>
+        BlockRadixSortT;
+
+    // BlockLoad type (keys)
+    typedef BlockLoad<
+        KeyT*,
+        BLOCK_THREADS,
+        ITEMS_PER_THREAD,
+        AgentRadixSortDownsweepPolicy::LOAD_ALGORITHM> BlockLoadKeys;
+
+    // BlockLoad type (values)
+    typedef BlockLoad<
+        ValueT*,
+        BLOCK_THREADS,
+        ITEMS_PER_THREAD,
+        AgentRadixSortDownsweepPolicy::LOAD_ALGORITHM> BlockLoadValues;
+
+    // Shared memory storage
+    __shared__ struct
+    {
+        typename BlockRadixSortT::TempStorage   sort;
+        typename BlockLoadKeys::TempStorage     load_keys;
+        typename BlockLoadValues::TempStorage   load_values;
+
+    } temp_storage;
+
+    // Keys and values for the block
+    KeyT    keys[ITEMS_PER_THREAD];
+    ValueT  values[ITEMS_PER_THREAD];
+
+    // Assign default (min/max) value to all keys
+    #pragma unroll
+    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+    {
+        keys[ITEM] = (DESCENDING) ? MIN_KEY : MAX_KEY;
+    }
+
+    // Load keys
+    BlockLoadKeys(temp_storage.load_keys).Load(d_keys_in, keys, num_items);
+
+    // Load values
+    if (!KEYS_ONLY)
+    {
+        BlockLoadValues(temp_storage.load_values).Load(d_values_in, values, num_items);
+    }
+
+    // Sort tile
+    BlockRadixSortT(temp_storage.sort).SortBlockedToStriped(
+        keys,
+        values,
+        current_bit,
+        end_bit,
+        Int2Type<DESCENDING>(),
+        Int2Type<KEYS_ONLY>());
+
+    // Store keys and values
+    #pragma unroll
+    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+    {
+        int item_offset = ITEM * BLOCK_THREADS + threadIdx.x;
+        if (item_offset < num_items)
+        {
+            d_keys_out[item_offset] = keys[ITEM];
+            if (!KEYS_ONLY)
+                d_values_out[item_offset] = values[ITEM];
+        }
+    }
+}
+
+
 /******************************************************************************
  * Dispatch
  ******************************************************************************/
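The Dispatch hunks later in this patch route inputs that fit in one tile to this kernel and skip temporary-storage allocation entirely. A condensed, self-contained sketch of that decision under assumed names (SmallSortShortcut and its parameters are illustrative, not the library's API):

    #include <cuda_runtime.h>

    // Condensed sketch (assumed names, not the library's code) of the new
    // small-problem shortcut: if the whole input fits in one tile, launch a
    // lone block to sort it in-core; otherwise report failure so the caller
    // falls through to the multi-pass upsweep/scan/downsweep pipeline.
    template <typename KeyT, typename SingleKernelPtr>
    cudaError_t SmallSortShortcut(
        SingleKernelPtr single_kernel,  // e.g. a DeviceRadixSortSingleKernel instance
        int             block_threads,  // SinglePolicy::BLOCK_THREADS
        int             tile_size,      // block_threads * SinglePolicy::ITEMS_PER_THREAD
        KeyT            *d_keys_in,
        KeyT            *d_keys_out,
        int             num_items,
        int             begin_bit,
        int             end_bit,
        cudaStream_t    stream)
    {
        if (num_items > tile_size)
            return cudaErrorInvalidValue;   // caller must take the multi-pass path

        single_kernel<<<1, block_threads, 0, stream>>>(
            d_keys_in, d_keys_out, num_items, begin_bit, end_bit);
        return cudaPeekAtLastError();
    }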
MIN_KEY : MAX_KEY; + } + + // Load keys + BlockLoadKeys(temp_storage.load_keys).Load(d_keys_in, keys, num_items); + + // Load values + if (!KEYS_ONLY) + { + BlockLoadValues(temp_storage.load_values).Load(d_values_in, values, num_items); + } + + // Sort tile + BlockRadixSortT(temp_storage.sort).SortBlockedToStriped( + keys, + values, + current_bit, + end_bit, + Int2Type(), + Int2Type()); + + // Store keys and values + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + int item_offset = ITEM * BLOCK_THREADS + threadIdx.x; + if (item_offset < num_items) + { + d_keys_out[item_offset] = keys[ITEM]; + if (!KEYS_ONLY) + d_values_out[item_offset] = values[ITEM]; + } + } +} + + /****************************************************************************** * Dispatch @@ -209,33 +325,7 @@ struct DispatchRadixSort PRIMARY_RADIX_BITS = 5, ALT_RADIX_BITS = PRIMARY_RADIX_BITS - 1, }; -/* - // Primary UpsweepPolicy (passes having digit-length RADIX_BITS) - typedef AgentRadixSortUpsweepPolicy <256, CUB_MAX(1, 16 / SCALE_FACTOR_4B), LOAD_DEFAULT, PRIMARY_RADIX_BITS> UpsweepPolicyKeys; - typedef AgentRadixSortUpsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR_4B), LOAD_LDG, PRIMARY_RADIX_BITS> UpsweepPolicyPairs; - typedef typename If::Type UpsweepPolicy; - - // Alternate UpsweepPolicy (passes having digit-length ALT_RADIX_BITS) - typedef AgentRadixSortUpsweepPolicy <256, CUB_MAX(1, 16 / SCALE_FACTOR_4B), LOAD_DEFAULT, ALT_RADIX_BITS> AltUpsweepPolicyKeys; - typedef AgentRadixSortUpsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR_4B), LOAD_LDG, ALT_RADIX_BITS> AltUpsweepPolicyPairs; - typedef typename If::Type AltUpsweepPolicy; - // ScanPolicy - typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; - - // Primary DownsweepPolicy - typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 16 / SCALE_FACTOR_4B), BLOCK_LOAD_DIRECT, LOAD_LDG, true, BLOCK_SCAN_RAKING_MEMOIZE, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, PRIMARY_RADIX_BITS> DownsweepPolicyKeys; - typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR_4B), BLOCK_LOAD_DIRECT, LOAD_LDG, true, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, PRIMARY_RADIX_BITS> DownsweepPolicyPairs; - - // Alternate DownsweepPolicy for ALT_RADIX_BITS-bit passes - typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 16 / SCALE_FACTOR_4B), BLOCK_LOAD_DIRECT, LOAD_LDG, true, BLOCK_SCAN_RAKING_MEMOIZE, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, ALT_RADIX_BITS> AltDownsweepPolicyKeys; - typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR_4B), BLOCK_LOAD_DIRECT, LOAD_LDG, true, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, ALT_RADIX_BITS> AltDownsweepPolicyPairs; - - typedef typename If::Type DownsweepPolicy; - typedef typename If::Type AltDownsweepPolicy; -*/ - - // ScanPolicy typedef AgentRadixSortUpsweepPolicy <256, CUB_MAX(1, 16 / SCALE_FACTOR_4B), LOAD_DEFAULT, PRIMARY_RADIX_BITS> UpsweepPolicy; typedef AgentRadixSortUpsweepPolicy <256, CUB_MAX(1, 16 / SCALE_FACTOR_4B), LOAD_DEFAULT, ALT_RADIX_BITS> AltUpsweepPolicy; @@ -244,6 +334,7 @@ struct DispatchRadixSort typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 16 / SCALE_FACTOR_4B), BLOCK_LOAD_DIRECT, LOAD_LDG, true, BLOCK_SCAN_RAKING_MEMOIZE, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, PRIMARY_RADIX_BITS> DownsweepPolicy; typedef 
@@ -244,6 +334,7 @@ struct DispatchRadixSort
         typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 16 / SCALE_FACTOR_4B), BLOCK_LOAD_DIRECT, LOAD_LDG, true, BLOCK_SCAN_RAKING_MEMOIZE, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, PRIMARY_RADIX_BITS> DownsweepPolicy;
         typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 16 / SCALE_FACTOR_4B), BLOCK_LOAD_DIRECT, LOAD_LDG, true, BLOCK_SCAN_RAKING_MEMOIZE, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, ALT_RADIX_BITS> AltDownsweepPolicy;
 
+        typedef DownsweepPolicy SinglePolicy;
     };
 
@@ -277,6 +368,8 @@ struct DispatchRadixSort
         typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 11 / SCALE_FACTOR_4B), BLOCK_LOAD_DIRECT, LOAD_LDG, true, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, ALT_RADIX_BITS> AltDownsweepPolicyKeys;
         typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR_4B), BLOCK_LOAD_DIRECT, LOAD_LDG, true, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, ALT_RADIX_BITS> AltDownsweepPolicyPairs;
         typedef typename If<KEYS_ONLY, AltDownsweepPolicyKeys, AltDownsweepPolicyPairs>::Type AltDownsweepPolicy;
+
+        typedef DownsweepPolicy SinglePolicy;
     };
 
@@ -310,6 +403,8 @@ struct DispatchRadixSort
         typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 14 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, ALT_RADIX_BITS> AltDownsweepPolicyKeys;
         typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 10 / SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, ALT_RADIX_BITS> AltDownsweepPolicyPairs;
         typedef typename If<KEYS_ONLY, AltDownsweepPolicyKeys, AltDownsweepPolicyPairs>::Type AltDownsweepPolicy;
+
+        typedef DownsweepPolicy SinglePolicy;
     };
 
@@ -343,6 +438,8 @@ struct DispatchRadixSort
         typedef AgentRadixSortDownsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, ALT_RADIX_BITS> AltDownsweepPolicyKeys;
         typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, ALT_RADIX_BITS> AltDownsweepPolicyPairs;
         typedef typename If<KEYS_ONLY, AltDownsweepPolicyKeys, AltDownsweepPolicyPairs>::Type AltDownsweepPolicy;
+
+        typedef DownsweepPolicy SinglePolicy;
     };
 
@@ -376,6 +473,8 @@ struct DispatchRadixSort
         typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, ALT_RADIX_BITS> AltDownsweepPolicyKeys;
         typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, ALT_RADIX_BITS> AltDownsweepPolicyPairs;
         typedef typename If<KEYS_ONLY, AltDownsweepPolicyKeys, AltDownsweepPolicyPairs>::Type AltDownsweepPolicy;
+
+        typedef DownsweepPolicy SinglePolicy;
     };
 
@@ -401,6 +500,8 @@ struct DispatchRadixSort
         // Alternate DownsweepPolicy for ALT_RADIX_BITS-bit passes
         typedef AgentRadixSortDownsweepPolicy <64, CUB_MAX(1, 9 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, ALT_RADIX_BITS> AltDownsweepPolicy;
+
+        typedef DownsweepPolicy SinglePolicy;
     };
 
@@ -434,6 +535,7 @@ struct DispatchRadixSort
     struct PtxScanPolicy :          PtxPolicy::ScanPolicy {};
     struct PtxDownsweepPolicy :     PtxPolicy::DownsweepPolicy {};
     struct PtxAltDownsweepPolicy :  PtxPolicy::AltDownsweepPolicy {};
+    struct PtxSinglePolicy :        PtxPolicy::SinglePolicy {};
 
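Each tuning policy above now carries a SinglePolicy whose tile shape bounds the problem size eligible for the in-core path. A worked example of the cutoff arithmetic; the 256-thread, 16-items-per-thread shape is assumed for illustration (the actual values come from the selected DownsweepPolicy):

    // Illustrative cutoff arithmetic for the single-block path (values assumed):
    enum
    {
        BLOCK_THREADS    = 256,                              // SinglePolicy::BLOCK_THREADS
        ITEMS_PER_THREAD = 16,                               // SinglePolicy::ITEMS_PER_THREAD
        TILE_SIZE        = BLOCK_THREADS * ITEMS_PER_THREAD, // 4096 keys: the largest
                                                             // problem sorted in-core
    };
    // Dispatch (below) takes the shortcut when num_items <= TILE_SIZE.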
@@ -447,8 +549,9 @@ struct DispatchRadixSort
         typename            Policy,
         typename            KernelConfig,
         typename            UpsweepKernelPtr,       ///< Function type of cub::DeviceRadixSortUpsweepKernel
-        typename            ScanKernelPtr,          ///< Function type of cub::SpineScanKernel
-        typename            DownsweepKernelPtr>     ///< Function type of cub::DeviceRadixSortUpsweepKernel
+        typename            ScanKernelPtr,          ///< Function type of cub::SpineScanKernel
+        typename            DownsweepKernelPtr,     ///< Function type of cub::DeviceRadixSortDownsweepKernel
+        typename            SingleKernelPtr>        ///< Function type of cub::DeviceRadixSortSingleKernel
     CUB_RUNTIME_FUNCTION __forceinline__
     static cudaError_t InitConfigs(
         int                 sm_version,
@@ -458,19 +561,22 @@ struct DispatchRadixSort
         KernelConfig        &scan_config,
         KernelConfig        &downsweep_config,
         KernelConfig        &alt_downsweep_config,
+        KernelConfig        &single_config,
         UpsweepKernelPtr    upsweep_kernel,
         UpsweepKernelPtr    alt_upsweep_kernel,
         ScanKernelPtr       scan_kernel,
         DownsweepKernelPtr  downsweep_kernel,
-        DownsweepKernelPtr  alt_downsweep_kernel)
+        DownsweepKernelPtr  alt_downsweep_kernel,
+        SingleKernelPtr     single_kernel)
     {
         cudaError_t error;
         do {
-            if (CubDebug(error = upsweep_config.template InitUpsweepPolicy<typename Policy::UpsweepPolicy>(             sm_version, sm_count, upsweep_kernel))) break;
-            if (CubDebug(error = alt_upsweep_config.template InitUpsweepPolicy<typename Policy::AltUpsweepPolicy>(      sm_version, sm_count, alt_upsweep_kernel))) break;
-            if (CubDebug(error = scan_config.template InitScanPolicy<typename Policy::ScanPolicy>(                      sm_version, sm_count, scan_kernel))) break;
-            if (CubDebug(error = downsweep_config.template InitDownsweepPolicy<typename Policy::DownsweepPolicy>(       sm_version, sm_count, downsweep_kernel))) break;
-            if (CubDebug(error = alt_downsweep_config.template InitDownsweepPolicy<typename Policy::AltDownsweepPolicy>(sm_version, sm_count, alt_downsweep_kernel))) break;
+            if (CubDebug(error = upsweep_config.template InitUpsweepPolicy<typename Policy::UpsweepPolicy>(             sm_version, sm_count, upsweep_kernel))) break;
+            if (CubDebug(error = alt_upsweep_config.template InitUpsweepPolicy<typename Policy::AltUpsweepPolicy>(      sm_version, sm_count, alt_upsweep_kernel))) break;
+            if (CubDebug(error = scan_config.template InitScanPolicy<typename Policy::ScanPolicy>(                      sm_version, sm_count, scan_kernel))) break;
+            if (CubDebug(error = downsweep_config.template InitDownsweepPolicy<typename Policy::DownsweepPolicy>(       sm_version, sm_count, downsweep_kernel))) break;
+            if (CubDebug(error = alt_downsweep_config.template InitDownsweepPolicy<typename Policy::AltDownsweepPolicy>(sm_version, sm_count, alt_downsweep_kernel))) break;
+            if (CubDebug(error = single_config.template InitSinglePolicy<typename Policy::SinglePolicy>(                sm_version, sm_count, single_kernel))) break;
         }
         while (0);
 
@@ -484,8 +590,9 @@ struct DispatchRadixSort
     template <
         typename            KernelConfig,
         typename            UpsweepKernelPtr,       ///< Function type of cub::DeviceRadixSortUpsweepKernel
-        typename            ScanKernelPtr,          ///< Function type of cub::SpineScanKernel
-        typename            DownsweepKernelPtr>     ///< Function type of cub::DeviceRadixSortUpsweepKernel
+        typename            ScanKernelPtr,          ///< Function type of cub::SpineScanKernel
+        typename            DownsweepKernelPtr,     ///< Function type of cub::DeviceRadixSortDownsweepKernel
+        typename            SingleKernelPtr>        ///< Function type of cub::DeviceRadixSortSingleKernel
     CUB_RUNTIME_FUNCTION __forceinline__
     static cudaError_t InitConfigs(
         int                 ptx_version,
@@ -496,11 +603,13 @@ struct DispatchRadixSort
         KernelConfig        &scan_config,
         KernelConfig        &downsweep_config,
         KernelConfig        &alt_downsweep_config,
+        KernelConfig        &single_config,
         UpsweepKernelPtr    upsweep_kernel,
         UpsweepKernelPtr    alt_upsweep_kernel,
         ScanKernelPtr       scan_kernel,
         DownsweepKernelPtr  downsweep_kernel,
-        DownsweepKernelPtr  alt_downsweep_kernel)
+        DownsweepKernelPtr  alt_downsweep_kernel,
+        SingleKernelPtr     single_kernel)
     {
     #if (CUB_PTX_ARCH > 0)
@@ -513,6 +622,7 @@ struct DispatchRadixSort
             if (CubDebug(error = scan_config.template InitScanPolicy<PtxScanPolicy>(                       sm_version, sm_count, scan_kernel))) break;
             if (CubDebug(error = downsweep_config.template InitDownsweepPolicy<PtxDownsweepPolicy>(        sm_version, sm_count, downsweep_kernel))) break;
             if (CubDebug(error = alt_downsweep_config.template InitDownsweepPolicy<PtxAltDownsweepPolicy>( sm_version, sm_count, alt_downsweep_kernel))) break;
+            if (CubDebug(error = single_config.template InitSinglePolicy<PtxSinglePolicy>(                 sm_version, sm_count, single_kernel))) break;
         }
         while (0);
 
@@ -524,27 +634,27 @@ struct DispatchRadixSort
         cudaError_t error;
         if (ptx_version >= 520)
         {
-            error = InitConfigs<Policy520>(sm_version, sm_count, upsweep_config, alt_upsweep_config, scan_config, downsweep_config, alt_downsweep_config, upsweep_kernel, alt_upsweep_kernel, scan_kernel, downsweep_kernel, alt_downsweep_kernel);
+            error = InitConfigs<Policy520>(sm_version, sm_count, upsweep_config, alt_upsweep_config, scan_config, downsweep_config, alt_downsweep_config, single_config, upsweep_kernel, alt_upsweep_kernel, scan_kernel, downsweep_kernel, alt_downsweep_kernel, single_kernel);
         }
         else if (ptx_version >= 350)
         {
-            error = InitConfigs<Policy350>(sm_version, sm_count, upsweep_config, alt_upsweep_config, scan_config, downsweep_config, alt_downsweep_config, upsweep_kernel, alt_upsweep_kernel, scan_kernel, downsweep_kernel, alt_downsweep_kernel);
+            error = InitConfigs<Policy350>(sm_version, sm_count, upsweep_config, alt_upsweep_config, scan_config, downsweep_config, alt_downsweep_config, single_config, upsweep_kernel, alt_upsweep_kernel, scan_kernel, downsweep_kernel, alt_downsweep_kernel, single_kernel);
         }
         else if (ptx_version >= 300)
        {
-            error = InitConfigs<Policy300>(sm_version, sm_count, upsweep_config, alt_upsweep_config, scan_config, downsweep_config, alt_downsweep_config, upsweep_kernel, alt_upsweep_kernel, scan_kernel, downsweep_kernel, alt_downsweep_kernel);
+            error = InitConfigs<Policy300>(sm_version, sm_count, upsweep_config, alt_upsweep_config, scan_config, downsweep_config, alt_downsweep_config, single_config, upsweep_kernel, alt_upsweep_kernel, scan_kernel, downsweep_kernel, alt_downsweep_kernel, single_kernel);
         }
         else if (ptx_version >= 200)
         {
-            error = InitConfigs<Policy200>(sm_version, sm_count, upsweep_config, alt_upsweep_config, scan_config, downsweep_config, alt_downsweep_config, upsweep_kernel, alt_upsweep_kernel, scan_kernel, downsweep_kernel, alt_downsweep_kernel);
+            error = InitConfigs<Policy200>(sm_version, sm_count, upsweep_config, alt_upsweep_config, scan_config, downsweep_config, alt_downsweep_config, single_config, upsweep_kernel, alt_upsweep_kernel, scan_kernel, downsweep_kernel, alt_downsweep_kernel, single_kernel);
         }
         else if (ptx_version >= 130)
         {
-            error = InitConfigs<Policy130>(sm_version, sm_count, upsweep_config, alt_upsweep_config, scan_config, downsweep_config, alt_downsweep_config, upsweep_kernel, alt_upsweep_kernel, scan_kernel, downsweep_kernel, alt_downsweep_kernel);
+            error = InitConfigs<Policy130>(sm_version, sm_count, upsweep_config, alt_upsweep_config, scan_config, downsweep_config, alt_downsweep_config, single_config, upsweep_kernel, alt_upsweep_kernel, scan_kernel, downsweep_kernel, alt_downsweep_kernel, single_kernel);
         }
         else
         {
-            error = InitConfigs<Policy100>(sm_version, sm_count, upsweep_config, alt_upsweep_config, scan_config, downsweep_config, alt_downsweep_config, upsweep_kernel, alt_upsweep_kernel, scan_kernel, downsweep_kernel, alt_downsweep_kernel);
+            error = InitConfigs<Policy100>(sm_version, sm_count, upsweep_config, alt_upsweep_config, scan_config, downsweep_config, alt_downsweep_config, single_config, upsweep_kernel, alt_upsweep_kernel, scan_kernel, downsweep_kernel, alt_downsweep_kernel, single_kernel);
         }
 
         return error;
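This ladder mirrors the compile-time selection of PtxPolicy above: device-side passes bind a policy statically, while the host walks the same ordering at run time. A freestanding sketch of the idiom (the policy shapes below are made up for illustration):

    #include <cstdio>

    // Sketch of CUB's version-ladder idiom: each PolicyNNN carries tuning
    // constants for one SM generation; dispatch picks the highest policy
    // whose version does not exceed the target's PTX version.
    struct Policy520 { enum { BLOCK_THREADS = 256, ITEMS_PER_THREAD = 16 }; };
    struct Policy350 { enum { BLOCK_THREADS = 256, ITEMS_PER_THREAD = 12 }; };
    struct Policy200 { enum { BLOCK_THREADS = 128, ITEMS_PER_THREAD = 8  }; };

    template <typename Policy>
    void PrintConfig()
    {
        std::printf("%d threads x %d items\n",
            int(Policy::BLOCK_THREADS), int(Policy::ITEMS_PER_THREAD));
    }

    void InitConfig(int ptx_version)
    {
        if      (ptx_version >= 520) PrintConfig<Policy520>();
        else if (ptx_version >= 350) PrintConfig<Policy350>();
        else                         PrintConfig<Policy200>();
    }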
@@ -562,7 +672,6 @@ struct DispatchRadixSort
         int                     block_threads;
         int                     items_per_thread;
         int                     tile_size;
-        cudaSharedMemConfig     smem_config;
         int                     radix_bits;
         int                     sm_occupancy;
         int                     max_grid_size;
@@ -605,7 +714,6 @@ struct DispatchRadixSort
             block_threads       = DownsweepPolicy::BLOCK_THREADS;
             items_per_thread    = DownsweepPolicy::ITEMS_PER_THREAD;
             radix_bits          = DownsweepPolicy::RADIX_BITS;
-            smem_config         = DownsweepPolicy::SMEM_CONFIG;
             tile_size           = block_threads * items_per_thread;
             cudaError_t retval  = MaxSmOccupancy(sm_occupancy, sm_version, downsweep_kernel, block_threads);
             subscription_factor = CUB_SUBSCRIPTION_FACTOR(sm_version);
@@ -613,6 +721,22 @@ struct DispatchRadixSort
             return retval;
         }
 
+
+        template <typename SinglePolicy, typename SingleKernelPtr>
+        CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t InitSinglePolicy(
+            int sm_version, int sm_count, SingleKernelPtr single_kernel)
+        {
+            block_threads       = SinglePolicy::BLOCK_THREADS;
+            items_per_thread    = SinglePolicy::ITEMS_PER_THREAD;
+            radix_bits          = SinglePolicy::RADIX_BITS;
+            tile_size           = block_threads * items_per_thread;
+            sm_occupancy        = 1;
+            subscription_factor = 1;
+            max_grid_size       = 1;
+
+            return cudaSuccess;
+        }
+
     };
 
@@ -638,7 +762,7 @@ struct DispatchRadixSort
         int                     spine_length,           ///< [in] Number of histogram counters
         OffsetT                 num_items,              ///< [in] Number of items to reduce
         int                     current_bit,            ///< [in] The beginning (least-significant) bit index needed for key comparison
-        int                     pass_bits,              ///< [in] The number of bits needed for key comparison (less than or equal to radix digit size for this pass)
+        int                     pass_bits,              ///< [in] The number of bits needed for key comparison (less than or equal to radix digit size for this pass)
         cudaStream_t            stream,                 ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
         bool                    debug_synchronous,      ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
         KernelConfig            &upsweep_config,        ///< [in] Dispatch parameters that match the policy that \p upsweep_kernel was compiled for
@@ -661,8 +785,8 @@ struct DispatchRadixSort
         {
             // Log upsweep_kernel configuration
             if (debug_synchronous)
-                CubLog("Invoking upsweep_kernel<<<%d, %d, 0, %lld>>>(), %d smem config, %d items per thread, %d SM occupancy, current bit %d, bit_grain %d\n",
-                    even_share.grid_size, upsweep_config.block_threads, (long long) stream, upsweep_config.smem_config, upsweep_config.items_per_thread, upsweep_config.sm_occupancy, current_bit, downsweep_config.radix_bits);
+                CubLog("Invoking upsweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit %d, bit_grain %d\n",
+                    even_share.grid_size, upsweep_config.block_threads, (long long) stream, upsweep_config.items_per_thread, upsweep_config.sm_occupancy, current_bit, downsweep_config.radix_bits);
 
             // Invoke upsweep_kernel with same grid size as downsweep_kernel
             upsweep_kernel<<<even_share.grid_size, upsweep_config.block_threads, 0, stream>>>(
@@ -695,8 +819,8 @@ struct DispatchRadixSort
             if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
 
             // Log downsweep_kernel configuration
-            if (debug_synchronous) CubLog("Invoking downsweep_kernel<<<%d, %d, 0, %lld>>>(), %d smem config, %d items per thread, %d SM occupancy\n",
-                even_share.grid_size, downsweep_config.block_threads, (long long) stream, downsweep_config.smem_config, downsweep_config.items_per_thread, downsweep_config.sm_occupancy);
+            if (debug_synchronous) CubLog("Invoking downsweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
+                even_share.grid_size, downsweep_config.block_threads, (long long) stream, downsweep_config.items_per_thread, downsweep_config.sm_occupancy);
 
             // Invoke downsweep_kernel
             downsweep_kernel<<<even_share.grid_size, downsweep_config.block_threads, 0, stream>>>(
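These CubLog lines fire only when a caller opts in. A usage sketch of the debug_synchronous convention through the public DeviceRadixSort front-end of this era (error handling and buffer setup reduced to the essentials):

    #include <cub/cub.cuh>

    // Sketch: the debug_synchronous flag (last argument) makes dispatch log
    // each kernel launch and sync the stream afterward so asynchronous
    // failures surface immediately.
    void SortWithLogging(unsigned int *d_key_buf, unsigned int *d_key_alt_buf,
                         int num_items, cudaStream_t stream)
    {
        cub::DoubleBuffer<unsigned int> d_keys(d_key_buf, d_key_alt_buf);

        // First call sizes the temporary storage; second call sorts
        void   *d_temp_storage     = NULL;
        size_t temp_storage_bytes  = 0;
        cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes,
            d_keys, num_items, 0, 32, stream, true);
        cudaMalloc(&d_temp_storage, temp_storage_bytes);
        cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes,
            d_keys, num_items, 0, 32, stream, true);
        cudaFree(d_temp_storage);
    }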
@@ -732,7 +856,8 @@ struct DispatchRadixSort
     template <
         typename            UpsweepKernelPtr,       ///< Function type of cub::DeviceRadixSortUpsweepKernel
         typename            ScanKernelPtr,          ///< Function type of cub::SpineScanKernel
-        typename            DownsweepKernelPtr>     ///< Function type of cub::DeviceRadixSortUpsweepKernel
+        typename            DownsweepKernelPtr,     ///< Function type of cub::DeviceRadixSortDownsweepKernel
+        typename            SingleKernelPtr>        ///< Function type of cub::DeviceRadixSortSingleKernel
     CUB_RUNTIME_FUNCTION __forceinline__
     static cudaError_t Dispatch(
         void                *d_temp_storage,        ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
@@ -747,8 +872,9 @@ struct DispatchRadixSort
         UpsweepKernelPtr    upsweep_kernel,         ///< [in] Kernel function pointer to parameterization of cub::DeviceRadixSortUpsweepKernel
         UpsweepKernelPtr    alt_upsweep_kernel,     ///< [in] Alternate kernel function pointer to parameterization of cub::DeviceRadixSortUpsweepKernel
         ScanKernelPtr       scan_kernel,            ///< [in] Kernel function pointer to parameterization of cub::SpineScanKernel
-        DownsweepKernelPtr  downsweep_kernel,       ///< [in] Kernel function pointer to parameterization of cub::DeviceRadixSortUpsweepKernel
-        DownsweepKernelPtr  alt_downsweep_kernel)   ///< [in] Alternate kernel function pointer to parameterization of cub::DeviceRadixSortUpsweepKernel
+        DownsweepKernelPtr  downsweep_kernel,       ///< [in] Kernel function pointer to parameterization of cub::DeviceRadixSortDownsweepKernel
+        DownsweepKernelPtr  alt_downsweep_kernel,   ///< [in] Alternate kernel function pointer to parameterization of cub::DeviceRadixSortDownsweepKernel
+        SingleKernelPtr     single_kernel)          ///< [in] Kernel function pointer to parameterization of cub::DeviceRadixSortSingleKernel
     {
 #ifndef CUB_RUNTIME_ENABLED
@@ -782,98 +908,96 @@ struct DispatchRadixSort
             if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
 
             // Initialize kernel dispatch configurations
-            KernelConfig upsweep_config, alt_upsweep_config, scan_config, downsweep_config, alt_downsweep_config;
+            KernelConfig upsweep_config, alt_upsweep_config, scan_config, downsweep_config, alt_downsweep_config, single_config;
             if (CubDebug(error = InitConfigs(ptx_version, sm_version, sm_count,
-                upsweep_config, alt_upsweep_config, scan_config, downsweep_config, alt_downsweep_config,
-                upsweep_kernel, alt_upsweep_kernel, scan_kernel, downsweep_kernel, alt_downsweep_kernel))) break;
+                upsweep_config, alt_upsweep_config, scan_config, downsweep_config, alt_downsweep_config, single_config,
+                upsweep_kernel, alt_upsweep_kernel, scan_kernel, downsweep_kernel, alt_downsweep_kernel, single_kernel))) break;
 
-            // Get maximum spine length (conservatively based upon the larger, primary digit size)
-            int max_grid_size = CUB_MAX(downsweep_config.max_grid_size, alt_downsweep_config.max_grid_size);
-            int spine_length = (max_grid_size * (1 << downsweep_config.radix_bits)) + scan_config.tile_size;
+            int num_passes;
 
-            // Temporary storage allocation requirements
-            void* allocations[3];
-            size_t allocation_sizes[3] =
+            if (num_items <= single_config.tile_size)
             {
-                spine_length * sizeof(OffsetT),                                 // bytes needed for privatized block digit histograms
-                (!ALT_STORAGE) ? 0 : num_items * sizeof(Key),                   // bytes needed for 3rd keys buffer
-                (!ALT_STORAGE || (KEYS_ONLY)) ? 0 : num_items * sizeof(Value),  // bytes needed for 3rd values buffer
-            };
+                // Sort entire problem locally within a single thread block
+                num_passes = 0;
+
+                // Log single_kernel configuration
+                if (debug_synchronous)
+                    CubLog("Invoking single_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit %d, bit_grain %d\n",
+                        1, single_config.block_threads, (long long) stream, single_config.items_per_thread, single_config.sm_occupancy, begin_bit, single_config.radix_bits);
+
+                // Invoke single_kernel
+                single_kernel<<<1, single_config.block_threads, 0, stream>>>(
+                    d_keys.Current(),
+                    (ALT_STORAGE) ? d_keys.Alternate() : d_keys.Current(),
+                    d_values.Current(),
+                    (ALT_STORAGE) ? d_values.Alternate() : d_values.Current(),
+                    num_items,
+                    begin_bit,
+                    end_bit);
+
+                // Check for failure to launch
+                if (CubDebug(error = cudaPeekAtLastError())) break;
+
+                // Sync the stream if specified to flush runtime errors
+                if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
 
-            // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob)
-            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
+            }
+            else
+            {
+                // Run multiple global digit-place passes
 
+                // Get maximum spine length (conservatively based upon the larger, primary digit size)
+                int max_grid_size = CUB_MAX(downsweep_config.max_grid_size, alt_downsweep_config.max_grid_size);
+                int spine_length = (max_grid_size * (1 << downsweep_config.radix_bits)) + scan_config.tile_size;
+
+                // Temporary storage allocation requirements
+                void* allocations[3];
+                size_t allocation_sizes[3] =
+                {
+                    spine_length * sizeof(OffsetT),                                 // bytes needed for privatized block digit histograms
+                    (!ALT_STORAGE) ? 0 : num_items * sizeof(Key),                   // bytes needed for 3rd keys buffer
+                    (!ALT_STORAGE || (KEYS_ONLY)) ? 0 : num_items * sizeof(Value),  // bytes needed for 3rd values buffer
+                };
 
-            // Return if the caller is simply requesting the size of the storage allocation
-            if (d_temp_storage == NULL)
-                return cudaSuccess;
+                // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob)
+                if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
 
-            // Alias the allocation for the privatized per-block digit histograms
-            OffsetT *d_spine;
-            d_spine = static_cast<OffsetT*>(allocations[0]);
+                // Return if the caller is simply requesting the size of the storage allocation
+                if (d_temp_storage == NULL)
+                    return cudaSuccess;
 
-            // Pass planning.  Run passes of the alternate digit-size configuration until we have an even multiple of our preferred digit size
-            int num_bits = end_bit - begin_bit;
-            int num_passes = (num_bits + downsweep_config.radix_bits - 1) / downsweep_config.radix_bits;
-            bool is_odd_passes = num_passes & 1;
+                // Alias the allocation for the privatized per-block digit histograms
+                OffsetT *d_spine;
+                d_spine = static_cast<OffsetT*>(allocations[0]);
 
-            int max_alt_passes = (num_passes * downsweep_config.radix_bits) - num_bits;
-            int alt_end_bit = CUB_MIN(end_bit, begin_bit + (max_alt_passes * alt_downsweep_config.radix_bits));
+                // Pass planning.  Run passes of the alternate digit-size configuration until we have an even multiple of our preferred digit size
+                int num_bits = end_bit - begin_bit;
+                num_passes = (num_bits + downsweep_config.radix_bits - 1) / downsweep_config.radix_bits;
+                bool is_odd_passes = num_passes & 1;
 
-            DoubleBuffer<Key> d_keys_remaining_passes(
-                (!ALT_STORAGE || is_odd_passes) ? d_keys.Alternate() : static_cast<Key*>(allocations[1]),
-                (!ALT_STORAGE) ? d_keys.Current() : (is_odd_passes) ? static_cast<Key*>(allocations[1]) : d_keys.Alternate());
+                int max_alt_passes = (num_passes * downsweep_config.radix_bits) - num_bits;
+                int alt_end_bit = CUB_MIN(end_bit, begin_bit + (max_alt_passes * alt_downsweep_config.radix_bits));
 
-            DoubleBuffer<Value> d_values_remaining_passes(
-                (!ALT_STORAGE || is_odd_passes) ? d_values.Alternate() : static_cast<Value*>(allocations[2]),
-                (!ALT_STORAGE) ? d_values.Current() : (is_odd_passes) ? static_cast<Value*>(allocations[2]) : d_values.Alternate());
+                DoubleBuffer<Key> d_keys_remaining_passes(
+                    (!ALT_STORAGE || is_odd_passes) ? d_keys.Alternate() : static_cast<Key*>(allocations[1]),
+                    (!ALT_STORAGE) ? d_keys.Current() : (is_odd_passes) ? static_cast<Key*>(allocations[1]) : d_keys.Alternate());
 
-            // Get even-share work distribution descriptors
-            GridEvenShare<OffsetT> even_share(num_items, downsweep_config.max_grid_size, CUB_MAX(downsweep_config.tile_size, upsweep_config.tile_size));
-            GridEvenShare<OffsetT> alt_even_share(num_items, alt_downsweep_config.max_grid_size, CUB_MAX(alt_downsweep_config.tile_size, alt_upsweep_config.tile_size));
+                DoubleBuffer<Value> d_values_remaining_passes(
+                    (!ALT_STORAGE || is_odd_passes) ? d_values.Alternate() : static_cast<Value*>(allocations[2]),
+                    (!ALT_STORAGE) ? d_values.Current() : (is_odd_passes) ? static_cast<Value*>(allocations[2]) : d_values.Alternate());
 
-            // Run first pass
-            int current_bit = begin_bit;
-            if (current_bit < alt_end_bit)
-            {
-                // Alternate digit-length pass
-                int pass_bits = CUB_MIN(alt_downsweep_config.radix_bits, (end_bit - current_bit));
-                DispatchPass(
-                    d_keys.Current(), d_keys_remaining_passes.Current(),
-                    d_values.Current(), d_values_remaining_passes.Current(),
-                    d_spine, spine_length, num_items, current_bit, pass_bits, stream, debug_synchronous,
-                    alt_upsweep_config, scan_config, alt_downsweep_config,
-                    alt_upsweep_kernel, scan_kernel, alt_downsweep_kernel,
-                    alt_even_share);
-
-                current_bit += alt_downsweep_config.radix_bits;
-            }
-            else
-            {
-                // Preferred digit-length pass
-                int pass_bits = CUB_MIN(downsweep_config.radix_bits, (end_bit - current_bit));
-                DispatchPass(
-                    d_keys.Current(), d_keys_remaining_passes.Current(),
-                    d_values.Current(), d_values_remaining_passes.Current(),
-                    d_spine, spine_length, num_items, current_bit, pass_bits, stream, debug_synchronous,
-                    upsweep_config, scan_config, downsweep_config,
-                    upsweep_kernel, scan_kernel, downsweep_kernel,
-                    even_share);
-
-                current_bit += downsweep_config.radix_bits;
-            }
+                // Get even-share work distribution descriptors
+                GridEvenShare<OffsetT> even_share(num_items, downsweep_config.max_grid_size, CUB_MAX(downsweep_config.tile_size, upsweep_config.tile_size));
+                GridEvenShare<OffsetT> alt_even_share(num_items, alt_downsweep_config.max_grid_size, CUB_MAX(alt_downsweep_config.tile_size, alt_upsweep_config.tile_size));
 
-            // Run remaining passes
-            while (current_bit < end_bit)
-            {
+                // Run first pass
+                int current_bit = begin_bit;
                 if (current_bit < alt_end_bit)
                 {
                     // Alternate digit-length pass
                     int pass_bits = CUB_MIN(alt_downsweep_config.radix_bits, (end_bit - current_bit));
                     DispatchPass(
-                        d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector],
-                        d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1],
-                        d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector],
-                        d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1],
+                        d_keys.Current(), d_keys_remaining_passes.Current(),
+                        d_values.Current(), d_values_remaining_passes.Current(),
                         d_spine, spine_length, num_items, current_bit, pass_bits, stream, debug_synchronous,
                         alt_upsweep_config, scan_config, alt_downsweep_config,
                         alt_upsweep_kernel, scan_kernel, alt_downsweep_kernel,
@@ -886,10 +1010,8 @@ struct DispatchRadixSort
                     // Preferred digit-length pass
                     int pass_bits = CUB_MIN(downsweep_config.radix_bits, (end_bit - current_bit));
                     DispatchPass(
-                        d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector],
-                        d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1],
-                        d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector],
-                        d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1],
+                        d_keys.Current(), d_keys_remaining_passes.Current(),
+                        d_values.Current(), d_values_remaining_passes.Current(),
                         d_spine, spine_length, num_items, current_bit, pass_bits, stream, debug_synchronous,
                         upsweep_config, scan_config, downsweep_config,
                         upsweep_kernel, scan_kernel, downsweep_kernel,
@@ -898,9 +1020,46 @@ struct DispatchRadixSort
                     current_bit += downsweep_config.radix_bits;
                 }
 
-            // Invert selectors and update current bit
-            d_keys_remaining_passes.selector ^= 1;
-            d_values_remaining_passes.selector ^= 1;
+                // Run remaining passes
+                while (current_bit < end_bit)
+                {
+                    if (current_bit < alt_end_bit)
+                    {
+                        // Alternate digit-length pass
+                        int pass_bits = CUB_MIN(alt_downsweep_config.radix_bits, (end_bit - current_bit));
+                        DispatchPass(
+                            d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector],
+                            d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1],
+                            d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector],
+                            d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1],
+                            d_spine, spine_length, num_items, current_bit, pass_bits, stream, debug_synchronous,
+                            alt_upsweep_config, scan_config, alt_downsweep_config,
+                            alt_upsweep_kernel, scan_kernel, alt_downsweep_kernel,
+                            alt_even_share);
+
+                        current_bit += alt_downsweep_config.radix_bits;
+                    }
+                    else
+                    {
+                        // Preferred digit-length pass
+                        int pass_bits = CUB_MIN(downsweep_config.radix_bits, (end_bit - current_bit));
+                        DispatchPass(
+                            d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector],
+                            d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1],
+                            d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector],
+                            d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1],
+                            d_spine, spine_length, num_items, current_bit, pass_bits, stream, debug_synchronous,
+                            upsweep_config, scan_config, downsweep_config,
+                            upsweep_kernel, scan_kernel, downsweep_kernel,
+                            even_share);
+
+                        current_bit += downsweep_config.radix_bits;
+                    }
+
+                    // Invert selectors and update current bit
+                    d_keys_remaining_passes.selector ^= 1;
+                    d_values_remaining_passes.selector ^= 1;
+                }
             }
 
             // Update selector
@@ -955,7 +1114,8 @@ struct DispatchRadixSort
             DeviceRadixSortUpsweepKernel<PtxAltUpsweepPolicy, DESCENDING, Key, OffsetT>,
             RadixSortScanBinsKernel<PtxScanPolicy, OffsetT>,
             DeviceRadixSortDownsweepKernel<PtxDownsweepPolicy, DESCENDING, Key, Value, OffsetT>,
-            DeviceRadixSortDownsweepKernel<PtxAltDownsweepPolicy, DESCENDING, Key, Value, OffsetT>);
+            DeviceRadixSortDownsweepKernel<PtxAltDownsweepPolicy, DESCENDING, Key, Value, OffsetT>,
+            DeviceRadixSortSingleKernel<PtxSinglePolicy, DESCENDING, Key, Value, OffsetT>);
     }
 };
 
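The pass-planning arithmetic in the Dispatch hunk above guarantees that alternate-digit passes absorb the remainder, so every pass consumes a full digit. A worked example with the SM52 configuration (5-bit primary digits, 4-bit alternate digits) over 32-bit keys:

    // Worked example of the pass plan (PRIMARY_RADIX_BITS = 5,
    // ALT_RADIX_BITS = 4, begin_bit = 0, end_bit = 32):
    enum
    {
        NUM_BITS       = 32,
        NUM_PASSES     = (NUM_BITS + 5 - 1) / 5,        // = 7 total passes
        MAX_ALT_PASSES = (NUM_PASSES * 5) - NUM_BITS,   // = 3 passes shrink to 4 bits
        ALT_END_BIT    = 0 + (MAX_ALT_PASSES * 4),      // = 12
    };
    // Bits [0,12) are covered by three 4-bit passes and bits [12,32) by four
    // 5-bit passes: 3*4 + 4*5 == 32, so no digit place is left partially used.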
diff --git a/cub/device/dispatch/dispatch_scan.cuh b/cub/device/dispatch/dispatch_scan.cuh
index a5cc8dc14e..7c96ae284e 100644
--- a/cub/device/dispatch/dispatch_scan.cuh
+++ b/cub/device/dispatch/dispatch_scan.cuh
@@ -149,6 +149,24 @@ struct DispatchScan
      * Tuning policies
      ******************************************************************************/
 
+    /// SM520
+    struct Policy520
+    {
+        enum {
+            NOMINAL_4B_ITEMS_PER_THREAD = 16,
+            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
+        };
+
+        // GTX980: 20.5B items/s @ 48M 32-bit T
+        typedef AgentScanPolicy<
+                256,
+                ITEMS_PER_THREAD,
+                BLOCK_LOAD_DIRECT, LOAD_LDG,
+                BLOCK_STORE_WARP_TRANSPOSE,
+                BLOCK_SCAN_RAKING_MEMOIZE>
+            RangeScanPolicy;
+    };
+
     /// SM35
     struct Policy350
     {
@@ -246,7 +264,10 @@ struct DispatchScan
      * Tuning policies of current PTX compiler pass
      ******************************************************************************/
 
-#if (CUB_PTX_ARCH >= 350)
+#if (CUB_PTX_ARCH >= 520)
+    typedef Policy520 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 350)
     typedef Policy350 PtxPolicy;
 
 #elif (CUB_PTX_ARCH >= 300)
@@ -288,7 +309,11 @@ struct DispatchScan
 #else
 
         // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
-        if (ptx_version >= 350)
+        if (ptx_version >= 520)
+        {
+            device_scan_sweep_config.template Init<typename Policy520::RangeScanPolicy>();
+        }
+        else if (ptx_version >= 350)
         {
             device_scan_sweep_config.template Init<typename Policy350::RangeScanPolicy>();
         }
@@ -446,26 +471,15 @@ struct DispatchScan
                 device_scan_sweep_kernel,
                 device_scan_sweep_config.block_threads))) break;
 
+            // Get max x-dimension of grid
+            int max_dim_x;
+            if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;
+
             // Get grid size for scanning tiles
             dim3 scan_grid_size;
-            if (ptx_version <= 130)
-            {
-                // Blocks are launched in order, so just assign one block per tile
-                int max_dim_x = 32 * 1024;
-                scan_grid_size.z = 1;
-                scan_grid_size.y = (num_tiles + max_dim_x - 1) / max_dim_x;
-                scan_grid_size.x = CUB_MIN(num_tiles, max_dim_x);
-            }
-            else
-            {
-                // Blocks may not be launched in order, so use atomics
-                int range_scan_occupancy = range_scan_sm_occupancy * sm_count;      // Whole-device occupancy for device_scan_sweep_kernel
-                scan_grid_size.z = 1;
-                scan_grid_size.y = 1;
-                scan_grid_size.x = (num_tiles < range_scan_occupancy) ?
-                    num_tiles :                 // Not enough to fill the device with threadblocks
-                    range_scan_occupancy;       // Fill the device with threadblocks
-            }
+            scan_grid_size.z = 1;
+            scan_grid_size.y = ((unsigned int) num_tiles + max_dim_x - 1) / max_dim_x;
+            scan_grid_size.x = CUB_MIN(num_tiles, max_dim_x);
 
             // Log device_scan_sweep_kernel configuration
             if (debug_synchronous) CubLog("Invoking device_scan_sweep_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
diff --git a/test/test_device_scan.cu b/test/test_device_scan.cu
index c236d5eae1..afb65f619a 100644
--- a/test/test_device_scan.cu
+++ b/test/test_device_scan.cu
@@ -54,6 +54,7 @@ using namespace cub;
 bool                    g_verbose           = false;
 int                     g_timing_iterations = 0;
 int                     g_repeat            = 0;
+double                  g_bandwidth_GBs;
 CachingDeviceAllocator  g_allocator(true);
 
 // Dispatch types
@@ -569,7 +570,7 @@ void Test(
         float avg_millis = elapsed_millis / g_timing_iterations;
         float grate = float(num_items) / avg_millis / 1000.0 / 1000.0;
         float gbandwidth = grate * sizeof(T) * 2;
-        printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s", avg_millis, grate, gbandwidth);
+        printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s, %.1f%% peak", avg_millis, grate, gbandwidth, gbandwidth / g_bandwidth_GBs * 100.0);
     }
 
     printf("\n\n");
@@ -829,6 +830,13 @@ int main(int argc, char** argv)
     int sm_version;
     CubDebugExit(SmVersion(sm_version, device_ordinal));
 
+    // Get GPU device bandwidth (GB/s)
+    int bus_width, mem_clock_khz;
+    CubDebugExit(cudaDeviceGetAttribute(&bus_width, cudaDevAttrGlobalMemoryBusWidth, device_ordinal));
+    CubDebugExit(cudaDeviceGetAttribute(&mem_clock_khz, cudaDevAttrMemoryClockRate, device_ordinal));
+    g_bandwidth_GBs = double(bus_width) * mem_clock_khz * 2 / 8 / 1000 / 1000;
+
+
 #ifdef QUICKER_TEST
 
     // Compile/run basic CUB test
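The new "% peak" column compares measured logical bandwidth against the theoretical device peak computed above: bus width (bits) over 8, times the memory clock (the attribute is reported in kHz), times 2 for double data rate. A freestanding sketch of the same computation:

    #include <cuda_runtime.h>
    #include <cstdio>

    // Sketch of the peak-bandwidth figure behind the "% peak" column:
    // bits/8 bytes per beat, kHz -> Hz, x2 for DDR, then bytes/s -> GB/s.
    int main()
    {
        int bus_width_bits = 0, mem_clock_khz = 0;
        cudaDeviceGetAttribute(&bus_width_bits, cudaDevAttrGlobalMemoryBusWidth, 0);
        cudaDeviceGetAttribute(&mem_clock_khz,  cudaDevAttrMemoryClockRate,      0);

        double peak_GBs = double(bus_width_bits) / 8.0   // bits -> bytes per beat
                        * mem_clock_khz * 1000.0         // kHz -> Hz
                        * 2.0                            // double data rate
                        / 1.0e9;                         // bytes/s -> GB/s

        // e.g. a GTX 980 (256-bit bus, 3505000 kHz) yields ~224 GB/s
        std::printf("theoretical peak: %.1f GB/s\n", peak_GBs);
        return 0;
    }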