Skip to content
This repository has been archived by the owner on Apr 3, 2024. It is now read-only.

Commit

Permalink
Update radix sort policies
Browse files Browse the repository at this point in the history
Former-commit-id: aca50f0a12128a6f8b2f15d83fcb26eed4a781ca
  • Loading branch information
dumerrill committed Feb 20, 2015
1 parent 0cb2174 commit 6330f93
Show file tree
Hide file tree
Showing 31 changed files with 579 additions and 706 deletions.
10 changes: 5 additions & 5 deletions .cproject
Original file line number Diff line number Diff line change
Expand Up @@ -20,16 +20,16 @@
<folderInfo id="cdt.managedbuild.toolchain.gnu.cygwin.base.1260156311.1722659113" name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.cygwin.base.1285519857" name="Cygwin GCC" superClass="cdt.managedbuild.toolchain.gnu.cygwin.base">
<targetPlatform archList="all" binaryParser="org.eclipse.cdt.core.Cygwin_PE" id="cdt.managedbuild.target.gnu.platform.cygwin.base.365256368" name="Debug Platform" osList="win32" superClass="cdt.managedbuild.target.gnu.platform.cygwin.base"/>
<builder buildPath="${workspace_loc:/PrivateCub}/Default" id="cdt.managedbuild.target.gnu.builder.cygwin.base.446293184" name="Gnu Make Builder.Default" superClass="cdt.managedbuild.target.gnu.builder.cygwin.base"/>
<builder buildPath="${workspace_loc:/PrivateCub}/Default" id="cdt.managedbuild.target.gnu.builder.cygwin.base.446293184" keepEnvironmentInBuildfile="false" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.cygwin.base"/>
<tool id="cdt.managedbuild.tool.gnu.assembler.cygwin.base.243775805" name="GCC Assembler" superClass="cdt.managedbuild.tool.gnu.assembler.cygwin.base">
<inputType id="cdt.managedbuild.tool.gnu.assembler.input.1808657577" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
</tool>
<tool id="cdt.managedbuild.tool.gnu.archiver.cygwin.base.880994059" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.cygwin.base"/>
<tool id="cdt.managedbuild.tool.gnu.cpp.compiler.cygwin.base.586941236" name="Cygwin C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.cygwin.base">
<option id="gnu.cpp.compiler.option.include.paths.1901125882" superClass="gnu.cpp.compiler.option.include.paths" valueType="includePath">
<option id="gnu.cpp.compiler.option.include.paths.1901125882" name="Include paths (-I)" superClass="gnu.cpp.compiler.option.include.paths" valueType="includePath">
<listOptionValue builtIn="false" value="&quot;${CUDA_PATH}/include&quot;"/>
</option>
<option id="gnu.cpp.compiler.option.preprocessor.def.690074796" superClass="gnu.cpp.compiler.option.preprocessor.def" valueType="definedSymbols">
<option id="gnu.cpp.compiler.option.preprocessor.def.690074796" name="Defined symbols (-D)" superClass="gnu.cpp.compiler.option.preprocessor.def" valueType="definedSymbols">
<listOptionValue builtIn="false" value="__device__"/>
<listOptionValue builtIn="false" value="__global__"/>
<listOptionValue builtIn="false" value="__shared__"/>
Expand All @@ -44,10 +44,10 @@
<inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.cygwin.1654082299" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input.cygwin"/>
</tool>
<tool id="cdt.managedbuild.tool.gnu.c.compiler.cygwin.base.1149397878" name="Cygwin C Compiler" superClass="cdt.managedbuild.tool.gnu.c.compiler.cygwin.base">
<option id="gnu.c.compiler.option.include.paths.18887661" superClass="gnu.c.compiler.option.include.paths" valueType="includePath">
<option id="gnu.c.compiler.option.include.paths.18887661" name="Include paths (-I)" superClass="gnu.c.compiler.option.include.paths" valueType="includePath">
<listOptionValue builtIn="false" value="&quot;${CUDA_PATH}/include&quot;"/>
</option>
<option id="gnu.c.compiler.option.preprocessor.def.symbols.788454999" superClass="gnu.c.compiler.option.preprocessor.def.symbols" valueType="definedSymbols">
<option id="gnu.c.compiler.option.preprocessor.def.symbols.788454999" name="Defined symbols (-D)" superClass="gnu.c.compiler.option.preprocessor.def.symbols" valueType="definedSymbols">
<listOptionValue builtIn="false" value="__device__"/>
<listOptionValue builtIn="false" value="__global__"/>
<listOptionValue builtIn="false" value="__shared__"/>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@

/**
* \file
* cub::BlockHistogramSweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide histogram across a range of tiles.
* cub::AgentHistogram implements a stateful abstraction of CUDA thread blocks for participating in device-wide histogram across a range of tiles.
*/

#pragma once
Expand Down Expand Up @@ -64,7 +64,7 @@ enum BlockHistogramMemoryPreference


/**
* Parameterizable tuning policy type for BlockHistogramSweep
* Parameterizable tuning policy type for AgentHistogram
*/
template <
int _BLOCK_THREADS, ///< Threads per thread block
Expand All @@ -74,7 +74,7 @@ template <
bool _RLE_COMPRESS, ///< Whether to perform localized RLE to compress samples before histogramming
BlockHistogramMemoryPreference _MEM_PREFERENCE, ///< Whether to prefer privatized shared-memory bins (versus privatized global-memory bins)
bool _WORK_STEALING> ///< Whether to dequeue tiles from a global work queue
struct BlockHistogramSweepPolicy
struct AgentHistogramPolicy
{
enum
{
Expand All @@ -95,10 +95,10 @@ struct BlockHistogramSweepPolicy
******************************************************************************/

/**
* \brief BlockHistogramSweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide histogram across a range of tiles.
* \brief AgentHistogram implements a stateful abstraction of CUDA thread blocks for participating in device-wide histogram across a range of tiles.
*/
template <
typename BlockHistogramSweepPolicyT, ///< Parameterized BlockHistogramSweepPolicy tuning policy type
typename AgentHistogramPolicyT, ///< Parameterized AgentHistogramPolicy tuning policy type
int PRIVATIZED_SMEM_BINS, ///< Number of privatized shared-memory histogram bins of any channel. Zero indicates privatized counters to be maintained in global memory.
int NUM_CHANNELS, ///< Number of channels interleaved in the input data. Supports up to four channels.
int NUM_ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed
Expand All @@ -108,7 +108,7 @@ template <
typename OutputDecodeOpT, ///< The transform operator type for determining output bin-ids from privatized counter indices, one for each channel
typename OffsetT, ///< Signed integer type for global offsets
int PTX_ARCH = CUB_PTX_ARCH> ///< PTX compute capability
struct BlockHistogramSweep
struct AgentHistogram
{
//---------------------------------------------------------------------
// Types and constants
Expand All @@ -123,25 +123,25 @@ struct BlockHistogramSweep
/// Constants
enum
{
BLOCK_THREADS = BlockHistogramSweepPolicyT::BLOCK_THREADS,
BLOCK_THREADS = AgentHistogramPolicyT::BLOCK_THREADS,

PIXELS_PER_THREAD = BlockHistogramSweepPolicyT::PIXELS_PER_THREAD,
PIXELS_PER_THREAD = AgentHistogramPolicyT::PIXELS_PER_THREAD,
TILE_PIXELS = PIXELS_PER_THREAD * BLOCK_THREADS,

SAMPLES_PER_THREAD = PIXELS_PER_THREAD * NUM_CHANNELS,
TILE_SAMPLES = SAMPLES_PER_THREAD * BLOCK_THREADS,

RLE_COMPRESS = BlockHistogramSweepPolicyT::RLE_COMPRESS,
RLE_COMPRESS = AgentHistogramPolicyT::RLE_COMPRESS,

MEM_PREFERENCE = (PRIVATIZED_SMEM_BINS > 0) ?
BlockHistogramSweepPolicyT::MEM_PREFERENCE :
AgentHistogramPolicyT::MEM_PREFERENCE :
GMEM,

WORK_STEALING = BlockHistogramSweepPolicyT::WORK_STEALING,
WORK_STEALING = AgentHistogramPolicyT::WORK_STEALING,
};

/// Cache load modifier for reading input elements
static const CacheLoadModifier LOAD_MODIFIER = BlockHistogramSweepPolicyT::LOAD_MODIFIER;
static const CacheLoadModifier LOAD_MODIFIER = AgentHistogramPolicyT::LOAD_MODIFIER;


/// Input iterator wrapper type (for applying cache modifier)
Expand All @@ -159,15 +159,15 @@ struct BlockHistogramSweep
WrappedSampleIteratorT,
BLOCK_THREADS,
SAMPLES_PER_THREAD,
BlockHistogramSweepPolicyT::LOAD_ALGORITHM>
AgentHistogramPolicyT::LOAD_ALGORITHM>
BlockLoadSampleT;

/// Parameterized BlockLoad type for pixels
typedef BlockLoad<
WrappedPixelIteratorT,
BLOCK_THREADS,
PIXELS_PER_THREAD,
BlockHistogramSweepPolicyT::LOAD_ALGORITHM>
AgentHistogramPolicyT::LOAD_ALGORITHM>
BlockLoadPixelT;


Expand Down Expand Up @@ -621,7 +621,7 @@ struct BlockHistogramSweep
/**
* Constructor
*/
__device__ __forceinline__ BlockHistogramSweep(
__device__ __forceinline__ AgentHistogram(
TempStorage &temp_storage, ///< Reference to temp_storage
SampleIteratorT d_samples, ///< Input data to reduce
int (&num_output_bins)[NUM_ACTIVE_CHANNELS], ///< The number bins per final output histogram
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@

/**
* \file
* BlockRadixSortDownsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort downsweep across a range of tiles.
* AgentRadixSortDownsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort downsweep across a range of tiles.
*/


Expand Down Expand Up @@ -65,26 +65,24 @@ enum RadixSortScatterAlgorithm


/**
* Parameterizable tuning policy type for BlockRadixSortDownsweep
* Parameterizable tuning policy type for AgentRadixSortDownsweep
*/
template <
int _BLOCK_THREADS, ///< Threads per thread block
int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input)
BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use
CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading keys (and values)
bool _EXCHANGE_TIME_SLICING, ///< Whether or not to time-slice key/value exchanges through shared memory to lower shared memory pressure
bool _MEMOIZE_OUTER_SCAN, ///< Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure. See BlockScanAlgorithm::BLOCK_SCAN_RAKING_MEMOIZE for more details.
BlockScanAlgorithm _INNER_SCAN_ALGORITHM, ///< The BlockScan algorithm algorithm to use
RadixSortScatterAlgorithm _SCATTER_ALGORITHM, ///< The scattering strategy to use
cudaSharedMemConfig _SMEM_CONFIG, ///< Shared memory bank mode
int _RADIX_BITS> ///< The number of radix bits, i.e., log2(bins)
struct BlockRadixSortDownsweepPolicy
struct AgentRadixSortDownsweepPolicy
{
enum
{
BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block
ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input)
EXCHANGE_TIME_SLICING = _EXCHANGE_TIME_SLICING, ///< Whether or not to time-slice key/value exchanges through shared memory to lower shared memory pressure
RADIX_BITS = _RADIX_BITS, ///< The number of radix bits, i.e., log2(bins)
MEMOIZE_OUTER_SCAN = _MEMOIZE_OUTER_SCAN, ///< Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure. See BlockScanAlgorithm::BLOCK_SCAN_RAKING_MEMOIZE for more details.
};
Expand All @@ -102,15 +100,15 @@ struct BlockRadixSortDownsweepPolicy
******************************************************************************/

/**
* \brief BlockRadixSortDownsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort downsweep across a range of tiles.
* \brief AgentRadixSortDownsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort downsweep across a range of tiles.
*/
template <
typename BlockRadixSortDownsweepPolicy, ///< Parameterized BlockRadixSortDownsweepPolicy tuning policy type
typename AgentRadixSortDownsweepPolicy, ///< Parameterized AgentRadixSortDownsweepPolicy tuning policy type
bool DESCENDING, ///< Whether or not the sorted-order is high-to-low
typename KeyT, ///< KeyT type
typename ValueT, ///< ValueT type
typename OffsetT> ///< Signed integer type for global offsets
struct BlockRadixSortDownsweep
struct AgentRadixSortDownsweep
{
//---------------------------------------------------------------------
// Type definitions and constants
Expand All @@ -122,19 +120,18 @@ struct BlockRadixSortDownsweep
static const UnsignedBits MIN_KEY = Traits<KeyT>::MIN_KEY;
static const UnsignedBits MAX_KEY = Traits<KeyT>::MAX_KEY;

static const BlockLoadAlgorithm LOAD_ALGORITHM = BlockRadixSortDownsweepPolicy::LOAD_ALGORITHM;
static const CacheLoadModifier LOAD_MODIFIER = BlockRadixSortDownsweepPolicy::LOAD_MODIFIER;
static const BlockScanAlgorithm INNER_SCAN_ALGORITHM = BlockRadixSortDownsweepPolicy::INNER_SCAN_ALGORITHM;
static const RadixSortScatterAlgorithm SCATTER_ALGORITHM = BlockRadixSortDownsweepPolicy::SCATTER_ALGORITHM;
static const cudaSharedMemConfig SMEM_CONFIG = BlockRadixSortDownsweepPolicy::SMEM_CONFIG;
static const BlockLoadAlgorithm LOAD_ALGORITHM = AgentRadixSortDownsweepPolicy::LOAD_ALGORITHM;
static const CacheLoadModifier LOAD_MODIFIER = AgentRadixSortDownsweepPolicy::LOAD_MODIFIER;
static const BlockScanAlgorithm INNER_SCAN_ALGORITHM = AgentRadixSortDownsweepPolicy::INNER_SCAN_ALGORITHM;
static const RadixSortScatterAlgorithm SCATTER_ALGORITHM = AgentRadixSortDownsweepPolicy::SCATTER_ALGORITHM;
static const cudaSharedMemConfig SMEM_CONFIG = AgentRadixSortDownsweepPolicy::SMEM_CONFIG;

enum
{
BLOCK_THREADS = BlockRadixSortDownsweepPolicy::BLOCK_THREADS,
ITEMS_PER_THREAD = BlockRadixSortDownsweepPolicy::ITEMS_PER_THREAD,
EXCHANGE_TIME_SLICING = BlockRadixSortDownsweepPolicy::EXCHANGE_TIME_SLICING,
RADIX_BITS = BlockRadixSortDownsweepPolicy::RADIX_BITS,
MEMOIZE_OUTER_SCAN = BlockRadixSortDownsweepPolicy::MEMOIZE_OUTER_SCAN,
BLOCK_THREADS = AgentRadixSortDownsweepPolicy::BLOCK_THREADS,
ITEMS_PER_THREAD = AgentRadixSortDownsweepPolicy::ITEMS_PER_THREAD,
RADIX_BITS = AgentRadixSortDownsweepPolicy::RADIX_BITS,
MEMOIZE_OUTER_SCAN = AgentRadixSortDownsweepPolicy::MEMOIZE_OUTER_SCAN,
TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,

RADIX_DIGITS = 1 << RADIX_BITS,
Expand Down Expand Up @@ -174,30 +171,26 @@ struct BlockRadixSortDownsweep
KeysItr,
BLOCK_THREADS,
ITEMS_PER_THREAD,
LOAD_ALGORITHM,
EXCHANGE_TIME_SLICING> BlockLoadKeys;
LOAD_ALGORITHM> BlockLoadKeys;

// BlockLoad type (values)
typedef BlockLoad<
ValuesItr,
BLOCK_THREADS,
ITEMS_PER_THREAD,
LOAD_ALGORITHM,
EXCHANGE_TIME_SLICING> BlockLoadValues;
LOAD_ALGORITHM> BlockLoadValues;

// BlockExchange type (keys)
typedef BlockExchange<
UnsignedBits,
BLOCK_THREADS,
ITEMS_PER_THREAD,
EXCHANGE_TIME_SLICING> BlockExchangeKeys;
ITEMS_PER_THREAD> BlockExchangeKeys;

// BlockExchange type (values)
typedef BlockExchange<
ValueT,
BLOCK_THREADS,
ITEMS_PER_THREAD,
EXCHANGE_TIME_SLICING> BlockExchangeValues;
ITEMS_PER_THREAD> BlockExchangeValues;


/**
Expand Down Expand Up @@ -636,7 +629,7 @@ struct BlockRadixSortDownsweep
/**
* Constructor
*/
__device__ __forceinline__ BlockRadixSortDownsweep(
__device__ __forceinline__ AgentRadixSortDownsweep(
TempStorage &temp_storage,
OffsetT bin_offset,
KeyT *d_keys_in,
Expand All @@ -661,7 +654,7 @@ struct BlockRadixSortDownsweep
/**
* Constructor
*/
__device__ __forceinline__ BlockRadixSortDownsweep(
__device__ __forceinline__ AgentRadixSortDownsweep(
TempStorage &temp_storage,
OffsetT num_items,
OffsetT *d_spine,
Expand Down
Loading

0 comments on commit 6330f93

Please sign in to comment.