diff --git a/cub/block_sweep/block_histogram_sweep.cuh b/cub/block_sweep/block_histogram_sweep.cuh
index 69263e78f4..219b865a1c 100644
--- a/cub/block_sweep/block_histogram_sweep.cuh
+++ b/cub/block_sweep/block_histogram_sweep.cuh
@@ -221,6 +221,9 @@ struct BlockHistogramSweep
     /// The transform operator for determining privatized counter indices from samples, one for each channel
     PrivatizedDecodeOpT (&privatized_decode_op)[NUM_ACTIVE_CHANNELS];
 
+    /// Whether to prefer privatized smem counters vs privatized global counters
+    bool prefer_smem;
+
 
     //---------------------------------------------------------------------
     // Initialize privatized bin counters
@@ -614,7 +617,6 @@ struct BlockHistogramSweep
     // Interface
     //---------------------------------------------------------------------
 
-    bool prefer_smem;
 
     /**
      * Constructor
diff --git a/cub/block_sweep/block_spmv_sweep.cuh b/cub/block_sweep/block_spmv_sweep.cuh
index ac732f0b44..e4503ff82d 100644
--- a/cub/block_sweep/block_spmv_sweep.cuh
+++ b/cub/block_sweep/block_spmv_sweep.cuh
@@ -158,437 +158,37 @@ struct BlockSpmvSweep
     /// Reference to temp_storage
     _TempStorage &temp_storage;
 
-    /// Sample input iterator (with cache modifier applied, if possible)
-    WrappedSampleIteratorT d_wrapped_samples;
-
-    /// Native pointer for input samples (possibly NULL if unavailable)
-    SampleT* d_native_samples;
-
-    /// The number of output bins for each channel
-    int (&num_output_bins)[NUM_ACTIVE_CHANNELS];
-
-    /// The number of privatized bins for each channel
-    int (&num_privatized_bins)[NUM_ACTIVE_CHANNELS];
-
-    /// Reference to gmem privatized histograms for each channel
-    CounterT* d_privatized_histograms[NUM_ACTIVE_CHANNELS];
-
-    /// Reference to final output histograms (gmem)
-    CounterT* (&d_output_histograms)[NUM_ACTIVE_CHANNELS];
-
-    /// The transform operator for determining output bin-ids from privatized counter indices, one for each channel
-    OutputDecodeOpT (&output_decode_op)[NUM_ACTIVE_CHANNELS];
-
-    /// The transform operator for determining privatized counter indices from samples, one for each channel
-    PrivatizedDecodeOpT (&privatized_decode_op)[NUM_ACTIVE_CHANNELS];
-
-
-    //---------------------------------------------------------------------
-    // Initialize privatized bin counters
-    //---------------------------------------------------------------------
-
-    // Initialize privatized bin counters
-    __device__ __forceinline__ void InitBinCounters(CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS])
-    {
-        // Initialize histogram bin counts to zeros
-        #pragma unroll
-        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
-        {
-            for (int privatized_bin = threadIdx.x; privatized_bin < num_privatized_bins[CHANNEL]; privatized_bin += BLOCK_THREADS)
-            {
-                privatized_histograms[CHANNEL][privatized_bin] = 0;
-            }
-        }
-
-        // Barrier to make sure all threads are done updating counters
-        __syncthreads();
-    }
-
-
-    // Initialize privatized bin counters.  Specialized for privatized shared-memory counters
-    __device__ __forceinline__ void InitSmemBinCounters()
-    {
-        CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS];
-
-        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
-            privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL];
-
-        InitBinCounters(privatized_histograms);
-    }
-
-
-    // Initialize privatized bin counters.  Specialized for privatized global-memory counters
-    __device__ __forceinline__ void InitGmemBinCounters()
-    {
-        InitBinCounters(d_privatized_histograms);
-    }
-
-
-    //---------------------------------------------------------------------
-    // Update final output histograms
-    //---------------------------------------------------------------------
-
-    // Update final output histograms from privatized histograms
-    __device__ __forceinline__ void StoreOutput(CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS])
-    {
-        // Barrier to make sure all threads are done updating counters
-        __syncthreads();
-
-        // Apply privatized bin counts to output bin counts
-        #pragma unroll
-        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
-        {
-            int channel_bins = num_privatized_bins[CHANNEL];
-            for (int privatized_bin = threadIdx.x;
-                    privatized_bin < channel_bins;
-                    privatized_bin += BLOCK_THREADS)
-            {
-                int         output_bin;
-                CounterT    count       = privatized_histograms[CHANNEL][privatized_bin];
-                bool        is_valid    = count > 0;
-                output_decode_op[CHANNEL].BinSelect((SampleT) privatized_bin, output_bin, is_valid);
-
-                if (output_bin >= 0)
-                {
-                    atomicAdd(&d_output_histograms[CHANNEL][output_bin], count);
-                }
-
-            }
-        }
-    }
-
-
-    // Update final output histograms from privatized histograms.  Specialized for privatized shared-memory counters
-    __device__ __forceinline__ void StoreSmemOutput()
-    {
-        CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS];
-        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
-            privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL];
-
-        StoreOutput(privatized_histograms);
-    }
-
-
-    // Update final output histograms from privatized histograms.  Specialized for privatized global-memory counters
-    __device__ __forceinline__ void StoreGmemOutput()
-    {
-        StoreOutput(d_privatized_histograms);
-    }
-
-
-    //---------------------------------------------------------------------
-    // Accumulate privatized histograms
-    //---------------------------------------------------------------------
-
-    // Accumulate pixels.  Specialized for RLE compression.
-    __device__ __forceinline__ void AccumulatePixels(
-        SampleT         samples[PIXELS_PER_THREAD][NUM_CHANNELS],
-        bool            is_valid[PIXELS_PER_THREAD],
-        CounterT*       privatized_histograms[NUM_ACTIVE_CHANNELS],
-        Int2Type<true>  is_rle_compress)
-    {
-
-        #pragma unroll
-        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
-        {
-            // Bin pixels
-            int bins[PIXELS_PER_THREAD];
-
-            #pragma unroll
-            for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL)
-                privatized_decode_op[CHANNEL].BinSelect(samples[PIXEL][CHANNEL], bins[PIXEL], is_valid[PIXEL]);
-
-            CounterT accumulator = 1;
-
-            #pragma unroll
-            for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD - 1; ++PIXEL)
-            {
-
-                if (bins[PIXEL] < 0)
-                {
-                    accumulator = 1;
-                }
-                else if (bins[PIXEL] == bins[PIXEL + 1])
-                {
-                    accumulator++;
-                }
-                else
-                {
-                    atomicAdd(privatized_histograms[CHANNEL] + bins[PIXEL], accumulator);
-                    accumulator = 1;
-                }
-            }
-
-            // Last pixel
-            if (bins[PIXELS_PER_THREAD - 1] >= 0)
-                atomicAdd(privatized_histograms[CHANNEL] + bins[PIXELS_PER_THREAD - 1], accumulator);
-
-        }
-    }
-
-
-    // Accumulate pixels.  Specialized for individual accumulation of each pixel.
-    __device__ __forceinline__ void AccumulatePixels(
-        SampleT         samples[PIXELS_PER_THREAD][NUM_CHANNELS],
-        bool            is_valid[PIXELS_PER_THREAD],
-        CounterT*       privatized_histograms[NUM_ACTIVE_CHANNELS],
-        Int2Type<false> is_rle_compress)
-    {
-        #pragma unroll
-        for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL)
-        {
-            #pragma unroll
-            for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
-            {
-                int bin;
-                privatized_decode_op[CHANNEL].BinSelect(samples[PIXEL][CHANNEL], bin, is_valid[PIXEL]);
-                if (bin >= 0)
-                    atomicAdd(privatized_histograms[CHANNEL] + bin, 1);
-            }
-        }
-    }
-
-
-    /**
-     * Accumulate pixel, specialized for smem privatized histogram
-     */
-    __device__ __forceinline__ void AccumulateSmemPixels(
-        SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS],
-        bool    is_valid[PIXELS_PER_THREAD])
-    {
-        CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS];
-
-        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
-            privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL];
-
-        AccumulatePixels(samples, is_valid, privatized_histograms, Int2Type<IS_RLE_COMPRESS>());
-    }
-
-
-    /**
-     * Accumulate pixel, specialized for gmem privatized histogram
-     */
-    __device__ __forceinline__ void AccumulateGmemPixels(
-        SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS],
-        bool    is_valid[PIXELS_PER_THREAD])
-    {
-        AccumulatePixels(samples, is_valid, d_privatized_histograms, Int2Type<IS_RLE_COMPRESS>());
-    }
-
-
-
-    //---------------------------------------------------------------------
-    // Tile processing
-    //---------------------------------------------------------------------
-
-    // Load full, aligned tile using pixel iterator
-    __device__ __forceinline__ void LoadTile(
-        OffsetT         block_offset,
-        int             valid_samples,
-        SampleT         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
-        Int2Type<true>  is_full_tile,
-        Int2Type<true>  is_aligned)
-    {
-        typedef PixelT AliasedPixels[PIXELS_PER_THREAD];
-
-        WrappedPixelIteratorT d_wrapped_pixels((PixelT*) (d_native_samples + block_offset));
-
-        // Load using a wrapped pixel iterator
-        BlockLoadPixelT(temp_storage.pixel_load).Load(
-            d_wrapped_pixels,
-            reinterpret_cast<AliasedPixels&>(samples));
-    }
-
-    // Load full, mis-aligned tile using sample iterator
-    __device__ __forceinline__ void LoadTile(
-        OffsetT         block_offset,
-        int             valid_samples,
-        SampleT         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
-        Int2Type<true>  is_full_tile,
-        Int2Type<false> is_aligned)
-    {
-        typedef SampleT AliasedSamples[SAMPLES_PER_THREAD];
-
-        // Load using sample iterator
-        BlockLoadSampleT(temp_storage.sample_load).Load(
-            d_wrapped_samples + block_offset,
-            reinterpret_cast<AliasedSamples&>(samples));
-    }
-
-    // Load partially-full, aligned tile using pixel iterator
-    __device__ __forceinline__ void LoadTile(
-        OffsetT         block_offset,
-        int             valid_samples,
-        SampleT         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
-        Int2Type<false> is_full_tile,
-        Int2Type<true>  is_aligned)
-    {
-        typedef PixelT AliasedPixels[PIXELS_PER_THREAD];
-
-        WrappedPixelIteratorT d_wrapped_pixels((PixelT*) (d_native_samples + block_offset));
-
-        int valid_pixels = valid_samples / NUM_CHANNELS;
-
-        // Load using a wrapped pixel iterator
-        BlockLoadPixelT(temp_storage.pixel_load).Load(
-            d_wrapped_pixels,
-            reinterpret_cast<AliasedPixels&>(samples),
-            valid_pixels);
-
-    }
-
-    // Load partially-full, mis-aligned tile using sample iterator
-    __device__ __forceinline__ void LoadTile(
-        OffsetT         block_offset,
-        int             valid_samples,
-        SampleT         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
-        Int2Type<false> is_full_tile,
-        Int2Type<false> is_aligned)
-    {
-        typedef SampleT AliasedSamples[SAMPLES_PER_THREAD];
-
-        BlockLoadSampleT(temp_storage.sample_load).Load(
-            d_wrapped_samples + block_offset,
-            reinterpret_cast<AliasedSamples&>(samples),
-            valid_samples);
-    }
-
-    // Consume a tile of data samples
-    template <
-        bool IS_ALIGNED,        // Whether the tile offset is aligned (quad-aligned for single-channel, pixel-aligned for multi-channel)
-        bool IS_FULL_TILE>      // Whether the tile is full
-    __device__ __forceinline__ void ConsumeTile(OffsetT block_offset, int valid_samples)
-    {
-        SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS];
-        bool is_valid[PIXELS_PER_THREAD];
-
-        // Load tile
-        LoadTile(
-            block_offset,
-            valid_samples,
-            samples,
-            Int2Type<IS_FULL_TILE>(),
-            Int2Type<IS_ALIGNED>());
-
-        // Set valid flags
-        #pragma unroll
-        for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL)
-            is_valid[PIXEL] = IS_FULL_TILE || (((threadIdx.x * PIXELS_PER_THREAD + PIXEL) * NUM_CHANNELS) < valid_samples);
-
-        // Accumulate samples
-        if (prefer_smem)
-            AccumulateSmemPixels(samples, is_valid);
-        else
-            AccumulateGmemPixels(samples, is_valid);
-
-    }
-
-
-    // Consume row tiles.  Specialized for work-stealing from queue
-    template <bool IS_ALIGNED>
-    __device__ __forceinline__ void ConsumeTiles(
-        OffsetT             row_offset,
-        OffsetT             row_end,
-        int                 tiles_per_row,
-        GridQueue<OffsetT>  tile_queue,
-        Int2Type<true>      is_work_stealing)
-    {
-        OffsetT tile_offset     = blockIdx.x * TILE_SAMPLES;
-        OffsetT num_remaining   = row_end - tile_offset;
-        OffsetT even_share_base = gridDim.x * TILE_SAMPLES;
-
-        while (num_remaining >= TILE_SAMPLES)
-        {
-            // Consume full tile
-            ConsumeTile<IS_ALIGNED, true>(tile_offset, TILE_SAMPLES);
-
-            __syncthreads();
-
-            // Get next tile
-            if (threadIdx.x == 0)
-                temp_storage.tile_offset = tile_queue.Drain(TILE_SAMPLES) + even_share_base;
-
-            __syncthreads();
-
-            tile_offset     = temp_storage.tile_offset;
-            num_remaining   = row_end - tile_offset;
-
-        }
-
-        if (num_remaining > 0)
-        {
-            // Consume the last (and potentially partially-full) tile
-            ConsumeTile<IS_ALIGNED, false>(tile_offset, num_remaining);
-        }
-    }
-
-
-    // Consume row tiles.  Specialized for even-share (striped across thread blocks)
-    template <bool IS_ALIGNED>
-    __device__ __forceinline__ void ConsumeTiles(
-        OffsetT             row_offset,
-        OffsetT             row_end,
-        int                 tiles_per_row,
-        GridQueue<OffsetT>  tile_queue,
-        Int2Type<false>     is_work_stealing)
-    {
-        OffsetT tile_offset = row_offset + (blockIdx.x * TILE_SAMPLES);
-        while (tile_offset + TILE_SAMPLES <= row_end)
-        {
-            ConsumeTile<IS_ALIGNED, true>(tile_offset, TILE_SAMPLES);
-            tile_offset += gridDim.x * TILE_SAMPLES;
-        }
-
-        if (tile_offset < row_end)
-        {
-            int valid_samples = row_end - tile_offset;
-            ConsumeTile<IS_ALIGNED, false>(tile_offset, valid_samples);
-        }
-    }
-
-
-    //---------------------------------------------------------------------
-    // Parameter extraction
-    //---------------------------------------------------------------------
-
-    // Return a native pixel pointer (specialized for CacheModifiedInputIterator types)
-    template <
-        CacheLoadModifier   _MODIFIER,
-        typename            _ValueT,
-        typename            _OffsetT>
-    __device__ __forceinline__ SampleT* NativePointer(CacheModifiedInputIterator<_MODIFIER, _ValueT, _OffsetT> itr)
-    {
-        return itr.ptr;
-    }
-
-    // Return a native pixel pointer (specialized for other types)
-    template <typename IteratorT>
-    __device__ __forceinline__ SampleT* NativePointer(IteratorT itr)
-    {
-        return NULL;
-    }
-
+    MatrixValueIteratorT            d_matrix_values;            ///< Pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix A.
+    MatrixRowOffsetsIteratorT       d_matrix_row_offsets;       ///< Pointer to the array of \p m + 1 offsets demarcating the start of every row in \p d_matrix_column_indices and \p d_matrix_values (with the final entry being equal to \p num_nonzeros)
+    MatrixColumnIndicesIteratorT    d_matrix_column_indices;    ///< Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements of matrix A.  (Indices are zero-based.)
+    VectorValueIteratorT            d_vector_x;                 ///< Pointer to the array of \p num_cols values corresponding to the dense input vector x
+    ValueT*                         d_vector_y;                 ///< Pointer to the array of \p num_rows values corresponding to the dense output vector y
+    VertexT*                        d_block_carryout_rows;      ///< Pointer to the temporary array of carry-out dot-product row-ids, one per block
+    ValueT*                         d_block_runout_values;      ///< Pointer to the temporary array of carry-out dot-product partial-sums, one per block
+    int                             num_rows;                   ///< The number of rows of matrix A.
+    int                             num_cols;                   ///< The number of columns of matrix A.
+    int                             num_nonzeros;               ///< The number of nonzero elements of matrix A.
 
 
     //---------------------------------------------------------------------
     // Interface
     //---------------------------------------------------------------------
 
-    bool prefer_smem;
-
     /**
      * Constructor
      */
     __device__ __forceinline__ BlockSpmvSweep(
-        TempStorage         &temp_storage,                                      ///< Reference to temp_storage
-        SampleIteratorT     d_samples,                                          ///< Input data to reduce
-        int                 (&num_output_bins)[NUM_ACTIVE_CHANNELS],            ///< The number bins per final output histogram
-        int                 (&num_privatized_bins)[NUM_ACTIVE_CHANNELS],        ///< The number bins per privatized histogram
-        CounterT*           (&d_output_histograms)[NUM_ACTIVE_CHANNELS],        ///< Reference to final output histograms
-        CounterT*           (&d_privatized_histograms)[NUM_ACTIVE_CHANNELS],    ///< Reference to privatized histograms
-        OutputDecodeOpT     (&output_decode_op)[NUM_ACTIVE_CHANNELS],           ///< The transform operator for determining output bin-ids from privatized counter indices, one for each channel
-        PrivatizedDecodeOpT (&privatized_decode_op)[NUM_ACTIVE_CHANNELS])       ///< The transform operator for determining privatized counter indices from samples, one for each channel
+        TempStorage &temp_storage,              ///< Reference to temp_storage
+        ValueT*     d_matrix_values,            ///< [in] Pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix A.
+        OffsetT*    d_matrix_row_offsets,       ///< [in] Pointer to the array of \p m + 1 offsets demarcating the start of every row in \p d_matrix_column_indices and \p d_matrix_values (with the final entry being equal to \p num_nonzeros)
+        VertexT*    d_matrix_column_indices,    ///< [in] Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements of matrix A.  (Indices are zero-based.)
+        ValueT*     d_vector_x,                 ///< [in] Pointer to the array of \p num_cols values corresponding to the dense input vector x
+        ValueT*     d_vector_y,                 ///< [out] Pointer to the array of \p num_rows values corresponding to the dense output vector y
+        VertexT*    d_block_carryout_rows,      ///< [out] Pointer to the temporary array of carry-out dot-product row-ids, one per block
+        ValueT*     d_block_runout_values,      ///< [out] Pointer to the temporary array of carry-out dot-product partial-sums, one per block
+        int         num_rows,                   ///< [in] number of rows of matrix A.
+        int         num_cols,                   ///< [in] number of columns of matrix A.
+        int         num_nonzeros)               ///< [in] number of nonzero elements of matrix A.
     :
         temp_storage(temp_storage.Alias()),
         d_wrapped_samples(d_samples),
diff --git a/cub/device/dispatch/device_spmv_dispatch.cuh b/cub/device/dispatch/device_spmv_dispatch.cuh
index f54424af08..1121ee0446 100644
--- a/cub/device/dispatch/device_spmv_dispatch.cuh
+++ b/cub/device/dispatch/device_spmv_dispatch.cuh
@@ -39,8 +39,6 @@
 #include <iterator>
 
 #include "../../block_sweep/block_spmv_sweep.cuh"
-#include "../device_radix_sort.cuh"
-#include "../../iterator/tex_ref_input_iterator.cuh"
 #include "../../util_debug.cuh"
 #include "../../util_device.cuh"
 #include "../../thread/thread_search.cuh"
@@ -83,13 +81,8 @@ __global__ void DeviceSpmvSweepKernel(
     // Thread block type for compositing input tiles
     typedef BlockSpmvSweep<
             BlockSpmvSweepPolicyT,
-            PRIVATIZED_SMEM_BINS,
-            NUM_CHANNELS,
-            NUM_ACTIVE_CHANNELS,
-            SampleIteratorT,
-            CounterT,
-            PrivatizedDecodeOpT,
-            OutputDecodeOpT,
+            VertexT,
+            ValueT,
             OffsetT>
         BlockSpmvSweepT;
@@ -106,19 +99,13 @@
         output_decode_op_wrapper.array,
         privatized_decode_op_wrapper.array);
 
-    // Initialize counters
-    block_sweep.InitBinCounters();
-
-    // Consume input tiles
+    // Consume input
     block_sweep.ConsumeTiles(
        num_row_pixels,
        num_rows,
        row_stride_samples,
        tiles_per_row,
        tile_queue);
-
-    // Store output to global (if necessary)
-    block_sweep.StoreOutput();
 }
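
For orientation on the new interface: the constructor parameters introduced above describe a standard CSR matrix (row offsets, column indices, nonzero values) plus dense vectors x and y. The kernel below is a minimal one-thread-per-row reference sketch of the y = Ax semantics those arrays encode; it is not the tiled BlockSpmvSweep being added here, and it assumes concrete types (ValueT = float, VertexT = int, OffsetT = int) for illustration.

```cuda
// Reference semantics for y = A*x over the CSR arrays named in the
// BlockSpmvSweep constructor.  A minimal one-thread-per-row sketch for
// clarity -- not the tiled, carry-out-based sweep in this diff.
// Assumes ValueT = float, VertexT = int, OffsetT = int.
__global__ void CsrSpmvReference(
    const float* d_matrix_values,           // num_nonzeros nonzero values
    const int*   d_matrix_row_offsets,      // num_rows + 1 row offsets
    const int*   d_matrix_column_indices,   // num_nonzeros column indices (zero-based)
    const float* d_vector_x,                // dense input vector x (num_cols values)
    float*       d_vector_y,                // dense output vector y (num_rows values)
    int          num_rows)
{
    int row = blockIdx.x * blockDim.x + threadIdx.x;
    if (row < num_rows)
    {
        // Dot product of row 'row' of A with x
        float dot = 0.0f;
        for (int offset = d_matrix_row_offsets[row];
             offset < d_matrix_row_offsets[row + 1];
             ++offset)
        {
            dot += d_matrix_values[offset] * d_vector_x[d_matrix_column_indices[offset]];
        }
        d_vector_y[row] = dot;
    }
}
```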
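The d_block_carryout_rows / d_block_runout_values temporaries exist because the sweep assigns tiles of samples, not whole rows, to thread blocks, so the row being accumulated when a block's tile ends can straddle a block boundary; each block records that row's id and partial sum, and a later pass folds the partials into y. The sketch below is a hypothetical fix-up kernel under the same type assumptions; the kernel name and the atomic strategy are illustrative, not taken from this diff.

```cuda
// Hypothetical fix-up pass for the per-block carry-outs declared above.
// Each sweep block deposited the row-id and partial dot-product of its
// last (unfinished) row; fold those partials into the output vector.
// Several consecutive blocks may carry out the same row, so add atomically.
// Assumes ValueT = float, VertexT = int.
__global__ void FixupBlockCarryouts(
    const int*   d_block_carryout_rows,     // one carry-out row-id per sweep block
    const float* d_block_runout_values,     // one carry-out partial-sum per sweep block
    float*       d_vector_y,                // dense output vector y
    int          num_sweep_blocks)
{
    int block = blockIdx.x * blockDim.x + threadIdx.x;
    if (block < num_sweep_blocks)
    {
        atomicAdd(d_vector_y + d_block_carryout_rows[block], d_block_runout_values[block]);
    }
}
```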