diff --git a/cub/block_sweep/block_histogram_sweep.cuh b/cub/block_sweep/block_histogram_sweep.cuh
index 69263e78f4..219b865a1c 100644
--- a/cub/block_sweep/block_histogram_sweep.cuh
+++ b/cub/block_sweep/block_histogram_sweep.cuh
@@ -221,6 +221,9 @@ struct BlockHistogramSweep
     /// The transform operator for determining privatized counter indices from samples, one for each channel
     PrivatizedDecodeOpT (&privatized_decode_op)[NUM_ACTIVE_CHANNELS];
 
+    /// Whether to prefer privatized smem counters vs privatized global counters
+    bool prefer_smem;
+
 
     //---------------------------------------------------------------------
     // Initialize privatized bin counters
@@ -614,7 +617,6 @@ struct BlockHistogramSweep
     // Interface
     //---------------------------------------------------------------------
 
-    bool prefer_smem;
 
     /**
      * Constructor
diff --git a/cub/block_sweep/block_spmv_sweep.cuh b/cub/block_sweep/block_spmv_sweep.cuh
index ac732f0b44..e4503ff82d 100644
--- a/cub/block_sweep/block_spmv_sweep.cuh
+++ b/cub/block_sweep/block_spmv_sweep.cuh
@@ -158,437 +158,37 @@ struct BlockSpmvSweep
     /// Reference to temp_storage
     _TempStorage &temp_storage;
 
-    /// Sample input iterator (with cache modifier applied, if possible)
-    WrappedSampleIteratorT d_wrapped_samples;
-
-    /// Native pointer for input samples (possibly NULL if unavailable)
-    SampleT* d_native_samples;
-
-    /// The number of output bins for each channel
-    int (&num_output_bins)[NUM_ACTIVE_CHANNELS];
-
-    /// The number of privatized bins for each channel
-    int (&num_privatized_bins)[NUM_ACTIVE_CHANNELS];
-
-    /// Reference to gmem privatized histograms for each channel
-    CounterT* d_privatized_histograms[NUM_ACTIVE_CHANNELS];
-
-    /// Reference to final output histograms (gmem)
-    CounterT* (&d_output_histograms)[NUM_ACTIVE_CHANNELS];
-
-    /// The transform operator for determining output bin-ids from privatized counter indices, one for each channel
-    OutputDecodeOpT (&output_decode_op)[NUM_ACTIVE_CHANNELS];
-
-    /// The transform operator for determining privatized counter indices from samples, one for each channel
-    PrivatizedDecodeOpT (&privatized_decode_op)[NUM_ACTIVE_CHANNELS];
-
-
-    //---------------------------------------------------------------------
-    // Initialize privatized bin counters
-    //---------------------------------------------------------------------
-
-    // Initialize privatized bin counters
-    __device__ __forceinline__ void InitBinCounters(CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS])
-    {
-        // Initialize histogram bin counts to zeros
-        #pragma unroll
-        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
-        {
-            for (int privatized_bin = threadIdx.x; privatized_bin < num_privatized_bins[CHANNEL]; privatized_bin += BLOCK_THREADS)
-            {
-                privatized_histograms[CHANNEL][privatized_bin] = 0;
-            }
-        }
-
-        // Barrier to make sure all threads are done updating counters
-        __syncthreads();
-    }
-
-
-    // Initialize privatized bin counters.  Specialized for privatized shared-memory counters
-    __device__ __forceinline__ void InitSmemBinCounters()
-    {
-        CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS];
-
-        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
-            privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL];
-
-        InitBinCounters(privatized_histograms);
-    }
-
-
-    // Initialize privatized bin counters.  Specialized for privatized global-memory counters
-    __device__ __forceinline__ void InitGmemBinCounters()
-    {
-        InitBinCounters(d_privatized_histograms);
-    }
-
-
-    //---------------------------------------------------------------------
-    // Update final output histograms
-    //---------------------------------------------------------------------
-
-    // Update final output histograms from privatized histograms
-    __device__ __forceinline__ void StoreOutput(CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS])
-    {
-        // Barrier to make sure all threads are done updating counters
-        __syncthreads();
-
-        // Apply privatized bin counts to output bin counts
-        #pragma unroll
-        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
-        {
-            int channel_bins = num_privatized_bins[CHANNEL];
-            for (int privatized_bin = threadIdx.x;
-                    privatized_bin < channel_bins;
-                    privatized_bin += BLOCK_THREADS)
-            {
-                int         output_bin;
-                CounterT    count       = privatized_histograms[CHANNEL][privatized_bin];
-                bool        is_valid    = count > 0;
-                output_decode_op[CHANNEL].BinSelect((SampleT) privatized_bin, output_bin, is_valid);
-
-                if (output_bin >= 0)
-                {
-                    atomicAdd(&d_output_histograms[CHANNEL][output_bin], count);
-                }
-
-            }
-        }
-    }
-
-
-    // Update final output histograms from privatized histograms.  Specialized for privatized shared-memory counters
-    __device__ __forceinline__ void StoreSmemOutput()
-    {
-        CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS];
-        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
-            privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL];
-
-        StoreOutput(privatized_histograms);
-    }
-
-
-    // Update final output histograms from privatized histograms.  Specialized for privatized global-memory counters
-    __device__ __forceinline__ void StoreGmemOutput()
-    {
-        StoreOutput(d_privatized_histograms);
-    }
-
-
-    //---------------------------------------------------------------------
-    // Accumulate privatized histograms
-    //---------------------------------------------------------------------
-
-    // Accumulate pixels.  Specialized for RLE compression.
-    __device__ __forceinline__ void AccumulatePixels(
-        SampleT         samples[PIXELS_PER_THREAD][NUM_CHANNELS],
-        bool            is_valid[PIXELS_PER_THREAD],
-        CounterT*       privatized_histograms[NUM_ACTIVE_CHANNELS],
-        Int2Type<true>  is_rle_compress)
-    {
-
-        #pragma unroll
-        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
-        {
-            // Bin pixels
-            int bins[PIXELS_PER_THREAD];
-
-            #pragma unroll
-            for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL)
-                privatized_decode_op[CHANNEL].BinSelect(samples[PIXEL][CHANNEL], bins[PIXEL], is_valid[PIXEL]);
-
-            CounterT accumulator = 1;
-
-            #pragma unroll
-            for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD - 1; ++PIXEL)
-            {
-
-                if (bins[PIXEL] < 0)
-                {
-                    accumulator = 1;
-                }
-                else if (bins[PIXEL] == bins[PIXEL + 1])
-                {
-                    accumulator++;
-                }
-                else
-                {
-                    atomicAdd(privatized_histograms[CHANNEL] + bins[PIXEL], accumulator);
-                    accumulator = 1;
-                }
-            }
-
-            // Last pixel
-            if (bins[PIXELS_PER_THREAD - 1] >= 0)
-                atomicAdd(privatized_histograms[CHANNEL] + bins[PIXELS_PER_THREAD - 1], accumulator);
-
-        }
-    }
-
-
-    // Accumulate pixels.  Specialized for individual accumulation of each pixel.
-    __device__ __forceinline__ void AccumulatePixels(
-        SampleT         samples[PIXELS_PER_THREAD][NUM_CHANNELS],
-        bool            is_valid[PIXELS_PER_THREAD],
-        CounterT*       privatized_histograms[NUM_ACTIVE_CHANNELS],
-        Int2Type<false> is_rle_compress)
-    {
-        #pragma unroll
-        for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL)
-        {
-            #pragma unroll
-            for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
-            {
-                int bin;
-                privatized_decode_op[CHANNEL].BinSelect(samples[PIXEL][CHANNEL], bin, is_valid[PIXEL]);
-                if (bin >= 0)
-                    atomicAdd(privatized_histograms[CHANNEL] + bin, 1);
-            }
-        }
-    }
-
-
-    /**
-     * Accumulate pixel, specialized for smem privatized histogram
-     */
-    __device__ __forceinline__ void AccumulateSmemPixels(
-        SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS],
-        bool    is_valid[PIXELS_PER_THREAD])
-    {
-        CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS];
-
-        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
-            privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL];
-
-        AccumulatePixels(samples, is_valid, privatized_histograms, Int2Type<IS_RLE_COMPRESS>());
-    }
-
-
-    /**
-     * Accumulate pixel, specialized for gmem privatized histogram
-     */
-    __device__ __forceinline__ void AccumulateGmemPixels(
-        SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS],
-        bool    is_valid[PIXELS_PER_THREAD])
-    {
-        AccumulatePixels(samples, is_valid, d_privatized_histograms, Int2Type<IS_RLE_COMPRESS>());
-    }
-
-
-
-    //---------------------------------------------------------------------
-    // Tile processing
-    //---------------------------------------------------------------------
-
-    // Load full, aligned tile using pixel iterator
-    __device__ __forceinline__ void LoadTile(
-        OffsetT         block_offset,
-        int             valid_samples,
-        SampleT         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
-        Int2Type<true>  is_full_tile,
-        Int2Type<true>  is_aligned)
-    {
-        typedef PixelT AliasedPixels[PIXELS_PER_THREAD];
-
-        WrappedPixelIteratorT d_wrapped_pixels((PixelT*) (d_native_samples + block_offset));
-
-        // Load using a wrapped pixel iterator
-        BlockLoadPixelT(temp_storage.pixel_load).Load(
-            d_wrapped_pixels,
-            reinterpret_cast<AliasedPixels&>(samples));
-    }
-
-    // Load full, mis-aligned tile using sample iterator
-    __device__ __forceinline__ void LoadTile(
-        OffsetT         block_offset,
-        int             valid_samples,
-        SampleT         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
-        Int2Type<true>  is_full_tile,
-        Int2Type<false> is_aligned)
-    {
-        typedef SampleT AliasedSamples[SAMPLES_PER_THREAD];
-
-        // Load using sample iterator
-        BlockLoadSampleT(temp_storage.sample_load).Load(
-            d_wrapped_samples + block_offset,
-            reinterpret_cast<AliasedSamples&>(samples));
-    }
-
-    // Load partially-full, aligned tile using pixel iterator
-    __device__ __forceinline__ void LoadTile(
-        OffsetT         block_offset,
-        int             valid_samples,
-        SampleT         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
-        Int2Type<false> is_full_tile,
-        Int2Type<true>  is_aligned)
-    {
-        typedef PixelT AliasedPixels[PIXELS_PER_THREAD];
-
-        WrappedPixelIteratorT d_wrapped_pixels((PixelT*) (d_native_samples + block_offset));
-
-        int valid_pixels = valid_samples / NUM_CHANNELS;
-
-        // Load using a wrapped pixel iterator
-        BlockLoadPixelT(temp_storage.pixel_load).Load(
-            d_wrapped_pixels,
-            reinterpret_cast<AliasedPixels&>(samples),
-            valid_pixels);
-
-    }
-
-    // Load partially-full, mis-aligned tile using sample iterator
-    __device__ __forceinline__ void LoadTile(
-        OffsetT         block_offset,
-        int             valid_samples,
-        SampleT         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
-        Int2Type<false> is_full_tile,
-        Int2Type<false> is_aligned)
-    {
-        typedef SampleT AliasedSamples[SAMPLES_PER_THREAD];
-
-        BlockLoadSampleT(temp_storage.sample_load).Load(
-            d_wrapped_samples + block_offset,
-            reinterpret_cast<AliasedSamples&>(samples),
-            valid_samples);
-    }
-
-    // Consume a tile of data samples
-    template <
-        bool IS_ALIGNED,        // Whether the tile offset is aligned (quad-aligned for single-channel, pixel-aligned for multi-channel)
-        bool IS_FULL_TILE>      // Whether the tile is full
-    __device__ __forceinline__ void ConsumeTile(OffsetT block_offset, int valid_samples)
-    {
-        SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS];
-        bool is_valid[PIXELS_PER_THREAD];
-
-        // Load tile
-        LoadTile(
-            block_offset,
-            valid_samples,
-            samples,
-            Int2Type<IS_FULL_TILE>(),
-            Int2Type<IS_ALIGNED>());
-
-        // Set valid flags
-        #pragma unroll
-        for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL)
-            is_valid[PIXEL] = IS_FULL_TILE || (((threadIdx.x * PIXELS_PER_THREAD + PIXEL) * NUM_CHANNELS) < valid_samples);
-
-        // Accumulate samples
-        if (prefer_smem)
-            AccumulateSmemPixels(samples, is_valid);
-        else
-            AccumulateGmemPixels(samples, is_valid);
-
-    }
-
-
-    // Consume row tiles.  Specialized for work-stealing from queue
-    template <bool IS_ALIGNED>
-    __device__ __forceinline__ void ConsumeTiles(
-        OffsetT             row_offset,
-        OffsetT             row_end,
-        int                 tiles_per_row,
-        GridQueue<OffsetT>  tile_queue,
-        Int2Type<true>      is_work_stealing)
-    {
-        OffsetT tile_offset     = blockIdx.x * TILE_SAMPLES;
-        OffsetT num_remaining   = row_end - tile_offset;
-        OffsetT even_share_base = gridDim.x * TILE_SAMPLES;
-
-        while (num_remaining >= TILE_SAMPLES)
-        {
-            // Consume full tile
-            ConsumeTile<IS_ALIGNED, true>(tile_offset, TILE_SAMPLES);
-
-            __syncthreads();
-
-            // Get next tile
-            if (threadIdx.x == 0)
-                temp_storage.tile_offset = tile_queue.Drain(TILE_SAMPLES) + even_share_base;
-
-            __syncthreads();
-
-            tile_offset     = temp_storage.tile_offset;
-            num_remaining   = row_end - tile_offset;
-
-        }
-
-        if (num_remaining > 0)
-        {
-            // Consume the last (and potentially partially-full) tile
-            ConsumeTile<IS_ALIGNED, false>(tile_offset, num_remaining);
-        }
-    }
-
-
-    // Consume row tiles.  Specialized for even-share (striped across thread blocks)
-    template <bool IS_ALIGNED>
-    __device__ __forceinline__ void ConsumeTiles(
-        OffsetT             row_offset,
-        OffsetT             row_end,
-        int                 tiles_per_row,
-        GridQueue<OffsetT>  tile_queue,
-        Int2Type<false>     is_work_stealing)
-    {
-        OffsetT tile_offset = row_offset + (blockIdx.x * TILE_SAMPLES);
-        while (tile_offset + TILE_SAMPLES <= row_end)
-        {
-            ConsumeTile<IS_ALIGNED, true>(tile_offset, TILE_SAMPLES);
-            tile_offset += gridDim.x * TILE_SAMPLES;
-        }
-
-        if (tile_offset < row_end)
-        {
-            int valid_samples = row_end - tile_offset;
-            ConsumeTile<IS_ALIGNED, false>(tile_offset, valid_samples);
-        }
-    }
-
-
-    //---------------------------------------------------------------------
-    // Parameter extraction
-    //---------------------------------------------------------------------
-
-    // Return a native pixel pointer (specialized for CacheModifiedInputIterator types)
-    template <
-        CacheLoadModifier   _MODIFIER,
-        typename            _ValueT,
-        typename            _OffsetT>
-    __device__ __forceinline__ SampleT* NativePointer(CacheModifiedInputIterator<_MODIFIER, _ValueT, _OffsetT> itr)
-    {
-        return itr.ptr;
-    }
-
-    // Return a native pixel pointer (specialized for other types)
-    template <typename IteratorT>
-    __device__ __forceinline__ SampleT* NativePointer(IteratorT itr)
-    {
-        return NULL;
-    }
-
+    MatrixValueIteratorT            d_matrix_values;            ///< Pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix A.
+    MatrixRowOffsetsIteratorT       d_matrix_row_offsets;       ///< Pointer to the array of \p m + 1 offsets demarcating the start of every row in \p d_matrix_column_indices and \p d_matrix_values (with the final entry being equal to \p num_nonzeros)
+    MatrixColumnIndicesIteratorT    d_matrix_column_indices;    ///< Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements of matrix A.  (Indices are zero-based.)
+    VectorValueIteratorT            d_vector_x;                 ///< Pointer to the array of \p num_cols values corresponding to the dense input vector x
+    ValueT*                         d_vector_y;                 ///< Pointer to the array of \p num_rows values corresponding to the dense output vector y
+    VertexT*                        d_block_carryout_rows;      ///< Pointer to the temporary array of carry-out dot-product row-ids, one per block
+    ValueT*                         d_block_runout_values;      ///< Pointer to the temporary array of carry-out dot-product partial-sums, one per block
+    int                             num_rows;                   ///< The number of rows of matrix A.
+    int                             num_cols;                   ///< The number of columns of matrix A.
+    int                             num_nonzeros;               ///< The number of nonzero elements of matrix A.
 
 
     //---------------------------------------------------------------------
     // Interface
     //---------------------------------------------------------------------
 
-    bool prefer_smem;
-
     /**
      * Constructor
      */
     __device__ __forceinline__ BlockSpmvSweep(
-        TempStorage         &temp_storage,                                      ///< Reference to temp_storage
-        SampleIteratorT     d_samples,                                          ///< Input data to reduce
-        int                 (&num_output_bins)[NUM_ACTIVE_CHANNELS],            ///< The number bins per final output histogram
-        int                 (&num_privatized_bins)[NUM_ACTIVE_CHANNELS],        ///< The number bins per privatized histogram
-        CounterT*           (&d_output_histograms)[NUM_ACTIVE_CHANNELS],        ///< Reference to final output histograms
-        CounterT*           (&d_privatized_histograms)[NUM_ACTIVE_CHANNELS],    ///< Reference to privatized histograms
-        OutputDecodeOpT     (&output_decode_op)[NUM_ACTIVE_CHANNELS],           ///< The transform operator for determining output bin-ids from privatized counter indices, one for each channel
-        PrivatizedDecodeOpT (&privatized_decode_op)[NUM_ACTIVE_CHANNELS])       ///< The transform operator for determining privatized counter indices from samples, one for each channel
+        TempStorage &temp_storage,              ///< Reference to temp_storage
+        ValueT*     d_matrix_values,            ///< [in] Pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix A.
+        OffsetT*    d_matrix_row_offsets,       ///< [in] Pointer to the array of \p m + 1 offsets demarcating the start of every row in \p d_matrix_column_indices and \p d_matrix_values (with the final entry being equal to \p num_nonzeros)
+        VertexT*    d_matrix_column_indices,    ///< [in] Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements of matrix A.  (Indices are zero-based.)
+        ValueT*     d_vector_x,                 ///< [in] Pointer to the array of \p num_cols values corresponding to the dense input vector x
+        ValueT*     d_vector_y,                 ///< [out] Pointer to the array of \p num_rows values corresponding to the dense output vector y
+        VertexT*    d_block_carryout_rows,      ///< [out] Pointer to the temporary array of carry-out dot-product row-ids, one per block
+        ValueT*     d_block_runout_values,      ///< [out] Pointer to the temporary array of carry-out dot-product partial-sums, one per block
+        int         num_rows,                   ///< [in] number of rows of matrix A.
+        int         num_cols,                   ///< [in] number of columns of matrix A.
+        int         num_nonzeros)               ///< [in] number of nonzero elements of matrix A.
     :
         temp_storage(temp_storage.Alias()),
         d_wrapped_samples(d_samples),
diff --git a/cub/device/dispatch/device_spmv_dispatch.cuh b/cub/device/dispatch/device_spmv_dispatch.cuh
index f54424af08..1121ee0446 100644
--- a/cub/device/dispatch/device_spmv_dispatch.cuh
+++ b/cub/device/dispatch/device_spmv_dispatch.cuh
@@ -39,8 +39,6 @@
 #include <iterator>
 
 #include "../../block_sweep/block_spmv_sweep.cuh"
-#include "../device_radix_sort.cuh"
-#include "../../iterator/tex_ref_input_iterator.cuh"
 #include "../../util_debug.cuh"
 #include "../../util_device.cuh"
 #include "../../thread/thread_search.cuh"
@@ -83,13 +81,8 @@ __global__ void DeviceSpmvSweepKernel(
     // Thread block type for compositing input tiles
     typedef BlockSpmvSweep<
             BlockSpmvSweepPolicyT,
-            PRIVATIZED_SMEM_BINS,
-            NUM_CHANNELS,
-            NUM_ACTIVE_CHANNELS,
-            SampleIteratorT,
-            CounterT,
-            PrivatizedDecodeOpT,
-            OutputDecodeOpT,
+            VertexT,
+            ValueT,
             OffsetT>
         BlockSpmvSweepT;
@@ -106,19 +99,13 @@
         output_decode_op_wrapper.array,
         privatized_decode_op_wrapper.array);
 
-    // Initialize counters
-    block_sweep.InitBinCounters();
-
-    // Consume input tiles
+    // Consume input
     block_sweep.ConsumeTiles(
        num_row_pixels,
        num_rows,
        row_stride_samples,
        tiles_per_row,
        tile_queue);
-
-    // Store output to global (if necessary)
-    block_sweep.StoreOutput();
 }
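
For orientation on the new interface: the constructor parameters introduced above describe a standard CSR matrix (row offsets, column indices, nonzero values) plus dense vectors x and y. The kernel below is a minimal one-thread-per-row reference sketch of the y = Ax semantics those arrays encode; it is not the tiled BlockSpmvSweep being added here, and it assumes concrete types (ValueT = float, VertexT = int, OffsetT = int) for illustration.

```cuda
// Reference semantics for y = A*x over the CSR arrays named in the
// BlockSpmvSweep constructor.  A minimal one-thread-per-row sketch for
// clarity -- not the tiled, carry-out-based sweep in this diff.
// Assumes ValueT = float, VertexT = int, OffsetT = int.
__global__ void CsrSpmvReference(
    const float* d_matrix_values,           // num_nonzeros nonzero values
    const int*   d_matrix_row_offsets,      // num_rows + 1 row offsets
    const int*   d_matrix_column_indices,   // num_nonzeros column indices (zero-based)
    const float* d_vector_x,                // dense input vector x (num_cols values)
    float*       d_vector_y,                // dense output vector y (num_rows values)
    int          num_rows)
{
    int row = blockIdx.x * blockDim.x + threadIdx.x;
    if (row < num_rows)
    {
        // Dot product of row 'row' of A with x
        float dot = 0.0f;
        for (int offset = d_matrix_row_offsets[row];
             offset < d_matrix_row_offsets[row + 1];
             ++offset)
        {
            dot += d_matrix_values[offset] * d_vector_x[d_matrix_column_indices[offset]];
        }
        d_vector_y[row] = dot;
    }
}
```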
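The d_block_carryout_rows / d_block_runout_values temporaries exist because the sweep assigns tiles of samples, not whole rows, to thread blocks, so the row being accumulated when a block's tile ends can straddle a block boundary; each block records that row's id and partial sum, and a later pass folds the partials into y. The sketch below is a hypothetical fix-up kernel under the same type assumptions; the kernel name and the atomic strategy are illustrative, not taken from this diff.

```cuda
// Hypothetical fix-up pass for the per-block carry-outs declared above.
// Each sweep block deposited the row-id and partial dot-product of its
// last (unfinished) row; fold those partials into the output vector.
// Several consecutive blocks may carry out the same row, so add atomically.
// Assumes ValueT = float, VertexT = int.
__global__ void FixupBlockCarryouts(
    const int*   d_block_carryout_rows,     // one carry-out row-id per sweep block
    const float* d_block_runout_values,     // one carry-out partial-sum per sweep block
    float*       d_vector_y,                // dense output vector y
    int          num_sweep_blocks)
{
    int block = blockIdx.x * blockDim.x + threadIdx.x;
    if (block < num_sweep_blocks)
    {
        atomicAdd(d_vector_y + d_block_carryout_rows[block], d_block_runout_values[block]);
    }
}
```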