Skip to content
This repository has been archived by the owner on Mar 21, 2024. It is now read-only.

Commit

Permalink
- Added sm52 scan tuning
Browse files Browse the repository at this point in the history
- Added fast in-core processing for very small sorting problems

Former-commit-id: 7702e4dd652a9cd43e93f168d3522dc9a06d1ebe
  • Loading branch information
dumerrill committed Feb 23, 2015
1 parent 6330f93 commit ce261ed
Show file tree
Hide file tree
Showing 6 changed files with 346 additions and 197 deletions.
4 changes: 4 additions & 0 deletions cub/agent/agent_radix_sort_downsweep.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -571,6 +571,9 @@ struct AgentRadixSortDownsweep
GatherScatterValues<FULL_TILE>(values, relative_bin_offsets, ranks, block_offset, valid_items);
}

//---------------------------------------------------------------------
// Copy shortcut
//---------------------------------------------------------------------

/**
* Copy tiles within the range of input
Expand Down Expand Up @@ -728,6 +731,7 @@ struct AgentRadixSortDownsweep
}
}
}

};


Expand Down
39 changes: 0 additions & 39 deletions cub/agent/agent_scan.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -367,7 +367,6 @@ struct AgentScan
GridQueue<int> queue, ///< Queue descriptor for assigning tiles of work to thread blocks
ScanTileState &tile_status) ///< Global list of tile status
{
#if (CUB_PTX_ARCH <= 130)
// Blocks are launched in increasing order, so just assign one tile per block

int tile_idx = (blockIdx.y * gridDim.x) + blockIdx.x; // Current tile index
Expand All @@ -378,44 +377,6 @@ struct AgentScan
ConsumeTile<false>(num_items, num_remaining, tile_idx, block_offset, tile_status);
else if (num_remaining > 0)
ConsumeTile<true>(num_items, num_remaining, tile_idx, block_offset, tile_status);

#else
// Blocks may not be launched in increasing order, so work-steal tiles

// Get first tile index
if (threadIdx.x == 0)
temp_storage.tile_idx = queue.Drain(1);

__syncthreads();

int tile_idx = temp_storage.tile_idx;
OffsetT block_offset = TILE_ITEMS * tile_idx;
OffsetT num_remaining = num_items - block_offset;

while (num_remaining >= TILE_ITEMS)
{
// Consume full tile
ConsumeTile<false>(num_items, num_remaining, tile_idx, block_offset, tile_status);

// Get next tile
if (threadIdx.x == 0)
temp_storage.tile_idx = queue.Drain(1);

__syncthreads();

tile_idx = temp_storage.tile_idx;
block_offset = TILE_ITEMS * tile_idx;
num_remaining = num_items - block_offset;
}

// Consume the last (and potentially partially-full) tile
if (num_remaining > 0)
{
ConsumeTile<true>(num_items, num_remaining, tile_idx, block_offset, tile_status);
}

#endif

}


Expand Down
8 changes: 5 additions & 3 deletions cub/block/block_radix_sort.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -332,6 +332,10 @@ private:
}
}

public:

#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document

/// Sort blocked -> striped arrangement
template <int DESCENDING, int KEYS_ONLY>
__device__ __forceinline__ void SortBlockedToStriped(
Expand Down Expand Up @@ -394,9 +398,7 @@ private:
}
}



public:
#endif // DOXYGEN_SHOULD_SKIP_THIS

/// \smemstorage{BlockScan}
struct TempStorage : Uninitialized<_TempStorage> {};
Expand Down
Loading

0 comments on commit ce261ed

Please sign in to comment.