Changes necessary to support Feta.
Reviewed-by: Bryce Adelstein Lelbach aka wash <[email protected]>
dkolsen-pgi authored and brycelelbach committed May 19, 2020
1 parent 6552e4d commit 1caaac1
Showing 15 changed files with 528 additions and 394 deletions.
4 changes: 2 additions & 2 deletions cub/device/device_reduce.cuh
@@ -150,14 +150,14 @@ struct DeviceReduce
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
OutputIteratorT d_out, ///< [out] Pointer to the output aggregate
int num_items, ///< [in] Total number of input items (i.e., length of \p d_in)
typename thrust::iterator_traits<InputIteratorT>::difference_type num_items, ///< [in] Total number of input items (i.e., length of \p d_in)
ReductionOpT reduction_op, ///< [in] Binary reduction functor
T init, ///< [in] Initial value of the reduction
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false.
{
// Signed integer type for global offsets
typedef int OffsetT;
typedef typename thrust::iterator_traits<InputIteratorT>::difference_type OffsetT;

return DispatchReduce<InputIteratorT, OutputIteratorT, OffsetT, ReductionOpT>::Dispatch(
d_temp_storage,
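The signature change above replaces the hard-coded int num_items (and the internal typedef int OffsetT) with the input iterator's difference_type, so the element count is no longer funneled through a 32-bit int when the iterator reports a wider type (ptrdiff_t for raw pointers). A minimal caller-side sketch, not part of the commit, with hypothetical pointer names and sizes:

// Sketch: two-phase cub::DeviceReduce::Reduce call with a ptrdiff_t count.
// d_in, d_out, and num_items are hypothetical; only the num_items type is new here.
#include <cub/device/device_reduce.cuh>
#include <cstddef>

cudaError_t sum_doubles(const double *d_in, double *d_out,
                        std::ptrdiff_t num_items, cudaStream_t stream)
{
    void   *d_temp_storage     = nullptr;
    size_t  temp_storage_bytes = 0;

    // First call: query the required temporary storage size (no work is done).
    cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes,
                              d_in, d_out, num_items, cub::Sum(), 0.0, stream);

    cudaMalloc(&d_temp_storage, temp_storage_bytes);

    // Second call: run the reduction on the given stream.
    cudaError_t error = cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes,
                                                  d_in, d_out, num_items, cub::Sum(), 0.0, stream);

    cudaFree(d_temp_storage);
    return error;
}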
116 changes: 64 additions & 52 deletions cub/device/dispatch/dispatch_histogram.cuh
@@ -45,6 +45,8 @@
#include "../../grid/grid_queue.cuh"
#include "../../util_namespace.cuh"

#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>

/// Optional outer namespace(s)
CUB_NS_PREFIX

@@ -455,41 +457,46 @@ struct DipatchHistogram
int ptx_version,
KernelConfig &histogram_sweep_config)
{
#if (CUB_PTX_ARCH > 0)

// We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
return histogram_sweep_config.template Init<PtxHistogramSweepPolicy>();

#else

// We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
if (ptx_version >= 500)
{
return histogram_sweep_config.template Init<typename Policy500::HistogramSweepPolicy>();
}
else if (ptx_version >= 350)
{
return histogram_sweep_config.template Init<typename Policy350::HistogramSweepPolicy>();
}
else if (ptx_version >= 300)
{
return histogram_sweep_config.template Init<typename Policy300::HistogramSweepPolicy>();
}
else if (ptx_version >= 200)
{
return histogram_sweep_config.template Init<typename Policy200::HistogramSweepPolicy>();
}
else if (ptx_version >= 110)
{
return histogram_sweep_config.template Init<typename Policy110::HistogramSweepPolicy>();
}
else
{
// No global atomic support
return cudaErrorNotSupported;
}

#endif
cudaError_t result = cudaErrorNotSupported;
if (CUB_IS_DEVICE_CODE)
{
#if CUB_INCLUDE_DEVICE_CODE
// We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
result = histogram_sweep_config.template Init<PtxHistogramSweepPolicy>();
#endif
}
else
{
#if CUB_INCLUDE_HOST_CODE
// We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
if (ptx_version >= 500)
{
result = histogram_sweep_config.template Init<typename Policy500::HistogramSweepPolicy>();
}
else if (ptx_version >= 350)
{
result = histogram_sweep_config.template Init<typename Policy350::HistogramSweepPolicy>();
}
else if (ptx_version >= 300)
{
result = histogram_sweep_config.template Init<typename Policy300::HistogramSweepPolicy>();
}
else if (ptx_version >= 200)
{
result = histogram_sweep_config.template Init<typename Policy200::HistogramSweepPolicy>();
}
else if (ptx_version >= 110)
{
result = histogram_sweep_config.template Init<typename Policy110::HistogramSweepPolicy>();
}
else
{
// No global atomic support
result = cudaErrorNotSupported;
}
#endif
}
return result;
}
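The rewritten InitConfigs above trades the #if (CUB_PTX_ARCH > 0) preprocessor split for an ordinary if on CUB_IS_DEVICE_CODE, with #if CUB_INCLUDE_DEVICE_CODE / #if CUB_INCLUDE_HOST_CODE guards around each branch, so a single-pass compiler can see both sides while each compilation pass only instantiates the code it can actually build. A condensed sketch of the same idiom applied to PTX-version selection; the function name SelectPtxVersion is illustrative rather than a CUB API, and the macros are assumed to come from CUB's utility headers:

// Sketch of the CUB_IS_HOST_CODE / CUB_INCLUDE_HOST_CODE idiom used in this commit.
// SelectPtxVersion is a hypothetical wrapper, not part of CUB.
#include <cub/util_arch.cuh>
#include <cub/util_device.cuh>

CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t SelectPtxVersion(int &ptx_version)
{
    cudaError_t error = cudaSuccess;
    if (CUB_IS_HOST_CODE) {
        #if CUB_INCLUDE_HOST_CODE
            // Host path: ask the runtime which PTX version the target device supports.
            error = cub::PtxVersion(ptx_version);
        #endif
    } else {
        #if CUB_INCLUDE_DEVICE_CODE
            // Device path: the PTX version is the compile-time constant CUB_PTX_ARCH.
            ptx_version = CUB_PTX_ARCH;
        #endif
    }
    return error;
}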


@@ -654,7 +661,10 @@ struct DipatchHistogram
histogram_init_grid_dims, histogram_init_block_threads, (long long) stream);

// Invoke histogram_init_kernel
histogram_init_kernel<<<histogram_init_grid_dims, histogram_init_block_threads, 0, stream>>>(
thrust::cuda_cub::launcher::triple_chevron(
histogram_init_grid_dims, histogram_init_block_threads, 0,
stream
).doit(histogram_init_kernel,
num_output_bins_wrapper,
d_output_histograms_wrapper,
tile_queue);
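The kernel launches in these dispatch layers are rewritten through Thrust's triple_chevron helper, which takes the launch configuration in its constructor and forwards the kernel and its arguments via .doit(...), so the dispatch headers no longer contain <<<...>>> launch syntax. A standalone sketch of the pattern; scale_kernel and its parameters are made up for illustration:

// Sketch of the thrust::cuda_cub::launcher::triple_chevron launch pattern.
// scale_kernel, d_data, factor, and n are hypothetical.
#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>

__global__ void scale_kernel(float *data, float factor, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        data[i] *= factor;
}

cudaError_t launch_scale(float *d_data, float factor, int n, cudaStream_t stream)
{
    const int block_threads = 256;
    const int grid_blocks   = (n + block_threads - 1) / block_threads;

    // Equivalent to: scale_kernel<<<grid_blocks, block_threads, 0, stream>>>(d_data, factor, n);
    thrust::cuda_cub::launcher::triple_chevron(
        grid_blocks, block_threads, 0, stream
    ).doit(scale_kernel, d_data, factor, n);

    return cudaGetLastError();
}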
@@ -669,7 +679,9 @@ struct DipatchHistogram
histogram_sweep_config.block_threads, (long long) stream, histogram_sweep_config.pixels_per_thread, histogram_sweep_sm_occupancy);

// Invoke histogram_sweep_kernel
histogram_sweep_kernel<<<sweep_grid_dims, histogram_sweep_config.block_threads, 0, stream>>>(
thrust::cuda_cub::launcher::triple_chevron(
sweep_grid_dims, histogram_sweep_config.block_threads, 0, stream
).doit(histogram_sweep_kernel,
d_samples,
num_output_bins_wrapper,
num_privatized_bins_wrapper,
@@ -722,11 +734,11 @@ struct DipatchHistogram
{
// Get PTX version
int ptx_version = 0;
#if (CUB_PTX_ARCH == 0)
if (CubDebug(error = PtxVersion(ptx_version))) break;
#else
ptx_version = CUB_PTX_ARCH;
#endif
if (CUB_IS_HOST_CODE) {
if (CubDebug(error = PtxVersion(ptx_version))) break;
} else {
ptx_version = CUB_PTX_ARCH;
}

// Get kernel dispatch configurations
KernelConfig histogram_sweep_config;
@@ -830,11 +842,11 @@ struct DipatchHistogram
{
// Get PTX version
int ptx_version = 0;
#if (CUB_PTX_ARCH == 0)
if (CubDebug(error = PtxVersion(ptx_version))) break;
#else
ptx_version = CUB_PTX_ARCH;
#endif
if (CUB_IS_HOST_CODE) {
if (CubDebug(error = PtxVersion(ptx_version))) break;
} else {
ptx_version = CUB_PTX_ARCH;
}

// Get kernel dispatch configurations
KernelConfig histogram_sweep_config;
@@ -913,11 +925,11 @@ struct DipatchHistogram
{
// Get PTX version
int ptx_version = 0;
#if (CUB_PTX_ARCH == 0)
if (CubDebug(error = PtxVersion(ptx_version))) break;
#else
ptx_version = CUB_PTX_ARCH;
#endif
if (CUB_IS_HOST_CODE) {
if (CubDebug(error = PtxVersion(ptx_version))) break;
} else {
ptx_version = CUB_PTX_ARCH;
}

// Get kernel dispatch configurations
KernelConfig histogram_sweep_config;
@@ -1025,11 +1037,11 @@ struct DipatchHistogram
{
// Get PTX version
int ptx_version = 0;
#if (CUB_PTX_ARCH == 0)
if (CubDebug(error = PtxVersion(ptx_version))) break;
#else
ptx_version = CUB_PTX_ARCH;
#endif
if (CUB_IS_HOST_CODE) {
if (CubDebug(error = PtxVersion(ptx_version))) break;
} else {
ptx_version = CUB_PTX_ARCH;
}

// Get kernel dispatch configurations
KernelConfig histogram_sweep_config;
25 changes: 20 additions & 5 deletions cub/device/dispatch/dispatch_radix_sort.cuh
@@ -47,6 +47,8 @@
#include "../../util_device.cuh"
#include "../../util_namespace.cuh"

#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>

/// Optional outer namespace(s)
CUB_NS_PREFIX

@@ -921,7 +923,9 @@ struct DispatchRadixSort :
ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD, 1, begin_bit, ActivePolicyT::SingleTilePolicy::RADIX_BITS);

// Invoke upsweep_kernel with same grid size as downsweep_kernel
single_tile_kernel<<<1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream>>>(
thrust::cuda_cub::launcher::triple_chevron(
1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream
).doit(single_tile_kernel,
d_keys.Current(),
d_keys.Alternate(),
d_values.Current(),
@@ -979,7 +983,10 @@ struct DispatchRadixSort :
pass_config.upsweep_config.items_per_thread, pass_config.upsweep_config.sm_occupancy, current_bit, pass_bits);

// Invoke upsweep_kernel with same grid size as downsweep_kernel
pass_config.upsweep_kernel<<<pass_config.even_share.grid_size, pass_config.upsweep_config.block_threads, 0, stream>>>(
thrust::cuda_cub::launcher::triple_chevron(
pass_config.even_share.grid_size,
pass_config.upsweep_config.block_threads, 0, stream
).doit(pass_config.upsweep_kernel,
d_keys_in,
d_spine,
num_items,
@@ -998,7 +1005,9 @@ struct DispatchRadixSort :
1, pass_config.scan_config.block_threads, (long long) stream, pass_config.scan_config.items_per_thread);

// Invoke scan_kernel
pass_config.scan_kernel<<<1, pass_config.scan_config.block_threads, 0, stream>>>(
thrust::cuda_cub::launcher::triple_chevron(
1, pass_config.scan_config.block_threads, 0, stream
).doit(pass_config.scan_kernel,
d_spine,
spine_length);

@@ -1014,7 +1023,10 @@ struct DispatchRadixSort :
pass_config.downsweep_config.items_per_thread, pass_config.downsweep_config.sm_occupancy);

// Invoke downsweep_kernel
pass_config.downsweep_kernel<<<pass_config.even_share.grid_size, pass_config.downsweep_config.block_threads, 0, stream>>>(
thrust::cuda_cub::launcher::triple_chevron(
pass_config.even_share.grid_size,
pass_config.downsweep_config.block_threads, 0, stream
).doit(pass_config.downsweep_kernel,
d_keys_in,
d_keys_out,
d_values_in,
@@ -1418,7 +1430,10 @@ struct DispatchSegmentedRadixSort :
num_segments, pass_config.segmented_config.block_threads, (long long) stream,
pass_config.segmented_config.items_per_thread, pass_config.segmented_config.sm_occupancy, current_bit, pass_bits);

pass_config.segmented_kernel<<<num_segments, pass_config.segmented_config.block_threads, 0, stream>>>(
thrust::cuda_cub::launcher::triple_chevron(
num_segments, pass_config.segmented_config.block_threads, 0,
stream
).doit(pass_config.segmented_kernel,
d_keys_in, d_keys_out,
d_values_in, d_values_out,
d_begin_offsets, d_end_offsets, num_segments,
20 changes: 16 additions & 4 deletions cub/device/dispatch/dispatch_reduce.cuh
@@ -46,6 +46,8 @@
#include "../../util_device.cuh"
#include "../../util_namespace.cuh"

#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>

/// Optional outer namespace(s)
CUB_NS_PREFIX

@@ -449,7 +451,9 @@ struct DispatchReduce :
ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD);

// Invoke single_reduce_sweep_kernel
single_tile_kernel<<<1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream>>>(
thrust::cuda_cub::launcher::triple_chevron(
1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream
).doit(single_tile_kernel,
d_in,
d_out,
num_items,
@@ -543,7 +547,10 @@ struct DispatchReduce :
reduce_config.sm_occupancy);

// Invoke DeviceReduceKernel
reduce_kernel<<<reduce_grid_size, ActivePolicyT::ReducePolicy::BLOCK_THREADS, 0, stream>>>(
thrust::cuda_cub::launcher::triple_chevron(
reduce_grid_size, ActivePolicyT::ReducePolicy::BLOCK_THREADS,
0, stream
).doit(reduce_kernel,
d_in,
d_block_reductions,
num_items,
@@ -563,7 +570,9 @@ struct DispatchReduce :
ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD);

// Invoke DeviceReduceSingleTileKernel
single_tile_kernel<<<1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream>>>(
thrust::cuda_cub::launcher::triple_chevron(
1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream
).doit(single_tile_kernel,
d_block_reductions,
d_out,
reduce_grid_size,
@@ -776,7 +785,10 @@ struct DispatchSegmentedReduce :
segmented_reduce_config.sm_occupancy);

// Invoke DeviceReduceKernel
segmented_reduce_kernel<<<num_segments, ActivePolicyT::SegmentedReducePolicy::BLOCK_THREADS, 0, stream>>>(
thrust::cuda_cub::launcher::triple_chevron(
num_segments, ActivePolicyT::SegmentedReducePolicy::BLOCK_THREADS,
0, stream
).doit(segmented_reduce_kernel,
d_in,
d_out,
d_begin_offsets,