Skip to content
This repository has been archived by the owner on Apr 3, 2024. It is now read-only.

Commit

Permalink
Changes necessary to support Feta.
Browse files Browse the repository at this point in the history
Reviewed-by: Bryce Adelstein Lelbach aka wash <[email protected]>

Bug 2839527
  • Loading branch information
dkolsen-pgi authored and brycelelbach committed Apr 2, 2020
1 parent 5102543 commit 35e4f69
Show file tree
Hide file tree
Showing 15 changed files with 513 additions and 389 deletions.
92 changes: 44 additions & 48 deletions cub/device/dispatch/dispatch_histogram.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@
#include "../../grid/grid_queue.cuh"
#include "../../config.cuh"

#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>

/// Optional outer namespace(s)
CUB_NS_PREFIX

Expand Down Expand Up @@ -455,41 +457,46 @@ struct DipatchHistogram
int ptx_version,
KernelConfig &histogram_sweep_config)
{
#if (CUB_PTX_ARCH > 0)

// We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
return histogram_sweep_config.template Init<PtxHistogramSweepPolicy>();

#else

// We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
if (ptx_version >= 500)
{
return histogram_sweep_config.template Init<typename Policy500::HistogramSweepPolicy>();
}
else if (ptx_version >= 350)
{
return histogram_sweep_config.template Init<typename Policy350::HistogramSweepPolicy>();
}
else if (ptx_version >= 300)
cudaError_t result = cudaErrorNotSupported;
if (CUB_IS_DEVICE_CODE)
{
return histogram_sweep_config.template Init<typename Policy300::HistogramSweepPolicy>();
}
else if (ptx_version >= 200)
{
return histogram_sweep_config.template Init<typename Policy200::HistogramSweepPolicy>();
}
else if (ptx_version >= 110)
{
return histogram_sweep_config.template Init<typename Policy110::HistogramSweepPolicy>();
#if CUB_INCLUDE_DEVICE_CODE
// We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
result = histogram_sweep_config.template Init<PtxHistogramSweepPolicy>();
#endif
}
else
{
// No global atomic support
return cudaErrorNotSupported;
#if CUB_INCLUDE_HOST_CODE
// We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
if (ptx_version >= 500)
{
result = histogram_sweep_config.template Init<typename Policy500::HistogramSweepPolicy>();
}
else if (ptx_version >= 350)
{
result = histogram_sweep_config.template Init<typename Policy350::HistogramSweepPolicy>();
}
else if (ptx_version >= 300)
{
result = histogram_sweep_config.template Init<typename Policy300::HistogramSweepPolicy>();
}
else if (ptx_version >= 200)
{
result = histogram_sweep_config.template Init<typename Policy200::HistogramSweepPolicy>();
}
else if (ptx_version >= 110)
{
result = histogram_sweep_config.template Init<typename Policy110::HistogramSweepPolicy>();
}
else
{
// No global atomic support
result = cudaErrorNotSupported;
}
#endif
}

#endif
return result;
}


Expand Down Expand Up @@ -654,7 +661,10 @@ struct DipatchHistogram
histogram_init_grid_dims, histogram_init_block_threads, (long long) stream);

// Invoke histogram_init_kernel
histogram_init_kernel<<<histogram_init_grid_dims, histogram_init_block_threads, 0, stream>>>(
thrust::cuda_cub::launcher::triple_chevron(
histogram_init_grid_dims, histogram_init_block_threads, 0,
stream
).doit(histogram_init_kernel,
num_output_bins_wrapper,
d_output_histograms_wrapper,
tile_queue);
Expand All @@ -669,7 +679,9 @@ struct DipatchHistogram
histogram_sweep_config.block_threads, (long long) stream, histogram_sweep_config.pixels_per_thread, histogram_sweep_sm_occupancy);

// Invoke histogram_sweep_kernel
histogram_sweep_kernel<<<sweep_grid_dims, histogram_sweep_config.block_threads, 0, stream>>>(
thrust::cuda_cub::launcher::triple_chevron(
sweep_grid_dims, histogram_sweep_config.block_threads, 0, stream
).doit(histogram_sweep_kernel,
d_samples,
num_output_bins_wrapper,
num_privatized_bins_wrapper,
Expand Down Expand Up @@ -722,11 +734,7 @@ struct DipatchHistogram
{
// Get PTX version
int ptx_version = 0;
#if (CUB_PTX_ARCH == 0)
if (CubDebug(error = PtxVersion(ptx_version))) break;
#else
ptx_version = CUB_PTX_ARCH;
#endif

// Get kernel dispatch configurations
KernelConfig histogram_sweep_config;
Expand Down Expand Up @@ -830,11 +838,7 @@ struct DipatchHistogram
{
// Get PTX version
int ptx_version = 0;
#if (CUB_PTX_ARCH == 0)
if (CubDebug(error = PtxVersion(ptx_version))) break;
#else
ptx_version = CUB_PTX_ARCH;
#endif

// Get kernel dispatch configurations
KernelConfig histogram_sweep_config;
Expand Down Expand Up @@ -913,11 +917,7 @@ struct DipatchHistogram
{
// Get PTX version
int ptx_version = 0;
#if (CUB_PTX_ARCH == 0)
if (CubDebug(error = PtxVersion(ptx_version))) break;
#else
ptx_version = CUB_PTX_ARCH;
#endif

// Get kernel dispatch configurations
KernelConfig histogram_sweep_config;
Expand Down Expand Up @@ -1025,11 +1025,7 @@ struct DipatchHistogram
{
// Get PTX version
int ptx_version = 0;
#if (CUB_PTX_ARCH == 0)
if (CubDebug(error = PtxVersion(ptx_version))) break;
#else
ptx_version = CUB_PTX_ARCH;
#endif

// Get kernel dispatch configurations
KernelConfig histogram_sweep_config;
Expand Down
25 changes: 20 additions & 5 deletions cub/device/dispatch/dispatch_radix_sort.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@
#include "../../util_debug.cuh"
#include "../../util_device.cuh"

#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>

/// Optional outer namespace(s)
CUB_NS_PREFIX

Expand Down Expand Up @@ -922,7 +924,9 @@ struct DispatchRadixSort :
ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD, 1, begin_bit, ActivePolicyT::SingleTilePolicy::RADIX_BITS);

// Invoke upsweep_kernel with same grid size as downsweep_kernel
single_tile_kernel<<<1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream>>>(
thrust::cuda_cub::launcher::triple_chevron(
1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream
).doit(single_tile_kernel,
d_keys.Current(),
d_keys.Alternate(),
d_values.Current(),
Expand Down Expand Up @@ -980,7 +984,10 @@ struct DispatchRadixSort :
pass_config.upsweep_config.items_per_thread, pass_config.upsweep_config.sm_occupancy, current_bit, pass_bits);

// Invoke upsweep_kernel with same grid size as downsweep_kernel
pass_config.upsweep_kernel<<<pass_config.even_share.grid_size, pass_config.upsweep_config.block_threads, 0, stream>>>(
thrust::cuda_cub::launcher::triple_chevron(
pass_config.even_share.grid_size,
pass_config.upsweep_config.block_threads, 0, stream
).doit(pass_config.upsweep_kernel,
d_keys_in,
d_spine,
num_items,
Expand All @@ -999,7 +1006,9 @@ struct DispatchRadixSort :
1, pass_config.scan_config.block_threads, (long long) stream, pass_config.scan_config.items_per_thread);

// Invoke scan_kernel
pass_config.scan_kernel<<<1, pass_config.scan_config.block_threads, 0, stream>>>(
thrust::cuda_cub::launcher::triple_chevron(
1, pass_config.scan_config.block_threads, 0, stream
).doit(pass_config.scan_kernel,
d_spine,
spine_length);

Expand All @@ -1015,7 +1024,10 @@ struct DispatchRadixSort :
pass_config.downsweep_config.items_per_thread, pass_config.downsweep_config.sm_occupancy);

// Invoke downsweep_kernel
pass_config.downsweep_kernel<<<pass_config.even_share.grid_size, pass_config.downsweep_config.block_threads, 0, stream>>>(
thrust::cuda_cub::launcher::triple_chevron(
pass_config.even_share.grid_size,
pass_config.downsweep_config.block_threads, 0, stream
).doit(pass_config.downsweep_kernel,
d_keys_in,
d_keys_out,
d_values_in,
Expand Down Expand Up @@ -1429,7 +1441,10 @@ struct DispatchSegmentedRadixSort :
pass_bits);
}

pass_config.segmented_kernel<<<num_segments, pass_config.segmented_config.block_threads, 0, stream>>>(
thrust::cuda_cub::launcher::triple_chevron(
num_segments, pass_config.segmented_config.block_threads, 0,
stream
).doit(pass_config.segmented_kernel,
d_keys_in, d_keys_out,
d_values_in, d_values_out,
d_begin_offsets, d_end_offsets, num_segments,
Expand Down
20 changes: 16 additions & 4 deletions cub/device/dispatch/dispatch_reduce.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@
#include "../../util_debug.cuh"
#include "../../util_device.cuh"

#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>

/// Optional outer namespace(s)
CUB_NS_PREFIX

Expand Down Expand Up @@ -450,7 +452,9 @@ struct DispatchReduce :
ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD);

// Invoke single_reduce_sweep_kernel
single_tile_kernel<<<1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream>>>(
thrust::cuda_cub::launcher::triple_chevron(
1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream
).doit(single_tile_kernel,
d_in,
d_out,
num_items,
Expand Down Expand Up @@ -544,7 +548,10 @@ struct DispatchReduce :
reduce_config.sm_occupancy);

// Invoke DeviceReduceKernel
reduce_kernel<<<reduce_grid_size, ActivePolicyT::ReducePolicy::BLOCK_THREADS, 0, stream>>>(
thrust::cuda_cub::launcher::triple_chevron(
reduce_grid_size, ActivePolicyT::ReducePolicy::BLOCK_THREADS,
0, stream
).doit(reduce_kernel,
d_in,
d_block_reductions,
num_items,
Expand All @@ -564,7 +571,9 @@ struct DispatchReduce :
ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD);

// Invoke DeviceReduceSingleTileKernel
single_tile_kernel<<<1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream>>>(
thrust::cuda_cub::launcher::triple_chevron(
1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream
).doit(single_tile_kernel,
d_block_reductions,
d_out,
reduce_grid_size,
Expand Down Expand Up @@ -778,7 +787,10 @@ struct DispatchSegmentedReduce :
segmented_reduce_config.sm_occupancy);

// Invoke DeviceReduceKernel
segmented_reduce_kernel<<<num_segments, ActivePolicyT::SegmentedReducePolicy::BLOCK_THREADS, 0, stream>>>(
thrust::cuda_cub::launcher::triple_chevron(
num_segments,
ActivePolicyT::SegmentedReducePolicy::BLOCK_THREADS, 0, stream
).doit(segmented_reduce_kernel,
d_in,
d_out,
d_begin_offsets,
Expand Down
70 changes: 38 additions & 32 deletions cub/device/dispatch/dispatch_reduce_by_key.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@
#include "../../grid/grid_queue.cuh"
#include "../../util_device.cuh"

#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>

/// Optional outer namespace(s)
CUB_NS_PREFIX

Expand Down Expand Up @@ -284,37 +286,40 @@ struct DispatchReduceByKey
int ptx_version,
KernelConfig &reduce_by_key_config)
{
#if (CUB_PTX_ARCH > 0)
(void)ptx_version;

// We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
reduce_by_key_config.template Init<PtxReduceByKeyPolicy>();

#else

// We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
if (ptx_version >= 350)
{
reduce_by_key_config.template Init<typename Policy350::ReduceByKeyPolicyT>();
}
else if (ptx_version >= 300)
if (CUB_IS_DEVICE_CODE)
{
reduce_by_key_config.template Init<typename Policy300::ReduceByKeyPolicyT>();
}
else if (ptx_version >= 200)
{
reduce_by_key_config.template Init<typename Policy200::ReduceByKeyPolicyT>();
}
else if (ptx_version >= 130)
{
reduce_by_key_config.template Init<typename Policy130::ReduceByKeyPolicyT>();
#if CUB_INCLUDE_DEVICE_CODE
(void)ptx_version;
// We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
reduce_by_key_config.template Init<PtxReduceByKeyPolicy>();
#endif
}
else
{
reduce_by_key_config.template Init<typename Policy110::ReduceByKeyPolicyT>();
#if CUB_INCLUDE_HOST_CODE
// We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
if (ptx_version >= 350)
{
reduce_by_key_config.template Init<typename Policy350::ReduceByKeyPolicyT>();
}
else if (ptx_version >= 300)
{
reduce_by_key_config.template Init<typename Policy300::ReduceByKeyPolicyT>();
}
else if (ptx_version >= 200)
{
reduce_by_key_config.template Init<typename Policy200::ReduceByKeyPolicyT>();
}
else if (ptx_version >= 130)
{
reduce_by_key_config.template Init<typename Policy130::ReduceByKeyPolicyT>();
}
else
{
reduce_by_key_config.template Init<typename Policy110::ReduceByKeyPolicyT>();
}
#endif
}

#endif
}


Expand Down Expand Up @@ -428,7 +433,9 @@ struct DispatchReduceByKey
if (debug_synchronous) _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);

// Invoke init_kernel to initialize tile descriptors
init_kernel<<<init_grid_size, INIT_KERNEL_THREADS, 0, stream>>>(
thrust::cuda_cub::launcher::triple_chevron(
init_grid_size, INIT_KERNEL_THREADS, 0, stream
).doit(init_kernel,
tile_state,
num_tiles,
d_num_runs_out);
Expand Down Expand Up @@ -463,7 +470,10 @@ struct DispatchReduceByKey
start_tile, scan_grid_size, reduce_by_key_config.block_threads, (long long) stream, reduce_by_key_config.items_per_thread, reduce_by_key_sm_occupancy);

// Invoke reduce_by_key_kernel
reduce_by_key_kernel<<<scan_grid_size, reduce_by_key_config.block_threads, 0, stream>>>(
thrust::cuda_cub::launcher::triple_chevron(
scan_grid_size, reduce_by_key_config.block_threads, 0,
stream
).doit(reduce_by_key_kernel,
d_keys_in,
d_unique_out,
d_values_in,
Expand Down Expand Up @@ -513,11 +523,7 @@ struct DispatchReduceByKey
{
// Get PTX version
int ptx_version = 0;
#if (CUB_PTX_ARCH == 0)
if (CubDebug(error = PtxVersion(ptx_version))) break;
#else
ptx_version = CUB_PTX_ARCH;
#endif

// Get kernel dispatch configurations
KernelConfig reduce_by_key_config;
Expand Down
Loading

0 comments on commit 35e4f69

Please sign in to comment.