From 1caaac18483cfae068ad8e4aed1780a27c6be0b9 Mon Sep 17 00:00:00 2001 From: David Olsen Date: Mon, 6 Apr 2020 15:32:17 -0700 Subject: [PATCH] Changes necessary to support Feta. Reviewed-by: Bryce Adelstein Lelbach aka wash --- cub/device/device_reduce.cuh | 4 +- cub/device/dispatch/dispatch_histogram.cuh | 116 ++++++------ cub/device/dispatch/dispatch_radix_sort.cuh | 25 ++- cub/device/dispatch/dispatch_reduce.cuh | 20 +- .../dispatch/dispatch_reduce_by_key.cuh | 76 ++++---- cub/device/dispatch/dispatch_rle.cuh | 78 ++++---- cub/device/dispatch/dispatch_scan.cuh | 81 +++++---- cub/device/dispatch/dispatch_select_if.cuh | 76 ++++---- cub/device/dispatch/dispatch_spmv_orig.cuh | 138 ++++++++------ cub/iterator/tex_obj_input_iterator.cuh | 42 +++-- cub/iterator/tex_ref_input_iterator.cuh | 20 +- cub/util_arch.cuh | 27 ++- cub/util_debug.cuh | 24 ++- cub/util_device.cuh | 171 ++++++++++-------- cub/util_ptx.cuh | 24 --- 15 files changed, 528 insertions(+), 394 deletions(-) diff --git a/cub/device/device_reduce.cuh b/cub/device/device_reduce.cuh index 13c7a72d1a..a3b6ca1497 100644 --- a/cub/device/device_reduce.cuh +++ b/cub/device/device_reduce.cuh @@ -150,14 +150,14 @@ struct DeviceReduce size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items OutputIteratorT d_out, ///< [out] Pointer to the output aggregate - int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + typename thrust::iterator_traits::difference_type num_items, ///< [in] Total number of input items (i.e., length of \p d_in) ReductionOpT reduction_op, ///< [in] Binary reduction functor T init, ///< [in] Initial value of the reduction cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. 
bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. { // Signed integer type for global offsets - typedef int OffsetT; + typedef typename thrust::iterator_traits::difference_type OffsetT; return DispatchReduce::Dispatch( d_temp_storage, diff --git a/cub/device/dispatch/dispatch_histogram.cuh b/cub/device/dispatch/dispatch_histogram.cuh index 059c881228..76461935a9 100644 --- a/cub/device/dispatch/dispatch_histogram.cuh +++ b/cub/device/dispatch/dispatch_histogram.cuh @@ -45,6 +45,8 @@ #include "../../grid/grid_queue.cuh" #include "../../util_namespace.cuh" +#include + /// Optional outer namespace(s) CUB_NS_PREFIX @@ -455,41 +457,46 @@ struct DipatchHistogram int ptx_version, KernelConfig &histogram_sweep_config) { - #if (CUB_PTX_ARCH > 0) - - // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy - return histogram_sweep_config.template Init(); - - #else - - // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version - if (ptx_version >= 500) - { - return histogram_sweep_config.template Init(); - } - else if (ptx_version >= 350) - { - return histogram_sweep_config.template Init(); - } - else if (ptx_version >= 300) - { - return histogram_sweep_config.template Init(); - } - else if (ptx_version >= 200) + cudaError_t result = cudaErrorNotSupported; + if (CUB_IS_DEVICE_CODE) { - return histogram_sweep_config.template Init(); - } - else if (ptx_version >= 110) - { - return histogram_sweep_config.template Init(); + #if CUB_INCLUDE_DEVICE_CODE + // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy + result = histogram_sweep_config.template Init(); + #endif } else { - // No global atomic support - return cudaErrorNotSupported; + #if 
CUB_INCLUDE_HOST_CODE + // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version + if (ptx_version >= 500) + { + result = histogram_sweep_config.template Init(); + } + else if (ptx_version >= 350) + { + result = histogram_sweep_config.template Init(); + } + else if (ptx_version >= 300) + { + result = histogram_sweep_config.template Init(); + } + else if (ptx_version >= 200) + { + result = histogram_sweep_config.template Init(); + } + else if (ptx_version >= 110) + { + result = histogram_sweep_config.template Init(); + } + else + { + // No global atomic support + result = cudaErrorNotSupported; + } + #endif } - - #endif + return result; } @@ -654,7 +661,10 @@ struct DipatchHistogram histogram_init_grid_dims, histogram_init_block_threads, (long long) stream); // Invoke histogram_init_kernel - histogram_init_kernel<<>>( + thrust::cuda_cub::launcher::triple_chevron( + histogram_init_grid_dims, histogram_init_block_threads, 0, + stream + ).doit(histogram_init_kernel, num_output_bins_wrapper, d_output_histograms_wrapper, tile_queue); @@ -669,7 +679,9 @@ struct DipatchHistogram histogram_sweep_config.block_threads, (long long) stream, histogram_sweep_config.pixels_per_thread, histogram_sweep_sm_occupancy); // Invoke histogram_sweep_kernel - histogram_sweep_kernel<<>>( + thrust::cuda_cub::launcher::triple_chevron( + sweep_grid_dims, histogram_sweep_config.block_threads, 0, stream + ).doit(histogram_sweep_kernel, d_samples, num_output_bins_wrapper, num_privatized_bins_wrapper, @@ -722,11 +734,11 @@ struct DipatchHistogram { // Get PTX version int ptx_version = 0; - #if (CUB_PTX_ARCH == 0) - if (CubDebug(error = PtxVersion(ptx_version))) break; - #else - ptx_version = CUB_PTX_ARCH; - #endif + if (CUB_IS_HOST_CODE) { + if (CubDebug(error = PtxVersion(ptx_version))) break; + } else { + ptx_version = CUB_PTX_ARCH; + } // Get kernel dispatch configurations KernelConfig histogram_sweep_config; @@ 
-830,11 +842,11 @@ struct DipatchHistogram { // Get PTX version int ptx_version = 0; - #if (CUB_PTX_ARCH == 0) - if (CubDebug(error = PtxVersion(ptx_version))) break; - #else - ptx_version = CUB_PTX_ARCH; - #endif + if (CUB_IS_HOST_CODE) { + if (CubDebug(error = PtxVersion(ptx_version))) break; + } else { + ptx_version = CUB_PTX_ARCH; + } // Get kernel dispatch configurations KernelConfig histogram_sweep_config; @@ -913,11 +925,11 @@ struct DipatchHistogram { // Get PTX version int ptx_version = 0; - #if (CUB_PTX_ARCH == 0) - if (CubDebug(error = PtxVersion(ptx_version))) break; - #else - ptx_version = CUB_PTX_ARCH; - #endif + if (CUB_IS_HOST_CODE) { + if (CubDebug(error = PtxVersion(ptx_version))) break; + } else { + ptx_version = CUB_PTX_ARCH; + } // Get kernel dispatch configurations KernelConfig histogram_sweep_config; @@ -1025,11 +1037,11 @@ struct DipatchHistogram { // Get PTX version int ptx_version = 0; - #if (CUB_PTX_ARCH == 0) - if (CubDebug(error = PtxVersion(ptx_version))) break; - #else - ptx_version = CUB_PTX_ARCH; - #endif + if (CUB_IS_HOST_CODE) { + if (CubDebug(error = PtxVersion(ptx_version))) break; + } else { + ptx_version = CUB_PTX_ARCH; + } // Get kernel dispatch configurations KernelConfig histogram_sweep_config; diff --git a/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/device/dispatch/dispatch_radix_sort.cuh index 60ba9fe8ff..cb6ceebbd9 100644 --- a/cub/device/dispatch/dispatch_radix_sort.cuh +++ b/cub/device/dispatch/dispatch_radix_sort.cuh @@ -47,6 +47,8 @@ #include "../../util_device.cuh" #include "../../util_namespace.cuh" +#include + /// Optional outer namespace(s) CUB_NS_PREFIX @@ -921,7 +923,9 @@ struct DispatchRadixSort : ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD, 1, begin_bit, ActivePolicyT::SingleTilePolicy::RADIX_BITS); // Invoke upsweep_kernel with same grid size as downsweep_kernel - single_tile_kernel<<<1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream>>>( + thrust::cuda_cub::launcher::triple_chevron( + 
1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream + ).doit(single_tile_kernel, d_keys.Current(), d_keys.Alternate(), d_values.Current(), @@ -979,7 +983,10 @@ struct DispatchRadixSort : pass_config.upsweep_config.items_per_thread, pass_config.upsweep_config.sm_occupancy, current_bit, pass_bits); // Invoke upsweep_kernel with same grid size as downsweep_kernel - pass_config.upsweep_kernel<<>>( + thrust::cuda_cub::launcher::triple_chevron( + pass_config.even_share.grid_size, + pass_config.upsweep_config.block_threads, 0, stream + ).doit(pass_config.upsweep_kernel, d_keys_in, d_spine, num_items, @@ -998,7 +1005,9 @@ struct DispatchRadixSort : 1, pass_config.scan_config.block_threads, (long long) stream, pass_config.scan_config.items_per_thread); // Invoke scan_kernel - pass_config.scan_kernel<<<1, pass_config.scan_config.block_threads, 0, stream>>>( + thrust::cuda_cub::launcher::triple_chevron( + 1, pass_config.scan_config.block_threads, 0, stream + ).doit(pass_config.scan_kernel, d_spine, spine_length); @@ -1014,7 +1023,10 @@ struct DispatchRadixSort : pass_config.downsweep_config.items_per_thread, pass_config.downsweep_config.sm_occupancy); // Invoke downsweep_kernel - pass_config.downsweep_kernel<<>>( + thrust::cuda_cub::launcher::triple_chevron( + pass_config.even_share.grid_size, + pass_config.downsweep_config.block_threads, 0, stream + ).doit(pass_config.downsweep_kernel, d_keys_in, d_keys_out, d_values_in, @@ -1418,7 +1430,10 @@ struct DispatchSegmentedRadixSort : num_segments, pass_config.segmented_config.block_threads, (long long) stream, pass_config.segmented_config.items_per_thread, pass_config.segmented_config.sm_occupancy, current_bit, pass_bits); - pass_config.segmented_kernel<<>>( + thrust::cuda_cub::launcher::triple_chevron( + num_segments, pass_config.segmented_config.block_threads, 0, + stream + ).doit(pass_config.segmented_kernel, d_keys_in, d_keys_out, d_values_in, d_values_out, d_begin_offsets, d_end_offsets, num_segments, diff --git 
a/cub/device/dispatch/dispatch_reduce.cuh b/cub/device/dispatch/dispatch_reduce.cuh index 14557c6087..1cb8dbdd1c 100644 --- a/cub/device/dispatch/dispatch_reduce.cuh +++ b/cub/device/dispatch/dispatch_reduce.cuh @@ -46,6 +46,8 @@ #include "../../util_device.cuh" #include "../../util_namespace.cuh" +#include + /// Optional outer namespace(s) CUB_NS_PREFIX @@ -449,7 +451,9 @@ struct DispatchReduce : ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD); // Invoke single_reduce_sweep_kernel - single_tile_kernel<<<1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream>>>( + thrust::cuda_cub::launcher::triple_chevron( + 1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream + ).doit(single_tile_kernel, d_in, d_out, num_items, @@ -543,7 +547,10 @@ struct DispatchReduce : reduce_config.sm_occupancy); // Invoke DeviceReduceKernel - reduce_kernel<<>>( + thrust::cuda_cub::launcher::triple_chevron( + reduce_grid_size, ActivePolicyT::ReducePolicy::BLOCK_THREADS, + 0, stream + ).doit(reduce_kernel, d_in, d_block_reductions, num_items, @@ -563,7 +570,9 @@ struct DispatchReduce : ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD); // Invoke DeviceReduceSingleTileKernel - single_tile_kernel<<<1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream>>>( + thrust::cuda_cub::launcher::triple_chevron( + 1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream + ).doit(single_tile_kernel, d_block_reductions, d_out, reduce_grid_size, @@ -776,7 +785,10 @@ struct DispatchSegmentedReduce : segmented_reduce_config.sm_occupancy); // Invoke DeviceReduceKernel - segmented_reduce_kernel<<>>( + thrust::cuda_cub::launcher::triple_chevron( + num_segments, ActivePolicyT::SegmentedReducePolicy::BLOCK_THREADS, + 0, stream + ).doit(segmented_reduce_kernel, d_in, d_out, d_begin_offsets, diff --git a/cub/device/dispatch/dispatch_reduce_by_key.cuh b/cub/device/dispatch/dispatch_reduce_by_key.cuh index a494925c7c..6d0b425e9f 100644 --- a/cub/device/dispatch/dispatch_reduce_by_key.cuh +++ 
b/cub/device/dispatch/dispatch_reduce_by_key.cuh @@ -44,6 +44,8 @@ #include "../../util_device.cuh" #include "../../util_namespace.cuh" +#include + /// Optional outer namespace(s) CUB_NS_PREFIX @@ -284,37 +286,40 @@ struct DispatchReduceByKey int ptx_version, KernelConfig &reduce_by_key_config) { - #if (CUB_PTX_ARCH > 0) - (void)ptx_version; - - // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy - reduce_by_key_config.template Init(); - - #else - - // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version - if (ptx_version >= 350) - { - reduce_by_key_config.template Init(); - } - else if (ptx_version >= 300) + if (CUB_IS_DEVICE_CODE) { - reduce_by_key_config.template Init(); - } - else if (ptx_version >= 200) - { - reduce_by_key_config.template Init(); - } - else if (ptx_version >= 130) - { - reduce_by_key_config.template Init(); + #if CUB_INCLUDE_DEVICE_CODE + (void)ptx_version; + // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy + reduce_by_key_config.template Init(); + #endif } else { - reduce_by_key_config.template Init(); + #if CUB_INCLUDE_HOST_CODE + // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version + if (ptx_version >= 350) + { + reduce_by_key_config.template Init(); + } + else if (ptx_version >= 300) + { + reduce_by_key_config.template Init(); + } + else if (ptx_version >= 200) + { + reduce_by_key_config.template Init(); + } + else if (ptx_version >= 130) + { + reduce_by_key_config.template Init(); + } + else + { + reduce_by_key_config.template Init(); + } + #endif } - - #endif } @@ -428,7 +433,9 @@ struct DispatchReduceByKey if (debug_synchronous) _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); // Invoke init_kernel to initialize 
tile descriptors - init_kernel<<>>( + thrust::cuda_cub::launcher::triple_chevron( + init_grid_size, INIT_KERNEL_THREADS, 0, stream + ).doit(init_kernel, tile_state, num_tiles, d_num_runs_out); @@ -463,7 +470,10 @@ struct DispatchReduceByKey start_tile, scan_grid_size, reduce_by_key_config.block_threads, (long long) stream, reduce_by_key_config.items_per_thread, reduce_by_key_sm_occupancy); // Invoke reduce_by_key_kernel - reduce_by_key_kernel<<>>( + thrust::cuda_cub::launcher::triple_chevron( + scan_grid_size, reduce_by_key_config.block_threads, 0, + stream + ).doit(reduce_by_key_kernel, d_keys_in, d_unique_out, d_values_in, @@ -513,11 +523,11 @@ struct DispatchReduceByKey { // Get PTX version int ptx_version = 0; - #if (CUB_PTX_ARCH == 0) - if (CubDebug(error = PtxVersion(ptx_version))) break; - #else - ptx_version = CUB_PTX_ARCH; - #endif + if (CUB_IS_HOST_CODE) { + if (CubDebug(error = PtxVersion(ptx_version))) break; + } else { + ptx_version = CUB_PTX_ARCH; + } // Get kernel kernel dispatch configurations KernelConfig reduce_by_key_config; diff --git a/cub/device/dispatch/dispatch_rle.cuh b/cub/device/dispatch/dispatch_rle.cuh index 0a094f6c6a..16cddef6ce 100644 --- a/cub/device/dispatch/dispatch_rle.cuh +++ b/cub/device/dispatch/dispatch_rle.cuh @@ -44,6 +44,8 @@ #include "../../util_device.cuh" #include "../../util_namespace.cuh" +#include + /// Optional outer namespace(s) CUB_NS_PREFIX @@ -274,36 +276,36 @@ struct DeviceRleDispatch int ptx_version, KernelConfig& device_rle_config) { - #if (CUB_PTX_ARCH > 0) - - // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy - device_rle_config.template Init(); - - #else - - // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version - if (ptx_version >= 350) - { - device_rle_config.template Init(); - } - else if (ptx_version >= 300) - { - device_rle_config.template Init(); + if
(CUB_IS_DEVICE_CODE) { + #if CUB_INCLUDE_DEVICE_CODE + // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy + device_rle_config.template Init(); + #endif + } else { + #if CUB_INCLUDE_HOST_CODE + // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version + if (ptx_version >= 350) + { + device_rle_config.template Init(); + } + else if (ptx_version >= 300) + { + device_rle_config.template Init(); + } + else if (ptx_version >= 200) + { + device_rle_config.template Init(); + } + else if (ptx_version >= 130) + { + device_rle_config.template Init(); + } + else + { + device_rle_config.template Init(); + } + #endif } - else if (ptx_version >= 200) - { - device_rle_config.template Init(); - } - else if (ptx_version >= 130) - { - device_rle_config.template Init(); - } - else - { - device_rle_config.template Init(); - } - - #endif } @@ -415,7 +417,9 @@ struct DeviceRleDispatch if (debug_synchronous) _CubLog("Invoking device_scan_init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); // Invoke device_scan_init_kernel to initialize tile descriptors and queue descriptors - device_scan_init_kernel<<>>( + thrust::cuda_cub::launcher::triple_chevron( + init_grid_size, INIT_KERNEL_THREADS, 0, stream + ).doit(device_scan_init_kernel, tile_status, num_tiles, d_num_runs_out); @@ -452,7 +456,9 @@ struct DeviceRleDispatch scan_grid_size.x, scan_grid_size.y, scan_grid_size.z, device_rle_config.block_threads, (long long) stream, device_rle_config.items_per_thread, device_rle_kernel_sm_occupancy); // Invoke device_rle_sweep_kernel - device_rle_sweep_kernel<<>>( + thrust::cuda_cub::launcher::triple_chevron( + scan_grid_size, device_rle_config.block_threads, 0, stream + ).doit(device_rle_sweep_kernel, d_in, d_offsets_out, d_lengths_out, @@ -498,11 +504,11 @@ struct DeviceRleDispatch { // Get PTX version int ptx_version = 0; - #if 
(CUB_PTX_ARCH == 0) - if (CubDebug(error = PtxVersion(ptx_version))) break; - #else - ptx_version = CUB_PTX_ARCH; - #endif + if (CUB_IS_HOST_CODE) { + if (CubDebug(error = PtxVersion(ptx_version))) break; + } else { + ptx_version = CUB_PTX_ARCH; + } // Get kernel kernel dispatch configurations KernelConfig device_rle_config; diff --git a/cub/device/dispatch/dispatch_scan.cuh b/cub/device/dispatch/dispatch_scan.cuh index dc5bbaadcb..d86d98257e 100644 --- a/cub/device/dispatch/dispatch_scan.cuh +++ b/cub/device/dispatch/dispatch_scan.cuh @@ -45,6 +45,8 @@ #include "../../util_device.cuh" #include "../../util_namespace.cuh" +#include + /// Optional outer namespace(s) CUB_NS_PREFIX @@ -311,45 +313,48 @@ struct DispatchScan int ptx_version, KernelConfig &scan_kernel_config) { - #if (CUB_PTX_ARCH > 0) - (void)ptx_version; - - // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy - scan_kernel_config.template Init(); - - #else - - // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version - if (ptx_version >= 600) - { - scan_kernel_config.template Init(); - } - else if (ptx_version >= 520) - { - scan_kernel_config.template Init(); - } - else if (ptx_version >= 350) + if (CUB_IS_DEVICE_CODE) { - scan_kernel_config.template Init(); - } - else if (ptx_version >= 300) - { - scan_kernel_config.template Init(); - } - else if (ptx_version >= 200) - { - scan_kernel_config.template Init(); - } - else if (ptx_version >= 130) - { - scan_kernel_config.template Init(); + #if CUB_INCLUDE_DEVICE_CODE + (void)ptx_version; + // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy + scan_kernel_config.template Init(); + #endif } else { - scan_kernel_config.template Init(); + #if CUB_INCLUDE_HOST_CODE + // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's 
PTX version + if (ptx_version >= 600) + { + scan_kernel_config.template Init(); + } + else if (ptx_version >= 520) + { + scan_kernel_config.template Init(); + } + else if (ptx_version >= 350) + { + scan_kernel_config.template Init(); + } + else if (ptx_version >= 300) + { + scan_kernel_config.template Init(); + } + else if (ptx_version >= 200) + { + scan_kernel_config.template Init(); + } + else if (ptx_version >= 130) + { + scan_kernel_config.template Init(); + } + else + { + scan_kernel_config.template Init(); + } + #endif } - - #endif } @@ -460,7 +465,9 @@ struct DispatchScan if (debug_synchronous) _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); // Invoke init_kernel to initialize tile descriptors - init_kernel<<>>( + thrust::cuda_cub::launcher::triple_chevron( + init_grid_size, INIT_KERNEL_THREADS, 0, stream + ).doit(init_kernel, tile_state, num_tiles); @@ -490,7 +497,9 @@ struct DispatchScan start_tile, scan_grid_size, scan_kernel_config.block_threads, (long long) stream, scan_kernel_config.items_per_thread, scan_sm_occupancy); // Invoke scan_kernel - scan_kernel<<>>( + thrust::cuda_cub::launcher::triple_chevron( + scan_grid_size, scan_kernel_config.block_threads, 0, stream + ).doit(scan_kernel, d_in, d_out, tile_state, diff --git a/cub/device/dispatch/dispatch_select_if.cuh b/cub/device/dispatch/dispatch_select_if.cuh index 0fd622a35e..f42eed804d 100644 --- a/cub/device/dispatch/dispatch_select_if.cuh +++ b/cub/device/dispatch/dispatch_select_if.cuh @@ -44,6 +44,8 @@ #include "../../util_device.cuh" #include "../../util_namespace.cuh" +#include + /// Optional outer namespace(s) CUB_NS_PREFIX @@ -275,37 +277,39 @@ struct DispatchSelectIf int ptx_version, KernelConfig &select_if_config) { - #if (CUB_PTX_ARCH > 0) - (void)ptx_version; - - // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy - select_if_config.template Init(); - - #else - - // We're 
on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version - if (ptx_version >= 350) - { - select_if_config.template Init(); - } - else if (ptx_version >= 300) - { - select_if_config.template Init(); - } - else if (ptx_version >= 200) - { - select_if_config.template Init(); - } - else if (ptx_version >= 130) - { - select_if_config.template Init(); + if (CUB_IS_DEVICE_CODE) { + #if CUB_INCLUDE_DEVICE_CODE + (void)ptx_version; + // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy + select_if_config.template Init(); + #endif } else { - select_if_config.template Init(); + #if CUB_INCLUDE_HOST_CODE + // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version + if (ptx_version >= 350) + { + select_if_config.template Init(); + } + else if (ptx_version >= 300) + { + select_if_config.template Init(); + } + else if (ptx_version >= 200) + { + select_if_config.template Init(); + } + else if (ptx_version >= 130) + { + select_if_config.template Init(); + } + else + { + select_if_config.template Init(); + } + #endif } - - #endif } @@ -417,7 +421,9 @@ struct DispatchSelectIf if (debug_synchronous) _CubLog("Invoking scan_init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); // Invoke scan_init_kernel to initialize tile descriptors - scan_init_kernel<<>>( + thrust::cuda_cub::launcher::triple_chevron( + init_grid_size, INIT_KERNEL_THREADS, 0, stream + ).doit(scan_init_kernel, tile_status, num_tiles, d_num_selected_out); @@ -454,7 +460,9 @@ struct DispatchSelectIf scan_grid_size.x, scan_grid_size.y, scan_grid_size.z, select_if_config.block_threads, (long long) stream, select_if_config.items_per_thread, range_select_sm_occupancy); // Invoke select_if_kernel - select_if_kernel<<>>( + thrust::cuda_cub::launcher::triple_chevron( + scan_grid_size, 
select_if_config.block_threads, 0, stream + ).doit(select_if_kernel, d_in, d_flags, d_selected_out, @@ -501,11 +509,11 @@ struct DispatchSelectIf { // Get PTX version int ptx_version = 0; - #if (CUB_PTX_ARCH == 0) - if (CubDebug(error = PtxVersion(ptx_version))) break; - #else - ptx_version = CUB_PTX_ARCH; - #endif + if (CUB_IS_HOST_CODE) { + if (CubDebug(error = PtxVersion(ptx_version))) break; + } else { + ptx_version = CUB_PTX_ARCH; + } // Get kernel kernel dispatch configurations KernelConfig select_if_config; diff --git a/cub/device/dispatch/dispatch_spmv_orig.cuh b/cub/device/dispatch/dispatch_spmv_orig.cuh index b43190980d..85acde4c18 100644 --- a/cub/device/dispatch/dispatch_spmv_orig.cuh +++ b/cub/device/dispatch/dispatch_spmv_orig.cuh @@ -47,6 +47,8 @@ #include "../../grid/grid_queue.cuh" #include "../../util_namespace.cuh" +#include + /// Optional outer namespace(s) CUB_NS_PREFIX @@ -488,53 +490,55 @@ struct DispatchSpmv KernelConfig &spmv_config, KernelConfig &segment_fixup_config) { - #if (CUB_PTX_ARCH > 0) - - // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy - spmv_config.template Init(); - segment_fixup_config.template Init(); - - #else - - // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version - if (ptx_version >= 600) - { - spmv_config.template Init(); - segment_fixup_config.template Init(); - } - else if (ptx_version >= 500) - { - spmv_config.template Init(); - segment_fixup_config.template Init(); - } - else if (ptx_version >= 370) - { - spmv_config.template Init(); - segment_fixup_config.template Init(); - } - else if (ptx_version >= 350) + if (CUB_IS_DEVICE_CODE) { - spmv_config.template Init(); - segment_fixup_config.template Init(); - } - else if (ptx_version >= 300) - { - spmv_config.template Init(); - segment_fixup_config.template Init(); - - } - else if (ptx_version >= 200) - { - spmv_config.template Init(); 
- segment_fixup_config.template Init(); + #if CUB_INCLUDE_DEVICE_CODE + // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy + spmv_config.template Init(); + segment_fixup_config.template Init(); + #endif } else { - spmv_config.template Init(); - segment_fixup_config.template Init(); + #if CUB_INCLUDE_HOST_CODE + // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version + if (ptx_version >= 600) + { + spmv_config.template Init(); + segment_fixup_config.template Init(); + } + else if (ptx_version >= 500) + { + spmv_config.template Init(); + segment_fixup_config.template Init(); + } + else if (ptx_version >= 370) + { + spmv_config.template Init(); + segment_fixup_config.template Init(); + } + else if (ptx_version >= 350) + { + spmv_config.template Init(); + segment_fixup_config.template Init(); + } + else if (ptx_version >= 300) + { + spmv_config.template Init(); + segment_fixup_config.template Init(); + } + else if (ptx_version >= 200) + { + spmv_config.template Init(); + segment_fixup_config.template Init(); + } + else + { + spmv_config.template Init(); + segment_fixup_config.template Init(); + } + #endif } - - #endif } @@ -614,7 +618,10 @@ struct DispatchSpmv degen_col_kernel_grid_size, degen_col_kernel_block_size, (long long) stream); // Invoke spmv_search_kernel - spmv_1col_kernel<<>>( + thrust::cuda_cub::launcher::triple_chevron( + degen_col_kernel_grid_size, degen_col_kernel_block_size, 0, + stream + ).doit(spmv_1col_kernel, spmv_params); // Check for failure to launch @@ -700,10 +707,13 @@ struct DispatchSpmv int search_block_size = INIT_KERNEL_THREADS; int search_grid_size = (num_merge_tiles + 1 + search_block_size - 1) / search_block_size; -#if (CUB_PTX_ARCH == 0) - // Init textures - if (CubDebug(error = spmv_params.t_vector_x.BindTexture(spmv_params.d_vector_x))) break; -#endif + #if CUB_INCLUDE_HOST_CODE + if (CUB_IS_HOST_CODE) + 
{ + // Init textures + if (CubDebug(error = spmv_params.t_vector_x.BindTexture(spmv_params.d_vector_x))) break; + } + #endif if (search_grid_size < sm_count) // if (num_merge_tiles < spmv_sm_occupancy * sm_count) @@ -720,7 +730,9 @@ struct DispatchSpmv search_grid_size, search_block_size, (long long) stream); // Invoke spmv_search_kernel - spmv_search_kernel<<>>( + thrust::cuda_cub::launcher::triple_chevron( + search_grid_size, search_block_size, 0, stream + ).doit(spmv_search_kernel, num_merge_tiles, d_tile_coordinates, spmv_params); @@ -737,7 +749,9 @@ struct DispatchSpmv spmv_grid_size.x, spmv_grid_size.y, spmv_grid_size.z, spmv_config.block_threads, (long long) stream, spmv_config.items_per_thread, spmv_sm_occupancy); // Invoke spmv_kernel - spmv_kernel<<>>( + thrust::cuda_cub::launcher::triple_chevron( + spmv_grid_size, spmv_config.block_threads, 0, stream + ).doit(spmv_kernel, spmv_params, d_tile_coordinates, d_tile_carry_pairs, @@ -759,7 +773,10 @@ struct DispatchSpmv segment_fixup_grid_size.x, segment_fixup_grid_size.y, segment_fixup_grid_size.z, segment_fixup_config.block_threads, (long long) stream, segment_fixup_config.items_per_thread, segment_fixup_sm_occupancy); // Invoke segment_fixup_kernel - segment_fixup_kernel<<>>( + thrust::cuda_cub::launcher::triple_chevron( + segment_fixup_grid_size, segment_fixup_config.block_threads, + 0, stream + ).doit(segment_fixup_kernel, d_tile_carry_pairs, spmv_params.d_vector_y, num_merge_tiles, @@ -773,10 +790,13 @@ struct DispatchSpmv if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; } -#if (CUB_PTX_ARCH == 0) - // Free textures - if (CubDebug(error = spmv_params.t_vector_x.UnbindTexture())) break; -#endif + #if CUB_INCLUDE_HOST_CODE + if (CUB_IS_HOST_CODE) + { + // Free textures + if (CubDebug(error = spmv_params.t_vector_x.UnbindTexture())) break; + } + #endif } while (0); @@ -802,11 +822,11 @@ struct DispatchSpmv { // Get PTX version int ptx_version = 0; - #if (CUB_PTX_ARCH == 0) - if 
(CubDebug(error = PtxVersion(ptx_version))) break; - #else - ptx_version = CUB_PTX_ARCH; - #endif + if (CUB_IS_HOST_CODE) { + if (CubDebug(error = PtxVersion(ptx_version))) break; + } else { + ptx_version = CUB_PTX_ARCH; + } // Get kernel kernel dispatch configurations KernelConfig spmv_config, segment_fixup_config; diff --git a/cub/iterator/tex_obj_input_iterator.cuh b/cub/iterator/tex_obj_input_iterator.cuh index aed3c7e779..d7da4b6925 100644 --- a/cub/iterator/tex_obj_input_iterator.cuh +++ b/cub/iterator/tex_obj_input_iterator.cuh @@ -204,24 +204,32 @@ public: /// Indirection __host__ __device__ __forceinline__ reference operator*() const { -#if (CUB_PTX_ARCH == 0) - // Simply dereference the pointer on the host - return ptr[tex_offset]; -#else - // Move array of uninitialized words, then alias and assign to return value - TextureWord words[TEXTURE_MULTIPLE]; - - #pragma unroll - for (int i = 0; i < TEXTURE_MULTIPLE; ++i) - { - words[i] = tex1Dfetch( - tex_obj, - (tex_offset * TEXTURE_MULTIPLE) + i); + if (CUB_IS_HOST_CODE) { + #if CUB_INCLUDE_HOST_CODE + // Simply dereference the pointer on the host + return ptr[tex_offset]; + #endif + } else { + #if CUB_INCLUDE_DEVICE_CODE + // Move array of uninitialized words, then alias and assign to return value + TextureWord words[TEXTURE_MULTIPLE]; + + #pragma unroll + for (int i = 0; i < TEXTURE_MULTIPLE; ++i) + { + words[i] = tex1Dfetch( + tex_obj, + (tex_offset * TEXTURE_MULTIPLE) + i); + } + + // Load from words + return *reinterpret_cast(words); + #else + // This is dead code which will never be executed. It is here + // only to avoid warnings about missing return statements. 
+ return *ptr; + #endif } - - // Load from words - return *reinterpret_cast(words); -#endif } /// Addition diff --git a/cub/iterator/tex_ref_input_iterator.cuh b/cub/iterator/tex_ref_input_iterator.cuh index 1479e41a9d..9e0c6204a4 100644 --- a/cub/iterator/tex_ref_input_iterator.cuh +++ b/cub/iterator/tex_ref_input_iterator.cuh @@ -279,13 +279,19 @@ public: /// Indirection __host__ __device__ __forceinline__ reference operator*() const { -#if (CUB_PTX_ARCH == 0) - // Simply dereference the pointer on the host - return ptr[tex_offset]; -#else - // Use the texture reference - return TexId::Fetch(tex_offset); -#endif + if (CUB_IS_HOST_CODE) { + // Simply dereference the pointer on the host + return ptr[tex_offset]; + } else { + #if CUB_INCLUDE_DEVICE_CODE + // Use the texture reference + return TexId::Fetch(tex_offset); + #else + // This is dead code that will never be executed. It is here + // only to avoid warnings about missing returns. + return *ptr; + #endif + } } /// Addition diff --git a/cub/util_arch.cuh b/cub/util_arch.cuh index fb2df4fde2..d250457305 100644 --- a/cub/util_arch.cuh +++ b/cub/util_arch.cuh @@ -44,19 +44,42 @@ namespace cub { #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document -#if (__CUDACC_VER_MAJOR__ >= 9) && !defined(CUB_USE_COOPERATIVE_GROUPS) +#if ((__CUDACC_VER_MAJOR__ >= 9) || defined(__PGI_CUDA__)) && \ + !defined(CUB_USE_COOPERATIVE_GROUPS) #define CUB_USE_COOPERATIVE_GROUPS #endif /// CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host pass). #ifndef CUB_PTX_ARCH - #ifndef __CUDA_ARCH__ + #if defined(__PGI_CUDA__) + // __PGI_CUDA_ARCH__ is the minimum supported SM target. It is defined + // with the same value when compiling both host code and device code. 
+ #define CUB_PTX_ARCH __PGI_CUDA_ARCH__ + #elif !defined(__CUDA_ARCH__) #define CUB_PTX_ARCH 0 #else #define CUB_PTX_ARCH __CUDA_ARCH__ #endif #endif +#ifndef CUB_IS_DEVICE_CODE + #if defined(__PGI_CUDA__) + #define CUB_IS_DEVICE_CODE __builtin_is_device_code() + #define CUB_IS_HOST_CODE (!__builtin_is_device_code()) + #define CUB_INCLUDE_DEVICE_CODE 1 + #define CUB_INCLUDE_HOST_CODE 1 + #elif CUB_PTX_ARCH > 0 + #define CUB_IS_DEVICE_CODE 1 + #define CUB_IS_HOST_CODE 0 + #define CUB_INCLUDE_DEVICE_CODE 1 + #define CUB_INCLUDE_HOST_CODE 0 + #else + #define CUB_IS_DEVICE_CODE 0 + #define CUB_IS_HOST_CODE 1 + #define CUB_INCLUDE_DEVICE_CODE 0 + #define CUB_INCLUDE_HOST_CODE 1 + #endif +#endif /// Maximum number of devices supported. #ifndef CUB_MAX_DEVICES diff --git a/cub/util_debug.cuh b/cub/util_debug.cuh index acbc1df10e..c79b4f0953 100644 --- a/cub/util_debug.cuh +++ b/cub/util_debug.cuh @@ -82,12 +82,16 @@ __host__ __device__ __forceinline__ cudaError_t Debug( #ifdef CUB_STDERR if (error) { - #if (CUB_PTX_ARCH == 0) - fprintf(stderr, "CUDA error %d [%s, %d]: %s\n", error, filename, line, cudaGetErrorString(error)); - fflush(stderr); - #elif (CUB_PTX_ARCH >= 200) - printf("CUDA error %d [block (%d,%d,%d) thread (%d,%d,%d), %s, %d]\n", error, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, filename, line); - #endif + if (CUB_IS_HOST_CODE) { + #if CUB_INCLUDE_HOST_CODE + fprintf(stderr, "CUDA error %d [%s, %d]: %s\n", error, filename, line, cudaGetErrorString(error)); + fflush(stderr); + #endif + } else { + #if CUB_INCLUDE_DEVICE_CODE + printf("CUDA error %d [block (%d,%d,%d) thread (%d,%d,%d), %s, %d]\n", error, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, filename, line); + #endif + } } #endif return error; @@ -114,7 +118,13 @@ __host__ __device__ __forceinline__ cudaError_t Debug( * \brief Log macro for printf statements. 
*/ #if !defined(_CubLog) - #if !(defined(__clang__) && defined(__CUDA__)) + #if defined(__PGI_CUDA__) + #define _CubLog(format, ...) (__builtin_is_device_code() \ + ? printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, \ + blockIdx.z, blockIdx.y, blockIdx.x, \ + threadIdx.z, threadIdx.y, threadIdx.x, __VA_ARGS__) \ + : printf(format,__VA_ARGS__)); + #elif !(defined(__clang__) && defined(__CUDA__)) #if (CUB_PTX_ARCH == 0) #define _CubLog(format, ...) printf(format,__VA_ARGS__); #elif (CUB_PTX_ARCH >= 200) diff --git a/cub/util_device.cuh b/cub/util_device.cuh index 82b50312a5..b318a91399 100644 --- a/cub/util_device.cuh +++ b/cub/util_device.cuh @@ -208,40 +208,40 @@ CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t PtxVersionUncached(int &ptx_ver (void)(empty_kernel); -#if (CUB_PTX_ARCH == 0) // Host code. - - cudaError_t error = cudaSuccess; - cudaFuncAttributes empty_kernel_attrs; - - do { - // We do not `CubDebug` here because failure is not a hard error. - // We may be querying a device that we do not have code for but - // never use. - if (error = cudaFuncGetAttributes(&empty_kernel_attrs, empty_kernel)) { - // Clear the global CUDA error state which may have been set by - // the last call. Otherwise, errors may "leak" to unrelated - // kernel launches. - cudaGetLastError(); - break; - } - } - while(0); - - ptx_version = empty_kernel_attrs.ptxVersion * 10; - - return error; - -#else // Device code. - - // The `reinterpret_cast` is necessary to suppress a set-but-unused warnings. - // This is a meme now: https://twitter.com/blelbach/status/1222391615576100864 - (void)reinterpret_cast(empty_kernel); + cudaError_t result; + if (CUB_IS_HOST_CODE) { + #if CUB_INCLUDE_HOST_CODE + result = cudaSuccess; + cudaFuncAttributes empty_kernel_attrs; + + do { + // We do not `CubDebug` here because failure is not a hard error. + // We may be querying a device that we do not have code for but + // never use. 
+ if (result = cudaFuncGetAttributes(&empty_kernel_attrs, empty_kernel)) { + // Clear the global CUDA error state which may have been set by + // the last call. Otherwise, errors may "leak" to unrelated + // kernel launches. + cudaGetLastError(); + break; + } + } + while(0); - ptx_version = CUB_PTX_ARCH; + ptx_version = empty_kernel_attrs.ptxVersion * 10; + #endif + } else { + #if CUB_INCLUDE_DEVICE_CODE + // The `reinterpret_cast` is necessary to suppress a set-but-unused warnings. + // This is a meme now: https://twitter.com/blelbach/status/1222391615576100864 + (void)reinterpret_cast(empty_kernel); - return cudaSuccess; + ptx_version = CUB_PTX_ARCH; -#endif + result = cudaSuccess; + #endif + } + return result; } /** @@ -291,16 +291,25 @@ __host__ __forceinline__ cudaError_t PtxVersion(int &ptx_version, int device) */ CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t PtxVersion(int &ptx_version) { -#if __cplusplus >= 201103L && (CUB_PTX_ARCH == 0) // Host code and C++11. - - return PtxVersion(ptx_version, CurrentDevice()); - -#else // Device code or host code before C++11. - - // Avoid an unnecessary set/reset of the CUDA current device. - return CubDebug(PtxVersionUncached(ptx_version)); - -#endif + cudaError_t result; + if (CUB_IS_HOST_CODE) { + #if CUB_INCLUDE_HOST_CODE + #if __cplusplus >= 201103L + // Host code and C++11 + result = PtxVersion(ptx_version, CurrentDevice()); + #else + // Host code and C++98 + // Avoid an unnecessary set/reset of the CUDA current device. + result = CubDebug(PtxVersionUncached(ptx_version)); + #endif + #endif + } else { + #if CUB_INCLUDE_DEVICE_CODE + // Avoid an unnecessary set/reset of the CUDA current device. 
+ result = CubDebug(PtxVersionUncached(ptx_version)); + #endif + } + return result; } /** @@ -340,24 +349,32 @@ CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t SmVersionUncached(int &sm_versi */ CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t SmVersion(int &sm_version, int device = CurrentDevice()) { -#if __cplusplus >= 201103L && (CUB_PTX_ARCH == 0) // Host code and C++11. - - using FunctionPointer = cudaError_t(*)(int &, int); - FunctionPointer fun_ptr = SmVersionUncached; - - // C++11 guarantees that initialization of static locals is thread safe. - static const PerDeviceAttributeCache cache(fun_ptr); - - if (!CubDebug(cache.error[device])) - sm_version = cache.attribute[device]; - - return cache.error[device]; - -#else // Device code or host code before C++11. - - return SmVersionUncached(sm_version, device); - -#endif + cudaError_t result; + if (CUB_IS_HOST_CODE) { + #if CUB_INCLUDE_HOST_CODE + #if __cplusplus >= 201103L + // Host code and C++11 + using FunctionPointer = cudaError_t(*)(int &, int); + FunctionPointer fun_ptr = SmVersionUncached; + + // C++11 guarantees that initialization of static locals is thread safe. + static const PerDeviceAttributeCache cache(fun_ptr); + + if (!CubDebug(cache.error[device])) + sm_version = cache.attribute[device]; + + result = cache.error[device]; + #else + // Host code and C++98 + result = SmVersionUncached(sm_version, device); + #endif + #endif + } else { + #if CUB_INCLUDE_DEVICE_CODE + result = SmVersionUncached(sm_version, device); + #endif + } + return result; } /** @@ -365,23 +382,25 @@ CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t SmVersion(int &sm_version, int */ CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t SyncStream(cudaStream_t stream) { -#if (CUB_PTX_ARCH == 0) // Host code. - - return CubDebug(cudaStreamSynchronize(stream)); - -#elif defined(CUB_RUNTIME_ENABLED) // Device code with the CUDA runtime. 
- - (void)stream; - // Device can't yet sync on a specific stream - return CubDebug(cudaDeviceSynchronize()); - -#else // Device code without the CUDA runtime. - - (void)stream; - // CUDA API calls are not supported from this device. - return CubDebug(cudaErrorInvalidConfiguration); - -#endif + cudaError_t result; + if (CUB_IS_HOST_CODE) { + #if CUB_INCLUDE_HOST_CODE + result = CubDebug(cudaStreamSynchronize(stream)); + #endif + } else { + #if CUB_INCLUDE_DEVICE_CODE + #if defined(CUB_RUNTIME_ENABLED) // Device code with the CUDA runtime. + (void)stream; + // Device can't yet sync on a specific stream + result = CubDebug(cudaDeviceSynchronize()); + #else // Device code without the CUDA runtime. + (void)stream; + // CUDA API calls are not supported from this device. + result = CubDebug(cudaErrorInvalidConfiguration); + #endif + #endif + } + return result; } diff --git a/cub/util_ptx.cuh b/cub/util_ptx.cuh index 582ca0d8b8..3f20c11beb 100644 --- a/cub/util_ptx.cuh +++ b/cub/util_ptx.cuh @@ -90,12 +90,8 @@ __device__ __forceinline__ unsigned int SHR_ADD( unsigned int addend) { unsigned int ret; -#if CUB_PTX_ARCH >= 200 asm ("vshr.u32.u32.u32.clamp.add %0, %1, %2, %3;" : "=r"(ret) : "r"(x), "r"(shift), "r"(addend)); -#else - ret = (x >> shift) + addend; -#endif return ret; } @@ -109,12 +105,8 @@ __device__ __forceinline__ unsigned int SHL_ADD( unsigned int addend) { unsigned int ret; -#if CUB_PTX_ARCH >= 200 asm ("vshl.u32.u32.u32.clamp.add %0, %1, %2, %3;" : "=r"(ret) : "r"(x), "r"(shift), "r"(addend)); -#else - ret = (x << shift) + addend; -#endif return ret; } @@ -131,12 +123,7 @@ __device__ __forceinline__ unsigned int BFE( Int2Type /*byte_len*/) { unsigned int bits; -#if CUB_PTX_ARCH >= 200 asm ("bfe.u32 %0, %1, %2, %3;" : "=r"(bits) : "r"((unsigned int) source), "r"(bit_start), "r"(num_bits)); -#else - const unsigned int MASK = (1 << num_bits) - 1; - bits = (source >> bit_start) & MASK; -#endif return bits; } @@ -180,15 +167,8 @@ __device__ __forceinline__ void 
BFI( unsigned int bit_start, unsigned int num_bits) { -#if CUB_PTX_ARCH >= 200 asm ("bfi.b32 %0, %1, %2, %3, %4;" : "=r"(ret) : "r"(y), "r"(x), "r"(bit_start), "r"(num_bits)); -#else - x <<= bit_start; - unsigned int MASK_X = ((1 << num_bits) - 1) << bit_start; - unsigned int MASK_Y = ~MASK_X; - ret = (y & MASK_Y) | (x & MASK_X); -#endif } @@ -197,11 +177,7 @@ __device__ __forceinline__ void BFI( */ __device__ __forceinline__ unsigned int IADD3(unsigned int x, unsigned int y, unsigned int z) { -#if CUB_PTX_ARCH >= 200 asm ("vadd.u32.u32.u32.add %0, %1, %2, %3;" : "=r"(x) : "r"(x), "r"(y), "r"(z)); -#else - x = x + y + z; -#endif return x; }