diff --git a/cub/device/dispatch/dispatch_histogram.cuh b/cub/device/dispatch/dispatch_histogram.cuh
index d9740fb1cb..339b3d67b4 100644
--- a/cub/device/dispatch/dispatch_histogram.cuh
+++ b/cub/device/dispatch/dispatch_histogram.cuh
@@ -45,6 +45,8 @@
 #include "../../grid/grid_queue.cuh"
 #include "../../config.cuh"

+#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
+
 /// Optional outer namespace(s)
 CUB_NS_PREFIX

@@ -455,41 +457,46 @@ struct DipatchHistogram
         int             ptx_version,
         KernelConfig    &histogram_sweep_config)
     {
-    #if (CUB_PTX_ARCH > 0)
-
-        // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
-        return histogram_sweep_config.template Init();
-
-    #else
-
-        // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
-        if (ptx_version >= 500)
-        {
-            return histogram_sweep_config.template Init();
-        }
-        else if (ptx_version >= 350)
-        {
-            return histogram_sweep_config.template Init();
-        }
-        else if (ptx_version >= 300)
+        cudaError_t result = cudaErrorNotSupported;
+        if (CUB_IS_DEVICE_CODE)
         {
-            return histogram_sweep_config.template Init();
-        }
-        else if (ptx_version >= 200)
-        {
-            return histogram_sweep_config.template Init();
-        }
-        else if (ptx_version >= 110)
-        {
-            return histogram_sweep_config.template Init();
+            #if CUB_INCLUDE_DEVICE_CODE
+                // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
+                result = histogram_sweep_config.template Init();
+            #endif
         }
         else
         {
-            // No global atomic support
-            return cudaErrorNotSupported;
+            #if CUB_INCLUDE_HOST_CODE
+                // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
+                if (ptx_version >= 500)
+                {
+                    result = histogram_sweep_config.template Init();
+                }
+                else if (ptx_version >= 350)
+                {
+                    result = histogram_sweep_config.template Init();
+                }
+                else if (ptx_version >= 300)
+                {
+                    result = histogram_sweep_config.template Init();
+                }
+                else if (ptx_version >= 200)
+                {
+                    result = histogram_sweep_config.template Init();
+                }
+                else if (ptx_version >= 110)
+                {
+                    result = histogram_sweep_config.template Init();
+                }
+                else
+                {
+                    // No global atomic support
+                    result = cudaErrorNotSupported;
+                }
+            #endif
         }
-
-    #endif
+        return result;
     }

@@ -654,7 +661,10 @@ struct DipatchHistogram
                 histogram_init_grid_dims, histogram_init_block_threads, (long long) stream);

             // Invoke histogram_init_kernel
-            histogram_init_kernel<<<histogram_init_grid_dims, histogram_init_block_threads, 0, stream>>>(
+            thrust::cuda_cub::launcher::triple_chevron(
+                histogram_init_grid_dims, histogram_init_block_threads, 0,
+                stream
+            ).doit(histogram_init_kernel,
                 num_output_bins_wrapper,
                 d_output_histograms_wrapper,
                 tile_queue);
@@ -669,7 +679,9 @@ struct DipatchHistogram
                 histogram_sweep_config.block_threads, (long long) stream, histogram_sweep_config.pixels_per_thread, histogram_sweep_sm_occupancy);

             // Invoke histogram_sweep_kernel
-            histogram_sweep_kernel<<<sweep_grid_dims, histogram_sweep_config.block_threads, 0, stream>>>(
+            thrust::cuda_cub::launcher::triple_chevron(
+                sweep_grid_dims, histogram_sweep_config.block_threads, 0, stream
+            ).doit(histogram_sweep_kernel,
                 d_samples,
                 num_output_bins_wrapper,
                 num_privatized_bins_wrapper,
@@ -722,11 +734,7 @@ struct DipatchHistogram
         {
             // Get PTX version
             int ptx_version = 0;
-    #if (CUB_PTX_ARCH == 0)
             if (CubDebug(error = PtxVersion(ptx_version))) break;
-    #else
-            ptx_version = CUB_PTX_ARCH;
-    #endif

             // Get kernel dispatch configurations
             KernelConfig histogram_sweep_config;
@@ -830,11 +838,7 @@ struct DipatchHistogram
         {
             // Get PTX version
             int ptx_version = 0;
-    #if (CUB_PTX_ARCH == 0)
             if (CubDebug(error = PtxVersion(ptx_version))) break;
-    #else
-            ptx_version = CUB_PTX_ARCH;
-    #endif

             // Get kernel dispatch configurations
             KernelConfig histogram_sweep_config;
@@ -913,11 +917,7 @@ struct DipatchHistogram
         {
             // Get PTX version
             int ptx_version = 0;
-    #if (CUB_PTX_ARCH == 0)
             if (CubDebug(error = PtxVersion(ptx_version))) break;
-    #else
-            ptx_version = CUB_PTX_ARCH;
-    #endif

             // Get kernel dispatch configurations
             KernelConfig histogram_sweep_config;
@@ -1025,11 +1025,7 @@ struct DipatchHistogram
         {
             // Get PTX version
             int ptx_version = 0;
-    #if (CUB_PTX_ARCH == 0)
             if (CubDebug(error = PtxVersion(ptx_version))) break;
-    #else
-            ptx_version = CUB_PTX_ARCH;
-    #endif

             // Get kernel dispatch configurations
             KernelConfig histogram_sweep_config;
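The `InitConfigs` rewrite above establishes the pattern every dispatch file in this patch follows: the host/device split moves from the preprocessor into a run-time `if` on `CUB_IS_DEVICE_CODE`, while each branch body stays wrapped in `#if CUB_INCLUDE_DEVICE_CODE` / `#if CUB_INCLUDE_HOST_CODE` so a compilation pass that cannot run one side never has to emit it. A minimal sketch of that shape; `Config`, `InitDevicePolicy`, and `InitHostPolicy` are hypothetical stand-ins for the `KernelConfig::Init` calls, not part of the patch:

    template <typename Config>
    __host__ __device__ cudaError_t InitConfigsShape(int ptx_version, Config &config)
    {
        cudaError_t result = cudaErrorNotSupported; // sentinel if neither body is compiled in
        if (CUB_IS_DEVICE_CODE)
        {
            #if CUB_INCLUDE_DEVICE_CODE
                result = config.InitDevicePolicy();          // current PTX policy
            #endif
        }
        else
        {
            #if CUB_INCLUDE_HOST_CODE
                result = config.InitHostPolicy(ptx_version); // policy matching the device
            #endif
        }
        return result;
    }

Under nvcc the condition is a compile-time constant, so each pass still dead-strips the other branch; under NVC++ both bodies are compiled and the branch resolves via `__builtin_is_device_code()`.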
diff --git a/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/device/dispatch/dispatch_radix_sort.cuh
index 1837ddc198..2b0919fa1c 100644
--- a/cub/device/dispatch/dispatch_radix_sort.cuh
+++ b/cub/device/dispatch/dispatch_radix_sort.cuh
@@ -47,6 +47,8 @@
 #include "../../util_debug.cuh"
 #include "../../util_device.cuh"

+#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
+
 /// Optional outer namespace(s)
 CUB_NS_PREFIX

@@ -922,7 +924,9 @@ struct DispatchRadixSort :
                 ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD, 1, begin_bit, ActivePolicyT::SingleTilePolicy::RADIX_BITS);

             // Invoke upsweep_kernel with same grid size as downsweep_kernel
-            single_tile_kernel<<<1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream>>>(
+            thrust::cuda_cub::launcher::triple_chevron(
+                1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream
+            ).doit(single_tile_kernel,
                 d_keys.Current(),
                 d_keys.Alternate(),
                 d_values.Current(),
@@ -980,7 +984,10 @@ struct DispatchRadixSort :
                 pass_config.upsweep_config.items_per_thread, pass_config.upsweep_config.sm_occupancy, current_bit, pass_bits);

             // Invoke upsweep_kernel with same grid size as downsweep_kernel
-            pass_config.upsweep_kernel<<<pass_config.even_share.grid_size, pass_config.upsweep_config.block_threads, 0, stream>>>(
+            thrust::cuda_cub::launcher::triple_chevron(
+                pass_config.even_share.grid_size,
+                pass_config.upsweep_config.block_threads, 0, stream
+            ).doit(pass_config.upsweep_kernel,
                 d_keys_in,
                 d_spine,
                 num_items,
@@ -999,7 +1006,9 @@ struct DispatchRadixSort :
                 1, pass_config.scan_config.block_threads, (long long) stream, pass_config.scan_config.items_per_thread);

             // Invoke scan_kernel
-            pass_config.scan_kernel<<<1, pass_config.scan_config.block_threads, 0, stream>>>(
+            thrust::cuda_cub::launcher::triple_chevron(
+                1, pass_config.scan_config.block_threads, 0, stream
+            ).doit(pass_config.scan_kernel,
                 d_spine,
                 spine_length);

@@ -1015,7 +1024,10 @@ struct DispatchRadixSort :
                 pass_config.downsweep_config.items_per_thread, pass_config.downsweep_config.sm_occupancy);

             // Invoke downsweep_kernel
-            pass_config.downsweep_kernel<<<pass_config.even_share.grid_size, pass_config.downsweep_config.block_threads, 0, stream>>>(
+            thrust::cuda_cub::launcher::triple_chevron(
+                pass_config.even_share.grid_size,
+                pass_config.downsweep_config.block_threads, 0, stream
+            ).doit(pass_config.downsweep_kernel,
                 d_keys_in,
                 d_keys_out,
                 d_values_in,
@@ -1429,7 +1441,10 @@ struct DispatchSegmentedRadixSort :
                     pass_bits);
             }

-            pass_config.segmented_kernel<<<num_segments, pass_config.segmented_config.block_threads, 0, stream>>>(
+            thrust::cuda_cub::launcher::triple_chevron(
+                num_segments, pass_config.segmented_config.block_threads, 0,
+                stream
+            ).doit(pass_config.segmented_kernel,
                 d_keys_in, d_keys_out,
                 d_values_in, d_values_out,
                 d_begin_offsets, d_end_offsets, num_segments,
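Every kernel launch in the patch is rewritten the same mechanical way: the four launch parameters keep their order but move into the `triple_chevron` constructor, and the kernel itself becomes the first argument of `doit()`, followed by the original kernel arguments. A before/after sketch with a hypothetical `my_kernel` and placeholder arguments:

    // Before (triple-chevron syntax, which not every CUDA front end accepts):
    //     my_kernel<<<grid_dims, block_threads, smem_bytes, stream>>>(arg0, arg1);
    // After:
    thrust::cuda_cub::launcher::triple_chevron(grid_dims, block_threads, smem_bytes, stream)
        .doit(my_kernel, arg0, arg1);

Because `doit()` is an ordinary variadic function call, the launch compiles under any CUDA compiler that ships Thrust, which is what makes the NVC++ build possible.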
"../../util_debug.cuh" #include "../../util_device.cuh" +#include + /// Optional outer namespace(s) CUB_NS_PREFIX @@ -450,7 +452,9 @@ struct DispatchReduce : ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD); // Invoke single_reduce_sweep_kernel - single_tile_kernel<<<1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream>>>( + thrust::cuda_cub::launcher::triple_chevron( + 1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream + ).doit(single_tile_kernel, d_in, d_out, num_items, @@ -544,7 +548,10 @@ struct DispatchReduce : reduce_config.sm_occupancy); // Invoke DeviceReduceKernel - reduce_kernel<<>>( + thrust::cuda_cub::launcher::triple_chevron( + reduce_grid_size, ActivePolicyT::ReducePolicy::BLOCK_THREADS, + 0, stream + ).doit(reduce_kernel, d_in, d_block_reductions, num_items, @@ -564,7 +571,9 @@ struct DispatchReduce : ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD); // Invoke DeviceReduceSingleTileKernel - single_tile_kernel<<<1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream>>>( + thrust::cuda_cub::launcher::triple_chevron( + 1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream + ).doit(single_tile_kernel, d_block_reductions, d_out, reduce_grid_size, @@ -778,7 +787,10 @@ struct DispatchSegmentedReduce : segmented_reduce_config.sm_occupancy); // Invoke DeviceReduceKernel - segmented_reduce_kernel<<>>( + thrust::cuda_cub::launcher::triple_chevron( + num_segments, + ActivePolicyT::SegmentedReducePolicy::BLOCK_THREADS, 0, stream + ).doit(segmented_reduce_kernel, d_in, d_out, d_begin_offsets, diff --git a/cub/device/dispatch/dispatch_reduce_by_key.cuh b/cub/device/dispatch/dispatch_reduce_by_key.cuh index 9d826c928f..d8d8dcac41 100644 --- a/cub/device/dispatch/dispatch_reduce_by_key.cuh +++ b/cub/device/dispatch/dispatch_reduce_by_key.cuh @@ -44,6 +44,8 @@ #include "../../grid/grid_queue.cuh" #include "../../util_device.cuh" +#include + /// Optional outer namespace(s) CUB_NS_PREFIX @@ -284,37 +286,40 @@ struct DispatchReduceByKey int ptx_version, KernelConfig &reduce_by_key_config) { - #if (CUB_PTX_ARCH > 0) - (void)ptx_version; - - // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy - reduce_by_key_config.template Init(); - - #else - - // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version - if (ptx_version >= 350) - { - reduce_by_key_config.template Init(); - } - else if (ptx_version >= 300) + if (CUB_IS_DEVICE_CODE) { - reduce_by_key_config.template Init(); - } - else if (ptx_version >= 200) - { - reduce_by_key_config.template Init(); - } - else if (ptx_version >= 130) - { - reduce_by_key_config.template Init(); + #if CUB_INCLUDE_DEVICE_CODE + (void)ptx_version; + // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy + reduce_by_key_config.template Init(); + #endif } else { - reduce_by_key_config.template Init(); + #if CUB_INCLUDE_HOST_CODE + // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version + if (ptx_version >= 350) + { + reduce_by_key_config.template Init(); + } + else if (ptx_version >= 300) + { + reduce_by_key_config.template Init(); + } + else if (ptx_version >= 200) + { + reduce_by_key_config.template Init(); + } + else if (ptx_version >= 130) + { + reduce_by_key_config.template Init(); + } + else + { + reduce_by_key_config.template Init(); + } + #endif } - - #endif } @@ -428,7 +433,9 @@ 
diff --git a/cub/device/dispatch/dispatch_reduce_by_key.cuh b/cub/device/dispatch/dispatch_reduce_by_key.cuh
index 9d826c928f..d8d8dcac41 100644
--- a/cub/device/dispatch/dispatch_reduce_by_key.cuh
+++ b/cub/device/dispatch/dispatch_reduce_by_key.cuh
@@ -44,6 +44,8 @@
 #include "../../grid/grid_queue.cuh"
 #include "../../util_device.cuh"

+#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
+
 /// Optional outer namespace(s)
 CUB_NS_PREFIX

@@ -284,37 +286,40 @@ struct DispatchReduceByKey
         int             ptx_version,
         KernelConfig    &reduce_by_key_config)
     {
-    #if (CUB_PTX_ARCH > 0)
-        (void)ptx_version;
-
-        // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
-        reduce_by_key_config.template Init();
-
-    #else
-
-        // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
-        if (ptx_version >= 350)
-        {
-            reduce_by_key_config.template Init();
-        }
-        else if (ptx_version >= 300)
+        if (CUB_IS_DEVICE_CODE)
         {
-            reduce_by_key_config.template Init();
-        }
-        else if (ptx_version >= 200)
-        {
-            reduce_by_key_config.template Init();
-        }
-        else if (ptx_version >= 130)
-        {
-            reduce_by_key_config.template Init();
+            #if CUB_INCLUDE_DEVICE_CODE
+                (void)ptx_version;
+                // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
+                reduce_by_key_config.template Init();
+            #endif
         }
         else
         {
-            reduce_by_key_config.template Init();
+            #if CUB_INCLUDE_HOST_CODE
+                // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
+                if (ptx_version >= 350)
+                {
+                    reduce_by_key_config.template Init();
+                }
+                else if (ptx_version >= 300)
+                {
+                    reduce_by_key_config.template Init();
+                }
+                else if (ptx_version >= 200)
+                {
+                    reduce_by_key_config.template Init();
+                }
+                else if (ptx_version >= 130)
+                {
+                    reduce_by_key_config.template Init();
+                }
+                else
+                {
+                    reduce_by_key_config.template Init();
+                }
+            #endif
         }
-
-    #endif
     }

@@ -428,7 +433,9 @@ struct DispatchReduceByKey
             if (debug_synchronous) _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);

             // Invoke init_kernel to initialize tile descriptors
-            init_kernel<<<init_grid_size, INIT_KERNEL_THREADS, 0, stream>>>(
+            thrust::cuda_cub::launcher::triple_chevron(
+                init_grid_size, INIT_KERNEL_THREADS, 0, stream
+            ).doit(init_kernel,
                 tile_state,
                 num_tiles,
                 d_num_runs_out);
@@ -463,7 +470,10 @@ struct DispatchReduceByKey
                 start_tile, scan_grid_size, reduce_by_key_config.block_threads, (long long) stream, reduce_by_key_config.items_per_thread, reduce_by_key_sm_occupancy);

             // Invoke reduce_by_key_kernel
-            reduce_by_key_kernel<<<scan_grid_size, reduce_by_key_config.block_threads, 0, stream>>>(
+            thrust::cuda_cub::launcher::triple_chevron(
+                scan_grid_size, reduce_by_key_config.block_threads, 0,
+                stream
+            ).doit(reduce_by_key_kernel,
                 d_keys_in,
                 d_unique_out,
                 d_values_in,
@@ -513,11 +523,7 @@ struct DispatchReduceByKey
         {
             // Get PTX version
             int ptx_version = 0;
-    #if (CUB_PTX_ARCH == 0)
             if (CubDebug(error = PtxVersion(ptx_version))) break;
-    #else
-            ptx_version = CUB_PTX_ARCH;
-    #endif

             // Get kernel kernel dispatch configurations
             KernelConfig reduce_by_key_config;
diff --git a/cub/device/dispatch/dispatch_rle.cuh b/cub/device/dispatch/dispatch_rle.cuh
index fa1a86f7cf..b68f166dec 100644
--- a/cub/device/dispatch/dispatch_rle.cuh
+++ b/cub/device/dispatch/dispatch_rle.cuh
@@ -44,6 +44,8 @@
 #include "../../grid/grid_queue.cuh"
 #include "../../util_device.cuh"

+#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
+
 /// Optional outer namespace(s)
 CUB_NS_PREFIX

@@ -274,36 +276,38 @@ struct DeviceRleDispatch
         int             ptx_version,
         KernelConfig&   device_rle_config)
     {
-    #if (CUB_PTX_ARCH > 0)
-
-        // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
-        device_rle_config.template Init();
-
-    #else
-
-        // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
-        if (ptx_version >= 350)
-        {
-            device_rle_config.template Init();
-        }
-        else if (ptx_version >= 300)
-        {
-            device_rle_config.template Init();
-        }
-        else if (ptx_version >= 200)
-        {
-            device_rle_config.template Init();
-        }
-        else if (ptx_version >= 130)
-        {
-            device_rle_config.template Init();
+        if (CUB_IS_DEVICE_CODE) {
+            #if CUB_INCLUDE_DEVICE_CODE
+                // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
+                device_rle_config.template Init();
+            #endif
         }
         else
         {
-            device_rle_config.template Init();
+            #if CUB_INCLUDE_HOST_CODE
+                // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
+                if (ptx_version >= 350)
+                {
+                    device_rle_config.template Init();
+                }
+                else if (ptx_version >= 300)
+                {
+                    device_rle_config.template Init();
+                }
+                else if (ptx_version >= 200)
+                {
+                    device_rle_config.template Init();
+                }
+                else if (ptx_version >= 130)
+                {
+                    device_rle_config.template Init();
+                }
+                else
+                {
+                    device_rle_config.template Init();
+                }
+            #endif
         }
-
-    #endif
     }

@@ -415,7 +419,9 @@ struct DeviceRleDispatch
             if (debug_synchronous) _CubLog("Invoking device_scan_init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);

             // Invoke device_scan_init_kernel to initialize tile descriptors and queue descriptors
-            device_scan_init_kernel<<<init_grid_size, INIT_KERNEL_THREADS, 0, stream>>>(
+            thrust::cuda_cub::launcher::triple_chevron(
+                init_grid_size, INIT_KERNEL_THREADS, 0, stream
+            ).doit(device_scan_init_kernel,
                 tile_status,
                 num_tiles,
                 d_num_runs_out);
@@ -452,7 +458,9 @@ struct DeviceRleDispatch
                 scan_grid_size.x, scan_grid_size.y, scan_grid_size.z, device_rle_config.block_threads, (long long) stream, device_rle_config.items_per_thread, device_rle_kernel_sm_occupancy);

             // Invoke device_rle_sweep_kernel
-            device_rle_sweep_kernel<<<scan_grid_size, device_rle_config.block_threads, 0, stream>>>(
+            thrust::cuda_cub::launcher::triple_chevron(
+                scan_grid_size, device_rle_config.block_threads, 0, stream
+            ).doit(device_rle_sweep_kernel,
                 d_in,
                 d_offsets_out,
                 d_lengths_out,
@@ -498,11 +506,7 @@ struct DeviceRleDispatch
         {
             // Get PTX version
             int ptx_version = 0;
-    #if (CUB_PTX_ARCH == 0)
             if (CubDebug(error = PtxVersion(ptx_version))) break;
-    #else
-            ptx_version = CUB_PTX_ARCH;
-    #endif

             // Get kernel kernel dispatch configurations
             KernelConfig device_rle_config;
diff --git a/cub/device/dispatch/dispatch_scan.cuh b/cub/device/dispatch/dispatch_scan.cuh
index 667c631cd0..24b30f102c 100644
--- a/cub/device/dispatch/dispatch_scan.cuh
+++ b/cub/device/dispatch/dispatch_scan.cuh
@@ -44,6 +44,8 @@
 #include "../../util_debug.cuh"
 #include "../../util_device.cuh"

+#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
+
 /// Optional outer namespace(s)
 CUB_NS_PREFIX

@@ -367,7 +369,9 @@ struct DispatchScan:
             if (debug_synchronous) _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);

             // Invoke init_kernel to initialize tile descriptors
-            init_kernel<<<init_grid_size, INIT_KERNEL_THREADS, 0, stream>>>(
+            thrust::cuda_cub::launcher::triple_chevron(
+                init_grid_size, INIT_KERNEL_THREADS, 0, stream
+            ).doit(init_kernel,
                 tile_state,
                 num_tiles);

@@ -398,7 +402,9 @@ struct DispatchScan:
                 start_tile, scan_grid_size, Policy::BLOCK_THREADS, (long long) stream, Policy::ITEMS_PER_THREAD, scan_sm_occupancy);

             // Invoke scan_kernel
-            scan_kernel<<<scan_grid_size, Policy::BLOCK_THREADS, 0, stream>>>(
+            thrust::cuda_cub::launcher::triple_chevron(
+                scan_grid_size, Policy::BLOCK_THREADS, 0, stream
+            ).doit(scan_kernel,
                 d_in,
                 d_out,
                 tile_state,
diff --git a/cub/device/dispatch/dispatch_select_if.cuh b/cub/device/dispatch/dispatch_select_if.cuh
index ad80fb69b4..5fec4cff72 100644
--- a/cub/device/dispatch/dispatch_select_if.cuh
+++ b/cub/device/dispatch/dispatch_select_if.cuh
@@ -44,6 +44,8 @@
 #include "../../grid/grid_queue.cuh"
 #include "../../util_device.cuh"

+#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
+
 /// Optional outer namespace(s)
 CUB_NS_PREFIX

@@ -275,37 +277,39 @@ struct DispatchSelectIf
         int             ptx_version,
         KernelConfig    &select_if_config)
     {
-    #if (CUB_PTX_ARCH > 0)
-        (void)ptx_version;
-
-        // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
-        select_if_config.template Init();
-
-    #else
-
-        // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
-        if (ptx_version >= 350)
-        {
-            select_if_config.template Init();
-        }
-        else if (ptx_version >= 300)
-        {
-            select_if_config.template Init();
-        }
-        else if (ptx_version >= 200)
-        {
-            select_if_config.template Init();
-        }
-        else if (ptx_version >= 130)
-        {
-            select_if_config.template Init();
+        if (CUB_IS_DEVICE_CODE) {
+            #if CUB_INCLUDE_DEVICE_CODE
+                (void)ptx_version;
+                // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
+                select_if_config.template Init();
+            #endif
         }
         else
         {
-            select_if_config.template Init();
+            #if CUB_INCLUDE_HOST_CODE
+                // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
+                if (ptx_version >= 350)
+                {
+                    select_if_config.template Init();
+                }
+                else if (ptx_version >= 300)
+                {
+                    select_if_config.template Init();
+                }
+                else if (ptx_version >= 200)
+                {
+                    select_if_config.template Init();
+                }
+                else if (ptx_version >= 130)
+                {
+                    select_if_config.template Init();
+                }
+                else
+                {
+                    select_if_config.template Init();
+                }
+            #endif
         }
-
-    #endif
     }

@@ -417,7 +421,9 @@ struct DispatchSelectIf
             if (debug_synchronous) _CubLog("Invoking scan_init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);

             // Invoke scan_init_kernel to initialize tile descriptors
-            scan_init_kernel<<<init_grid_size, INIT_KERNEL_THREADS, 0, stream>>>(
+            thrust::cuda_cub::launcher::triple_chevron(
+                init_grid_size, INIT_KERNEL_THREADS, 0, stream
+            ).doit(scan_init_kernel,
                 tile_status,
                 num_tiles,
                 d_num_selected_out);
@@ -454,7 +460,9 @@ struct DispatchSelectIf
                 scan_grid_size.x, scan_grid_size.y, scan_grid_size.z, select_if_config.block_threads, (long long) stream, select_if_config.items_per_thread, range_select_sm_occupancy);

             // Invoke select_if_kernel
-            select_if_kernel<<<scan_grid_size, select_if_config.block_threads, 0, stream>>>(
+            thrust::cuda_cub::launcher::triple_chevron(
+                scan_grid_size, select_if_config.block_threads, 0, stream
+            ).doit(select_if_kernel,
                 d_in,
                 d_flags,
                 d_selected_out,
@@ -501,11 +509,7 @@ struct DispatchSelectIf
         {
             // Get PTX version
             int ptx_version = 0;
-    #if (CUB_PTX_ARCH == 0)
             if (CubDebug(error = PtxVersion(ptx_version))) break;
-    #else
-            ptx_version = CUB_PTX_ARCH;
-    #endif

             // Get kernel kernel dispatch configurations
             KernelConfig select_if_config;
diff --git a/cub/device/dispatch/dispatch_spmv_orig.cuh b/cub/device/dispatch/dispatch_spmv_orig.cuh
index 71d67c5f2e..4b7c5e0cf3 100644
--- a/cub/device/dispatch/dispatch_spmv_orig.cuh
+++ b/cub/device/dispatch/dispatch_spmv_orig.cuh
@@ -47,6 +47,8 @@
 #include "../../grid/grid_queue.cuh"
 #include "../../config.cuh"

+#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
+
 /// Optional outer namespace(s)
 CUB_NS_PREFIX

@@ -287,7 +289,7 @@ struct DispatchSpmv
     };

     /// SM20
-    struct Policy200
+    struct Policy200
     {
         typedef AgentSpmvPolicy<
                 96,
@@ -314,7 +316,7 @@ struct DispatchSpmv


     /// SM30
-    struct Policy300
+    struct Policy300
     {
         typedef AgentSpmvPolicy<
                 96,
@@ -376,7 +378,7 @@ struct DispatchSpmv
                 LOAD_LDG,
                 LOAD_LDG,
                 LOAD_LDG,
-                false,
+                false,
                 BLOCK_SCAN_WARP_SCANS>
             SpmvPolicyT;

@@ -488,53 +490,55 @@ struct DispatchSpmv
         KernelConfig    &spmv_config,
         KernelConfig    &segment_fixup_config)
     {
-    #if (CUB_PTX_ARCH > 0)
-
-        // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
-        spmv_config.template Init();
-        segment_fixup_config.template Init();
-
-    #else
-
-        // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
-        if (ptx_version >= 600)
+        if (CUB_IS_DEVICE_CODE)
         {
-            spmv_config.template Init();
-            segment_fixup_config.template Init();
-        }
-        else if (ptx_version >= 500)
-        {
-            spmv_config.template Init();
-            segment_fixup_config.template Init();
-        }
-        else if (ptx_version >= 370)
-        {
-            spmv_config.template Init();
-            segment_fixup_config.template Init();
-        }
-        else if (ptx_version >= 350)
-        {
-            spmv_config.template Init();
-            segment_fixup_config.template Init();
-        }
-        else if (ptx_version >= 300)
-        {
-            spmv_config.template Init();
-            segment_fixup_config.template Init();
-
-        }
-        else if (ptx_version >= 200)
-        {
-            spmv_config.template Init();
-            segment_fixup_config.template Init();
+            #if CUB_INCLUDE_DEVICE_CODE
+                // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
+                spmv_config.template Init();
+                segment_fixup_config.template Init();
+            #endif
         }
         else
         {
-            spmv_config.template Init();
-            segment_fixup_config.template Init();
+            #if CUB_INCLUDE_HOST_CODE
+                // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
+                if (ptx_version >= 600)
+                {
+                    spmv_config.template Init();
+                    segment_fixup_config.template Init();
+                }
+                else if (ptx_version >= 500)
+                {
+                    spmv_config.template Init();
+                    segment_fixup_config.template Init();
+                }
+                else if (ptx_version >= 370)
+                {
+                    spmv_config.template Init();
+                    segment_fixup_config.template Init();
+                }
+                else if (ptx_version >= 350)
+                {
+                    spmv_config.template Init();
+                    segment_fixup_config.template Init();
+                }
+                else if (ptx_version >= 300)
+                {
+                    spmv_config.template Init();
+                    segment_fixup_config.template Init();
+                }
+                else if (ptx_version >= 200)
+                {
+                    spmv_config.template Init();
+                    segment_fixup_config.template Init();
+                }
+                else
+                {
+                    spmv_config.template Init();
+                    segment_fixup_config.template Init();
+                }
+            #endif
         }
-
-    #endif
     }

@@ -614,7 +618,10 @@ struct DispatchSpmv
                 degen_col_kernel_grid_size, degen_col_kernel_block_size, (long long) stream);

             // Invoke spmv_search_kernel
-            spmv_1col_kernel<<<degen_col_kernel_grid_size, degen_col_kernel_block_size, 0, stream>>>(
+            thrust::cuda_cub::launcher::triple_chevron(
+                degen_col_kernel_grid_size, degen_col_kernel_block_size, 0,
+                stream
+            ).doit(spmv_1col_kernel,
                 spmv_params);

             // Check for failure to launch
@@ -700,10 +707,13 @@ struct DispatchSpmv
             int search_block_size   = INIT_KERNEL_THREADS;
             int search_grid_size    = (num_merge_tiles + 1 + search_block_size - 1) / search_block_size;

-#if (CUB_PTX_ARCH == 0)
-            // Init textures
-            if (CubDebug(error = spmv_params.t_vector_x.BindTexture(spmv_params.d_vector_x))) break;
-#endif
+            #if CUB_INCLUDE_HOST_CODE
+                if (CUB_IS_HOST_CODE)
+                {
+                    // Init textures
+                    if (CubDebug(error = spmv_params.t_vector_x.BindTexture(spmv_params.d_vector_x))) break;
+                }
+            #endif

             if (search_grid_size < sm_count)
 //            if (num_merge_tiles < spmv_sm_occupancy * sm_count)
@@ -720,7 +730,9 @@ struct DispatchSpmv
                     search_grid_size, search_block_size, (long long) stream);

                 // Invoke spmv_search_kernel
-                spmv_search_kernel<<<search_grid_size, search_block_size, 0, stream>>>(
+                thrust::cuda_cub::launcher::triple_chevron(
+                    search_grid_size, search_block_size, 0, stream
+                ).doit(spmv_search_kernel,
                     num_merge_tiles,
                     d_tile_coordinates,
                     spmv_params);
@@ -737,7 +749,9 @@ struct DispatchSpmv
                 spmv_grid_size.x, spmv_grid_size.y, spmv_grid_size.z, spmv_config.block_threads, (long long) stream, spmv_config.items_per_thread, spmv_sm_occupancy);

             // Invoke spmv_kernel
-            spmv_kernel<<<spmv_grid_size, spmv_config.block_threads, 0, stream>>>(
+            thrust::cuda_cub::launcher::triple_chevron(
+                spmv_grid_size, spmv_config.block_threads, 0, stream
+            ).doit(spmv_kernel,
                 spmv_params,
                 d_tile_coordinates,
                 d_tile_carry_pairs,
@@ -759,7 +773,10 @@ struct DispatchSpmv
                 segment_fixup_grid_size.x, segment_fixup_grid_size.y, segment_fixup_grid_size.z, segment_fixup_config.block_threads, (long long) stream, segment_fixup_config.items_per_thread, segment_fixup_sm_occupancy);

             // Invoke segment_fixup_kernel
-            segment_fixup_kernel<<<segment_fixup_grid_size, segment_fixup_config.block_threads, 0, stream>>>(
+            thrust::cuda_cub::launcher::triple_chevron(
+                segment_fixup_grid_size, segment_fixup_config.block_threads,
+                0, stream
+            ).doit(segment_fixup_kernel,
                 d_tile_carry_pairs,
                 spmv_params.d_vector_y,
                 num_merge_tiles,
@@ -773,10 +790,13 @@ struct DispatchSpmv
                 if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
             }

-#if (CUB_PTX_ARCH == 0)
-            // Free textures
-            if (CubDebug(error = spmv_params.t_vector_x.UnbindTexture())) break;
-#endif
+            #if CUB_INCLUDE_HOST_CODE
+                if (CUB_IS_HOST_CODE)
+                {
+                    // Free textures
+                    if (CubDebug(error = spmv_params.t_vector_x.UnbindTexture())) break;
+                }
+            #endif
         }
         while (0);

@@ -802,11 +822,7 @@ struct DispatchSpmv
         {
             // Get PTX version
             int ptx_version = 0;
-    #if (CUB_PTX_ARCH == 0)
             if (CubDebug(error = PtxVersion(ptx_version))) break;
-    #else
-            ptx_version = CUB_PTX_ARCH;
-    #endif

             // Get kernel kernel dispatch configurations
             KernelConfig spmv_config, segment_fixup_config;
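Note that the two texture bind/unbind sites above invert the usual nesting: `#if CUB_INCLUDE_HOST_CODE` is outermost because `BindTexture`/`UnbindTexture` are host-only calls that must be preprocessed away from a device-only pass entirely, and the `CUB_IS_HOST_CODE` test inside keeps the single-pass NVC++ build, where both sides are always included, from reaching them during device execution. The shape, with a hypothetical `host_only_call` standing in for the texture members:

    #if CUB_INCLUDE_HOST_CODE       // strip from device-only compilation passes
        if (CUB_IS_HOST_CODE)       // and skip at run time under single-pass compilers
        {
            error = host_only_call();
        }
    #endif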
diff --git a/cub/grid/grid_queue.cuh b/cub/grid/grid_queue.cuh
index 3d09e6e494..6b5f676b03 100644
--- a/cub/grid/grid_queue.cuh
+++ b/cub/grid/grid_queue.cuh
@@ -1,7 +1,7 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
  * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- *
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *     * Redistributions of source code must retain the above copyright
@@ -12,7 +12,7 @@
  *     * Neither the name of the NVIDIA CORPORATION nor the
  *       names of its contributors may be used to endorse or promote products
  *       derived from this software without specific prior written permission.
- *
+ *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
@@ -123,43 +123,61 @@ public:
         OffsetT fill_size,
         cudaStream_t stream = 0)
     {
-#if (CUB_PTX_ARCH > 0)
-        (void)stream;
-        d_counters[FILL] = fill_size;
-        d_counters[DRAIN] = 0;
-        return cudaSuccess;
-#else
-        OffsetT counters[2];
-        counters[FILL] = fill_size;
-        counters[DRAIN] = 0;
-        return CubDebug(cudaMemcpyAsync(d_counters, counters, sizeof(OffsetT) * 2, cudaMemcpyHostToDevice, stream));
-#endif
+        cudaError_t result = cudaErrorUnknown;
+        if (CUB_IS_DEVICE_CODE) {
+            #if CUB_INCLUDE_DEVICE_CODE
+                (void)stream;
+                d_counters[FILL] = fill_size;
+                d_counters[DRAIN] = 0;
+                result = cudaSuccess;
+            #endif
+        } else {
+            #if CUB_INCLUDE_HOST_CODE
+                OffsetT counters[2];
+                counters[FILL] = fill_size;
+                counters[DRAIN] = 0;
+                result = CubDebug(cudaMemcpyAsync(d_counters, counters, sizeof(OffsetT) * 2, cudaMemcpyHostToDevice, stream));
+            #endif
+        }
+        return result;
     }


     /// This operation resets the drain so that it may advance to meet the existing fill-size.  To be called by the host or by a kernel prior to that which will be draining.
     __host__ __device__ __forceinline__ cudaError_t ResetDrain(cudaStream_t stream = 0)
     {
-#if (CUB_PTX_ARCH > 0)
-        (void)stream;
-        d_counters[DRAIN] = 0;
-        return cudaSuccess;
-#else
-        return CubDebug(cudaMemsetAsync(d_counters + DRAIN, 0, sizeof(OffsetT), stream));
-#endif
+        cudaError_t result = cudaErrorUnknown;
+        if (CUB_IS_DEVICE_CODE) {
+            #if CUB_INCLUDE_DEVICE_CODE
+                (void)stream;
+                d_counters[DRAIN] = 0;
+                result = cudaSuccess;
+            #endif
+        } else {
+            #if CUB_INCLUDE_HOST_CODE
+                result = CubDebug(cudaMemsetAsync(d_counters + DRAIN, 0, sizeof(OffsetT), stream));
+            #endif
+        }
+        return result;
     }


     /// This operation resets the fill counter.  To be called by the host or by a kernel prior to that which will be filling.
     __host__ __device__ __forceinline__ cudaError_t ResetFill(cudaStream_t stream = 0)
     {
-#if (CUB_PTX_ARCH > 0)
-        (void)stream;
-        d_counters[FILL] = 0;
-        return cudaSuccess;
-#else
-        return CubDebug(cudaMemsetAsync(d_counters + FILL, 0, sizeof(OffsetT), stream));
-#endif
+        cudaError_t result = cudaErrorUnknown;
+        if (CUB_IS_DEVICE_CODE) {
+            #if CUB_INCLUDE_DEVICE_CODE
+                (void)stream;
+                d_counters[FILL] = 0;
+                result = cudaSuccess;
+            #endif
+        } else {
+            #if CUB_INCLUDE_HOST_CODE
+                result = CubDebug(cudaMemsetAsync(d_counters + FILL, 0, sizeof(OffsetT), stream));
+            #endif
+        }
+        return result;
     }


@@ -168,13 +186,19 @@ public:
         OffsetT &fill_size,
         cudaStream_t stream = 0)
     {
-#if (CUB_PTX_ARCH > 0)
-        (void)stream;
-        fill_size = d_counters[FILL];
-        return cudaSuccess;
-#else
-        return CubDebug(cudaMemcpyAsync(&fill_size, d_counters + FILL, sizeof(OffsetT), cudaMemcpyDeviceToHost, stream));
-#endif
+        cudaError_t result = cudaErrorUnknown;
+        if (CUB_IS_DEVICE_CODE) {
+            #if CUB_INCLUDE_DEVICE_CODE
+                (void)stream;
+                fill_size = d_counters[FILL];
+                result = cudaSuccess;
+            #endif
+        } else {
+            #if CUB_INCLUDE_HOST_CODE
+                result = CubDebug(cudaMemcpyAsync(&fill_size, d_counters + FILL, sizeof(OffsetT), cudaMemcpyDeviceToHost, stream));
+            #endif
+        }
+        return result;
     }
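The `cudaErrorUnknown` initializer in these `GridQueue` methods is deliberate: if a build configuration compiled neither guarded body, the call would report an error instead of returning an uninitialized value. Host-side usage is unchanged; a sketch, assuming the caller owns a device allocation of `GridQueue<int>::AllocationSize()` bytes:

    // Sketch: host-side priming of a GridQueue after the rewrite.
    cudaError_t PrimeQueue(void *d_storage, int num_items, cudaStream_t stream)
    {
        cub::GridQueue<int> queue(d_storage);
        // Host path: one cudaMemcpyAsync covering both counters. In device
        // code the same call writes d_counters[FILL] and d_counters[DRAIN]
        // directly.
        return queue.FillAndResetDrain(num_items, stream);
    }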
diff --git a/cub/iterator/tex_obj_input_iterator.cuh b/cub/iterator/tex_obj_input_iterator.cuh
index 60c7aa0adc..aae069edfa 100644
--- a/cub/iterator/tex_obj_input_iterator.cuh
+++ b/cub/iterator/tex_obj_input_iterator.cuh
@@ -204,24 +204,32 @@ public:
     /// Indirection
     __host__ __device__ __forceinline__ reference operator*() const
     {
-#if (CUB_PTX_ARCH == 0)
-        // Simply dereference the pointer on the host
-        return ptr[tex_offset];
-#else
-        // Move array of uninitialized words, then alias and assign to return value
-        TextureWord words[TEXTURE_MULTIPLE];
-
-        #pragma unroll
-        for (int i = 0; i < TEXTURE_MULTIPLE; ++i)
-        {
-            words[i] = tex1Dfetch<TextureWord>(
-                tex_obj,
-                (tex_offset * TEXTURE_MULTIPLE) + i);
+        if (CUB_IS_HOST_CODE) {
+            #if CUB_INCLUDE_HOST_CODE
+                // Simply dereference the pointer on the host
+                return ptr[tex_offset];
+            #endif
+        } else {
+            #if CUB_INCLUDE_DEVICE_CODE
+                // Move array of uninitialized words, then alias and assign to return value
+                TextureWord words[TEXTURE_MULTIPLE];
+
+                #pragma unroll
+                for (int i = 0; i < TEXTURE_MULTIPLE; ++i)
+                {
+                    words[i] = tex1Dfetch<TextureWord>(
+                        tex_obj,
+                        (tex_offset * TEXTURE_MULTIPLE) + i);
+                }
+
+                // Load from words
+                return *reinterpret_cast<T*>(words);
+            #else
+                // This is dead code which will never be executed.  It is here
+                // only to avoid warnings about missing return statements.
+                return ptr[tex_offset];
+            #endif
         }
-
-        // Load from words
-        return *reinterpret_cast<T*>(words);
-#endif
     }

     /// Addition
diff --git a/cub/iterator/tex_ref_input_iterator.cuh b/cub/iterator/tex_ref_input_iterator.cuh
index 947aff4e23..e63650aae3 100644
--- a/cub/iterator/tex_ref_input_iterator.cuh
+++ b/cub/iterator/tex_ref_input_iterator.cuh
@@ -279,13 +279,19 @@ public:
     /// Indirection
     __host__ __device__ __forceinline__ reference operator*() const
     {
-#if (CUB_PTX_ARCH == 0)
-        // Simply dereference the pointer on the host
-        return ptr[tex_offset];
-#else
-        // Use the texture reference
-        return TexId::Fetch(tex_offset);
-#endif
+        if (CUB_IS_HOST_CODE) {
+            // Simply dereference the pointer on the host
+            return ptr[tex_offset];
+        } else {
+            #if CUB_INCLUDE_DEVICE_CODE
+                // Use the texture reference
+                return TexId::Fetch(tex_offset);
+            #else
+                // This is dead code that will never be executed.  It is here
+                // only to avoid warnings about missing returns.
+                return ptr[tex_offset];
+            #endif
+        }
     }

     /// Addition
diff --git a/cub/util_arch.cuh b/cub/util_arch.cuh
index fb2df4fde2..00ce46d265 100644
--- a/cub/util_arch.cuh
+++ b/cub/util_arch.cuh
@@ -44,19 +44,44 @@ namespace cub {

 #ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document

-#if (__CUDACC_VER_MAJOR__ >= 9) && !defined(CUB_USE_COOPERATIVE_GROUPS)
+#if ((__CUDACC_VER_MAJOR__ >= 9) || defined(__NVCOMPILER_CUDA__)) && \
+    !defined(CUB_USE_COOPERATIVE_GROUPS)
     #define CUB_USE_COOPERATIVE_GROUPS
 #endif

-/// CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host pass).
+/// In device code, CUB_PTX_ARCH expands to the PTX version for which we are
+/// compiling. In host code, CUB_PTX_ARCH's value is implementation defined.
 #ifndef CUB_PTX_ARCH
-    #ifndef __CUDA_ARCH__
+    #if defined(__NVCOMPILER_CUDA__)
+        // __NVCOMPILER_CUDA_ARCH__ is the target PTX version, and is defined
+        // when compiling both host code and device code. Currently, only one
+        // PTX version can be targeted.
+        #define CUB_PTX_ARCH __NVCOMPILER_CUDA_ARCH__
+    #elif !defined(__CUDA_ARCH__)
         #define CUB_PTX_ARCH 0
     #else
         #define CUB_PTX_ARCH __CUDA_ARCH__
     #endif
 #endif

+#ifndef CUB_IS_DEVICE_CODE
+    #if defined(__NVCOMPILER_CUDA__)
+        #define CUB_IS_DEVICE_CODE __builtin_is_device_code()
+        #define CUB_IS_HOST_CODE (!__builtin_is_device_code())
+        #define CUB_INCLUDE_DEVICE_CODE 1
+        #define CUB_INCLUDE_HOST_CODE 1
+    #elif CUB_PTX_ARCH > 0
+        #define CUB_IS_DEVICE_CODE 1
+        #define CUB_IS_HOST_CODE 0
+        #define CUB_INCLUDE_DEVICE_CODE 1
+        #define CUB_INCLUDE_HOST_CODE 0
+    #else
+        #define CUB_IS_DEVICE_CODE 0
+        #define CUB_IS_HOST_CODE 1
+        #define CUB_INCLUDE_DEVICE_CODE 0
+        #define CUB_INCLUDE_HOST_CODE 1
+    #endif
+#endif

 /// Maximum number of devices supported.
 #ifndef CUB_MAX_DEVICES
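Taken together, the four new macros encode the following matrix, read directly off the definitions above. `CUB_IS_*_CODE` answers "which side is executing?" and may be a run-time expression, so it can only appear inside an `if`; `CUB_INCLUDE_*_CODE` answers "may this side's code exist in this compilation?" and is always a preprocessor constant, so it can guard declarations and host-only or device-only constructs:

    //                           nvcc host pass | nvcc device pass | NVC++ single pass
    // CUB_IS_DEVICE_CODE              0        |        1         | __builtin_is_device_code()
    // CUB_IS_HOST_CODE                1        |        0         | (!__builtin_is_device_code())
    // CUB_INCLUDE_DEVICE_CODE         0        |        1         | 1
    // CUB_INCLUDE_HOST_CODE           1        |        0         | 1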
diff --git a/cub/util_debug.cuh b/cub/util_debug.cuh
index acbc1df10e..354eab6cb6 100644
--- a/cub/util_debug.cuh
+++ b/cub/util_debug.cuh
@@ -82,12 +82,16 @@ __host__ __device__ __forceinline__ cudaError_t Debug(
 #ifdef CUB_STDERR
     if (error)
     {
-    #if (CUB_PTX_ARCH == 0)
-        fprintf(stderr, "CUDA error %d [%s, %d]: %s\n", error, filename, line, cudaGetErrorString(error));
-        fflush(stderr);
-    #elif (CUB_PTX_ARCH >= 200)
-        printf("CUDA error %d [block (%d,%d,%d) thread (%d,%d,%d), %s, %d]\n", error, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, filename, line);
-    #endif
+        if (CUB_IS_HOST_CODE) {
+            #if CUB_INCLUDE_HOST_CODE
+                fprintf(stderr, "CUDA error %d [%s, %d]: %s\n", error, filename, line, cudaGetErrorString(error));
+                fflush(stderr);
+            #endif
+        } else {
+            #if CUB_INCLUDE_DEVICE_CODE
+                printf("CUDA error %d [block (%d,%d,%d) thread (%d,%d,%d), %s, %d]\n", error, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, filename, line);
+            #endif
+        }
     }
 #endif
     return error;
@@ -114,7 +118,13 @@ __host__ __device__ __forceinline__ cudaError_t Debug(
  * \brief Log macro for printf statements.
  */
 #if !defined(_CubLog)
-    #if !(defined(__clang__) && defined(__CUDA__))
+    #if defined(__NVCOMPILER_CUDA__)
+        #define _CubLog(format, ...) (__builtin_is_device_code() \
+            ? printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, \
+                     blockIdx.z, blockIdx.y, blockIdx.x, \
+                     threadIdx.z, threadIdx.y, threadIdx.x, __VA_ARGS__) \
+            : printf(format, __VA_ARGS__));
+    #elif !(defined(__clang__) && defined(__CUDA__))
         #if (CUB_PTX_ARCH == 0)
             #define _CubLog(format, ...) printf(format,__VA_ARGS__);
         #elif (CUB_PTX_ARCH >= 200)
diff --git a/cub/util_device.cuh b/cub/util_device.cuh
index bfef8b7a76..0cbdca00b2 100644
--- a/cub/util_device.cuh
+++ b/cub/util_device.cuh
@@ -348,32 +348,29 @@ CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t PtxVersionUncached(int& ptx_ver
     (void)(empty_kernel);

-#if (CUB_PTX_ARCH == 0) // Host code.
-
-    cudaError_t error = cudaSuccess;
-    cudaFuncAttributes empty_kernel_attrs;
-
-    do {
-        if (CubDebug(error = cudaFuncGetAttributes(&empty_kernel_attrs, empty_kernel)))
-            break;
+    cudaError_t result = cudaSuccess;
+    if (CUB_IS_HOST_CODE) {
+        #if CUB_INCLUDE_HOST_CODE
+            cudaFuncAttributes empty_kernel_attrs;
+
+            do {
+                if (CubDebug(result = cudaFuncGetAttributes(&empty_kernel_attrs, empty_kernel)))
+                    break;
+            }
+            while(0);
+
+            ptx_version = empty_kernel_attrs.ptxVersion * 10;
+        #endif
+    } else {
+        #if CUB_INCLUDE_DEVICE_CODE
+            // The `reinterpret_cast` is necessary to suppress a set-but-unused warnings.
+            // This is a meme now: https://twitter.com/blelbach/status/1222391615576100864
+            (void)reinterpret_cast<EmptyKernelPtr>(empty_kernel);
+
+            ptx_version = CUB_PTX_ARCH;
+        #endif
     }
-    while (0);
-
-    ptx_version = empty_kernel_attrs.ptxVersion * 10;
-
-    return error;
-
-#else // Device code.
-
-    // The `reinterpret_cast` is necessary to suppress a set-but-unused warnings.
-    // This is a meme now: https://twitter.com/blelbach/status/1222391615576100864
-    (void)reinterpret_cast<EmptyKernelPtr>(empty_kernel);
-
-    ptx_version = CUB_PTX_ARCH;
-
-    return cudaSuccess;
-
-#endif
+    return result;
 }

 /**
@@ -436,26 +433,35 @@ __host__ __forceinline__ cudaError_t PtxVersion(int& ptx_version, int device)
  */
 CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t PtxVersion(int& ptx_version)
 {
-#if __cplusplus >= 201103L && (CUB_PTX_ARCH == 0) // Host code and C++11.
-
-    auto const device = CurrentDevice();
-
-    auto const payload = GetPerDeviceAttributeCache()(
-        // If this call fails, then we get the error code back in the payload,
-        // which we check with `CubDebug` below.
-        [=] (int& pv) { return PtxVersionUncached(pv, device); },
-        device);
-
-    if (!CubDebug(payload.error))
-        ptx_version = payload.attribute;
-
-    return payload.error;
-
-#else // Device code or host code before C++11.
-
-    return PtxVersionUncached(ptx_version);
-
-#endif
+    cudaError_t result = cudaErrorUnknown;
+    if (CUB_IS_HOST_CODE) {
+        #if CUB_INCLUDE_HOST_CODE
+            #if __cplusplus >= 201103L
+                // Host code and C++11.
+                auto const device = CurrentDevice();
+
+                auto const payload = GetPerDeviceAttributeCache()(
+                    // If this call fails, then we get the error code back in the payload,
+                    // which we check with `CubDebug` below.
+                    [=] (int& pv) { return PtxVersionUncached(pv, device); },
+                    device);
+
+                if (!CubDebug(payload.error))
+                    ptx_version = payload.attribute;
+
+                result = payload.error;
+            #else
+                // Host code and C++98.
+                result = PtxVersionUncached(ptx_version);
+            #endif
+        #endif
+    } else {
+        #if CUB_INCLUDE_DEVICE_CODE
+            // Device code.
+            result = PtxVersionUncached(ptx_version);
+        #endif
+    }
+    return result;
 }
 /**
@@ -497,24 +503,32 @@ CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t SmVersionUncached(int& sm_versi
  */
 CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t SmVersion(int& sm_version, int device = CurrentDevice())
 {
-#if __cplusplus >= 201103L && (CUB_PTX_ARCH == 0) // Host code and C++11.
-
-    auto const payload = GetPerDeviceAttributeCache()(
-        // If this call fails, then we get the error code back in the payload,
-        // which we check with `CubDebug` below.
-        [=] (int& pv) { return SmVersionUncached(pv, device); },
-        device);
-
-    if (!CubDebug(payload.error))
-        sm_version = payload.attribute;
-
-    return payload.error;
-
-#else // Device code or host code before C++11.
-
-    return SmVersionUncached(sm_version, device);
-
-#endif
+    cudaError_t result = cudaErrorUnknown;
+    if (CUB_IS_HOST_CODE) {
+        #if CUB_INCLUDE_HOST_CODE
+            #if __cplusplus >= 201103L
+                // Host code and C++11
+                auto const payload = GetPerDeviceAttributeCache()(
+                    // If this call fails, then we get the error code back in the payload,
+                    // which we check with `CubDebug` below.
+                    [=] (int& pv) { return SmVersionUncached(pv, device); },
+                    device);
+
+                if (!CubDebug(payload.error))
+                    sm_version = payload.attribute;
+
+                result = payload.error;
+            #else
+                // Host code and C++98
+                result = SmVersionUncached(sm_version, device);
+            #endif
+        #endif
+    } else {
+        #if CUB_INCLUDE_DEVICE_CODE
+            result = SmVersionUncached(sm_version, device);
+        #endif
+    }
+    return result;
 }
 /**
@@ -522,23 +536,25 @@ CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t SmVersion(int& sm_version, int
  */
 CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t SyncStream(cudaStream_t stream)
 {
-#if (CUB_PTX_ARCH == 0) // Host code.
-
-    return CubDebug(cudaStreamSynchronize(stream));
-
-#elif defined(CUB_RUNTIME_ENABLED) // Device code with the CUDA runtime.
-
-    (void)stream;
-    // Device can't yet sync on a specific stream
-    return CubDebug(cudaDeviceSynchronize());
-
-#else // Device code without the CUDA runtime.
-
-    (void)stream;
-    // CUDA API calls are not supported from this device.
-    return CubDebug(cudaErrorInvalidConfiguration);
-
-#endif
+    cudaError_t result = cudaErrorUnknown;
+    if (CUB_IS_HOST_CODE) {
+        #if CUB_INCLUDE_HOST_CODE
+            result = CubDebug(cudaStreamSynchronize(stream));
+        #endif
+    } else {
+        #if CUB_INCLUDE_DEVICE_CODE
+            #if defined(CUB_RUNTIME_ENABLED) // Device code with the CUDA runtime.
+                (void)stream;
+                // Device can't yet sync on a specific stream
+                result = CubDebug(cudaDeviceSynchronize());
+            #else // Device code without the CUDA runtime.
+                (void)stream;
+                // CUDA API calls are not supported from this device.
+                result = CubDebug(cudaErrorInvalidConfiguration);
+            #endif
+        #endif
+    }
+    return result;
 }
diff --git a/cub/util_ptx.cuh b/cub/util_ptx.cuh
index 582ca0d8b8..3f20c11beb 100644
--- a/cub/util_ptx.cuh
+++ b/cub/util_ptx.cuh
@@ -90,12 +90,8 @@ __device__ __forceinline__ unsigned int SHR_ADD(
     unsigned int addend)
 {
     unsigned int ret;
-#if CUB_PTX_ARCH >= 200
     asm ("vshr.u32.u32.u32.clamp.add %0, %1, %2, %3;" : "=r"(ret) : "r"(x), "r"(shift), "r"(addend));
-#else
-    ret = (x >> shift) + addend;
-#endif
     return ret;
 }

@@ -109,12 +105,8 @@ __device__ __forceinline__ unsigned int SHL_ADD(
     unsigned int addend)
 {
     unsigned int ret;
-#if CUB_PTX_ARCH >= 200
     asm ("vshl.u32.u32.u32.clamp.add %0, %1, %2, %3;" : "=r"(ret) : "r"(x), "r"(shift), "r"(addend));
-#else
-    ret = (x << shift) + addend;
-#endif
     return ret;
 }

@@ -131,12 +123,7 @@ __device__ __forceinline__ unsigned int BFE(
     Int2Type<BYTE_LEN>  /*byte_len*/)
 {
     unsigned int bits;
-#if CUB_PTX_ARCH >= 200
     asm ("bfe.u32 %0, %1, %2, %3;" : "=r"(bits) : "r"((unsigned int) source), "r"(bit_start), "r"(num_bits));
-#else
-    const unsigned int MASK = (1 << num_bits) - 1;
-    bits = (source >> bit_start) & MASK;
-#endif
     return bits;
 }

@@ -180,15 +167,8 @@ __device__ __forceinline__ void BFI(
     unsigned int bit_start,
     unsigned int num_bits)
 {
-#if CUB_PTX_ARCH >= 200
     asm ("bfi.b32 %0, %1, %2, %3, %4;" : "=r"(ret) : "r"(y), "r"(x), "r"(bit_start), "r"(num_bits));
-#else
-    x <<= bit_start;
-    unsigned int MASK_X = ((1 << num_bits) - 1) << bit_start;
-    unsigned int MASK_Y = ~MASK_X;
-    ret = (y & MASK_Y) | (x & MASK_X);
-#endif
 }


@@ -197,11 +177,7 @@ __device__ __forceinline__ void BFI(
  */
 __device__ __forceinline__ unsigned int IADD3(unsigned int x, unsigned int y, unsigned int z)
 {
-#if CUB_PTX_ARCH >= 200
     asm ("vadd.u32.u32.u32.add %0, %1, %2, %3;" : "=r"(x) : "r"(x), "r"(y), "r"(z));
-#else
-    x = x + y + z;
-#endif
     return x;
 }
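The `util_ptx.cuh` hunks simply drop the pre-SM20 fallback implementations, which current CUDA toolchains can no longer target, leaving the inline-PTX paths unconditional. For callers, the observable behavior of the reworked utilities is unchanged; only the selection mechanism moved from the preprocessor into guarded run-time branches. A small host-side sketch exercising the reworked entry points:

    // Sketch: host-side use; version values are cached per device when the
    // host code is compiled as C++11 or newer, per the rewrite above.
    cudaError_t ReportAndSync(cudaStream_t stream)
    {
        int ptx_version = 0; // empty_kernel's ptxVersion * 10, e.g. 650
        int sm_version  = 0; // for the current device by default
        cudaError_t error = cub::PtxVersion(ptx_version);
        if (error != cudaSuccess) return error;
        if ((error = cub::SmVersion(sm_version)) != cudaSuccess) return error;
        return cub::SyncStream(stream); // cudaStreamSynchronize on the host path
    }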