From 1caaac18483cfae068ad8e4aed1780a27c6be0b9 Mon Sep 17 00:00:00 2001 From: David Olsen Date: Mon, 6 Apr 2020 15:32:17 -0700 Subject: [PATCH] Changes necessary to support Feta. Reviewed-by: Bryce Adelstein Lelbach aka wash --- cub/device/device_reduce.cuh | 4 +- cub/device/dispatch/dispatch_histogram.cuh | 116 ++++++------ cub/device/dispatch/dispatch_radix_sort.cuh | 25 ++- cub/device/dispatch/dispatch_reduce.cuh | 20 +- .../dispatch/dispatch_reduce_by_key.cuh | 76 ++++---- cub/device/dispatch/dispatch_rle.cuh | 78 ++++---- cub/device/dispatch/dispatch_scan.cuh | 81 +++++---- cub/device/dispatch/dispatch_select_if.cuh | 76 ++++---- cub/device/dispatch/dispatch_spmv_orig.cuh | 138 ++++++++------ cub/iterator/tex_obj_input_iterator.cuh | 42 +++-- cub/iterator/tex_ref_input_iterator.cuh | 20 +- cub/util_arch.cuh | 27 ++- cub/util_debug.cuh | 24 ++- cub/util_device.cuh | 171 ++++++++++-------- cub/util_ptx.cuh | 24 --- 15 files changed, 528 insertions(+), 394 deletions(-) diff --git a/cub/device/device_reduce.cuh b/cub/device/device_reduce.cuh index 13c7a72d1a..a3b6ca1497 100644 --- a/cub/device/device_reduce.cuh +++ b/cub/device/device_reduce.cuh @@ -150,14 +150,14 @@ struct DeviceReduce size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items OutputIteratorT d_out, ///< [out] Pointer to the output aggregate - int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + typename thrust::iterator_traits::difference_type num_items, ///< [in] Total number of input items (i.e., length of \p d_in) ReductionOpT reduction_op, ///< [in] Binary reduction functor T init, ///< [in] Initial value of the reduction cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. 
bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. { // Signed integer type for global offsets - typedef int OffsetT; + typedef typename thrust::iterator_traits::difference_type OffsetT; return DispatchReduce::Dispatch( d_temp_storage, diff --git a/cub/device/dispatch/dispatch_histogram.cuh b/cub/device/dispatch/dispatch_histogram.cuh index 059c881228..76461935a9 100644 --- a/cub/device/dispatch/dispatch_histogram.cuh +++ b/cub/device/dispatch/dispatch_histogram.cuh @@ -45,6 +45,8 @@ #include "../../grid/grid_queue.cuh" #include "../../util_namespace.cuh" +#include + /// Optional outer namespace(s) CUB_NS_PREFIX @@ -455,41 +457,46 @@ struct DipatchHistogram int ptx_version, KernelConfig &histogram_sweep_config) { - #if (CUB_PTX_ARCH > 0) - - // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy - return histogram_sweep_config.template Init(); - - #else - - // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version - if (ptx_version >= 500) - { - return histogram_sweep_config.template Init(); - } - else if (ptx_version >= 350) - { - return histogram_sweep_config.template Init(); - } - else if (ptx_version >= 300) - { - return histogram_sweep_config.template Init(); - } - else if (ptx_version >= 200) + cudaError_t result = cudaErrorNotSupported; + if (CUB_IS_DEVICE_CODE) { - return histogram_sweep_config.template Init(); - } - else if (ptx_version >= 110) - { - return histogram_sweep_config.template Init(); + #if CUB_INCLUDE_DEVICE_CODE + // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy + result = histogram_sweep_config.template Init(); + #endif } else { - // No global atomic support - return cudaErrorNotSupported; + #if 
CUB_INCLUDE_HOST_CODE + // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version + if (ptx_version >= 500) + { + result = histogram_sweep_config.template Init(); + } + else if (ptx_version >= 350) + { + result = histogram_sweep_config.template Init(); + } + else if (ptx_version >= 300) + { + result = histogram_sweep_config.template Init(); + } + else if (ptx_version >= 200) + { + result = histogram_sweep_config.template Init(); + } + else if (ptx_version >= 110) + { + result = histogram_sweep_config.template Init(); + } + else + { + // No global atomic support + result = cudaErrorNotSupported; + } + #endif } - - #endif + return result; } @@ -654,7 +661,10 @@ struct DipatchHistogram histogram_init_grid_dims, histogram_init_block_threads, (long long) stream); // Invoke histogram_init_kernel - histogram_init_kernel<<>>( + thrust::cuda_cub::launcher::triple_chevron( + histogram_init_grid_dims, histogram_init_block_threads, 0, + stream + ).doit(histogram_init_kernel, num_output_bins_wrapper, d_output_histograms_wrapper, tile_queue); @@ -669,7 +679,9 @@ struct DipatchHistogram histogram_sweep_config.block_threads, (long long) stream, histogram_sweep_config.pixels_per_thread, histogram_sweep_sm_occupancy); // Invoke histogram_sweep_kernel - histogram_sweep_kernel<<>>( + thrust::cuda_cub::launcher::triple_chevron( + sweep_grid_dims, histogram_sweep_config.block_threads, 0, stream + ).doit(histogram_sweep_kernel, d_samples, num_output_bins_wrapper, num_privatized_bins_wrapper, @@ -722,11 +734,11 @@ struct DipatchHistogram { // Get PTX version int ptx_version = 0; - #if (CUB_PTX_ARCH == 0) - if (CubDebug(error = PtxVersion(ptx_version))) break; - #else - ptx_version = CUB_PTX_ARCH; - #endif + if (CUB_IS_HOST_CODE) { + if (CubDebug(error = PtxVersion(ptx_version))) break; + } else { + ptx_version = CUB_PTX_ARCH; + } // Get kernel dispatch configurations KernelConfig histogram_sweep_config; @@ 
-830,11 +842,11 @@ struct DipatchHistogram { // Get PTX version int ptx_version = 0; - #if (CUB_PTX_ARCH == 0) - if (CubDebug(error = PtxVersion(ptx_version))) break; - #else - ptx_version = CUB_PTX_ARCH; - #endif + if (CUB_IS_HOST_CODE) { + if (CubDebug(error = PtxVersion(ptx_version))) break; + } else { + ptx_version = CUB_PTX_ARCH; + } // Get kernel dispatch configurations KernelConfig histogram_sweep_config; @@ -913,11 +925,11 @@ struct DipatchHistogram { // Get PTX version int ptx_version = 0; - #if (CUB_PTX_ARCH == 0) - if (CubDebug(error = PtxVersion(ptx_version))) break; - #else - ptx_version = CUB_PTX_ARCH; - #endif + if (CUB_IS_HOST_CODE) { + if (CubDebug(error = PtxVersion(ptx_version))) break; + } else { + ptx_version = CUB_PTX_ARCH; + } // Get kernel dispatch configurations KernelConfig histogram_sweep_config; @@ -1025,11 +1037,11 @@ struct DipatchHistogram { // Get PTX version int ptx_version = 0; - #if (CUB_PTX_ARCH == 0) - if (CubDebug(error = PtxVersion(ptx_version))) break; - #else - ptx_version = CUB_PTX_ARCH; - #endif + if (CUB_IS_HOST_CODE) { + if (CubDebug(error = PtxVersion(ptx_version))) break; + } else { + ptx_version = CUB_PTX_ARCH; + } // Get kernel dispatch configurations KernelConfig histogram_sweep_config; diff --git a/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/device/dispatch/dispatch_radix_sort.cuh index 60ba9fe8ff..cb6ceebbd9 100644 --- a/cub/device/dispatch/dispatch_radix_sort.cuh +++ b/cub/device/dispatch/dispatch_radix_sort.cuh @@ -47,6 +47,8 @@ #include "../../util_device.cuh" #include "../../util_namespace.cuh" +#include + /// Optional outer namespace(s) CUB_NS_PREFIX @@ -921,7 +923,9 @@ struct DispatchRadixSort : ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD, 1, begin_bit, ActivePolicyT::SingleTilePolicy::RADIX_BITS); // Invoke upsweep_kernel with same grid size as downsweep_kernel - single_tile_kernel<<<1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream>>>( + thrust::cuda_cub::launcher::triple_chevron( + 
1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream + ).doit(single_tile_kernel, d_keys.Current(), d_keys.Alternate(), d_values.Current(), @@ -979,7 +983,10 @@ struct DispatchRadixSort : pass_config.upsweep_config.items_per_thread, pass_config.upsweep_config.sm_occupancy, current_bit, pass_bits); // Invoke upsweep_kernel with same grid size as downsweep_kernel - pass_config.upsweep_kernel<<>>( + thrust::cuda_cub::launcher::triple_chevron( + pass_config.even_share.grid_size, + pass_config.upsweep_config.block_threads, 0, stream + ).doit(pass_config.upsweep_kernel, d_keys_in, d_spine, num_items, @@ -998,7 +1005,9 @@ struct DispatchRadixSort : 1, pass_config.scan_config.block_threads, (long long) stream, pass_config.scan_config.items_per_thread); // Invoke scan_kernel - pass_config.scan_kernel<<<1, pass_config.scan_config.block_threads, 0, stream>>>( + thrust::cuda_cub::launcher::triple_chevron( + 1, pass_config.scan_config.block_threads, 0, stream + ).doit(pass_config.scan_kernel, d_spine, spine_length); @@ -1014,7 +1023,10 @@ struct DispatchRadixSort : pass_config.downsweep_config.items_per_thread, pass_config.downsweep_config.sm_occupancy); // Invoke downsweep_kernel - pass_config.downsweep_kernel<<>>( + thrust::cuda_cub::launcher::triple_chevron( + pass_config.even_share.grid_size, + pass_config.downsweep_config.block_threads, 0, stream + ).doit(pass_config.downsweep_kernel, d_keys_in, d_keys_out, d_values_in, @@ -1418,7 +1430,10 @@ struct DispatchSegmentedRadixSort : num_segments, pass_config.segmented_config.block_threads, (long long) stream, pass_config.segmented_config.items_per_thread, pass_config.segmented_config.sm_occupancy, current_bit, pass_bits); - pass_config.segmented_kernel<<>>( + thrust::cuda_cub::launcher::triple_chevron( + num_segments, pass_config.segmented_config.block_threads, 0, + stream + ).doit(pass_config.segmented_kernel, d_keys_in, d_keys_out, d_values_in, d_values_out, d_begin_offsets, d_end_offsets, num_segments, diff --git 
a/cub/device/dispatch/dispatch_reduce.cuh b/cub/device/dispatch/dispatch_reduce.cuh index 14557c6087..1cb8dbdd1c 100644 --- a/cub/device/dispatch/dispatch_reduce.cuh +++ b/cub/device/dispatch/dispatch_reduce.cuh @@ -46,6 +46,8 @@ #include "../../util_device.cuh" #include "../../util_namespace.cuh" +#include + /// Optional outer namespace(s) CUB_NS_PREFIX @@ -449,7 +451,9 @@ struct DispatchReduce : ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD); // Invoke single_reduce_sweep_kernel - single_tile_kernel<<<1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream>>>( + thrust::cuda_cub::launcher::triple_chevron( + 1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream + ).doit(single_tile_kernel, d_in, d_out, num_items, @@ -543,7 +547,10 @@ struct DispatchReduce : reduce_config.sm_occupancy); // Invoke DeviceReduceKernel - reduce_kernel<<>>( + thrust::cuda_cub::launcher::triple_chevron( + reduce_grid_size, ActivePolicyT::ReducePolicy::BLOCK_THREADS, + 0, stream + ).doit(reduce_kernel, d_in, d_block_reductions, num_items, @@ -563,7 +570,9 @@ struct DispatchReduce : ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD); // Invoke DeviceReduceSingleTileKernel - single_tile_kernel<<<1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream>>>( + thrust::cuda_cub::launcher::triple_chevron( + 1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream + ).doit(single_tile_kernel, d_block_reductions, d_out, reduce_grid_size, @@ -776,7 +785,10 @@ struct DispatchSegmentedReduce : segmented_reduce_config.sm_occupancy); // Invoke DeviceReduceKernel - segmented_reduce_kernel<<>>( + thrust::cuda_cub::launcher::triple_chevron( + num_segments, ActivePolicyT::SegmentedReducePolicy::BLOCK_THREADS, + 0, stream + ).doit(segmented_reduce_kernel, d_in, d_out, d_begin_offsets, diff --git a/cub/device/dispatch/dispatch_reduce_by_key.cuh b/cub/device/dispatch/dispatch_reduce_by_key.cuh index a494925c7c..6d0b425e9f 100644 --- a/cub/device/dispatch/dispatch_reduce_by_key.cuh +++ 
b/cub/device/dispatch/dispatch_reduce_by_key.cuh @@ -44,6 +44,8 @@ #include "../../util_device.cuh" #include "../../util_namespace.cuh" +#include + /// Optional outer namespace(s) CUB_NS_PREFIX @@ -284,37 +286,40 @@ struct DispatchReduceByKey int ptx_version, KernelConfig &reduce_by_key_config) { - #if (CUB_PTX_ARCH > 0) - (void)ptx_version; - - // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy - reduce_by_key_config.template Init(); - - #else - - // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version - if (ptx_version >= 350) - { - reduce_by_key_config.template Init(); - } - else if (ptx_version >= 300) + if (CUB_IS_DEVICE_CODE) { - reduce_by_key_config.template Init(); - } - else if (ptx_version >= 200) - { - reduce_by_key_config.template Init(); - } - else if (ptx_version >= 130) - { - reduce_by_key_config.template Init(); + #if CUB_INCLUDE_DEVICE_CODE + (void)ptx_version; + // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy + reduce_by_key_config.template Init(); + #endif } else { - reduce_by_key_config.template Init(); + #if CUB_INCLUDE_HOST_CODE + // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version + if (ptx_version >= 350) + { + reduce_by_key_config.template Init(); + } + else if (ptx_version >= 300) + { + reduce_by_key_config.template Init(); + } + else if (ptx_version >= 200) + { + reduce_by_key_config.template Init(); + } + else if (ptx_version >= 130) + { + reduce_by_key_config.template Init(); + } + else + { + reduce_by_key_config.template Init(); + } + #endif } - - #endif } @@ -428,7 +433,9 @@ struct DispatchReduceByKey if (debug_synchronous) _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); // Invoke init_kernel to initialize 
tile descriptors - init_kernel<<>>( + thrust::cuda_cub::launcher::triple_chevron( + init_grid_size, INIT_KERNEL_THREADS, 0, stream + ).doit(init_kernel, tile_state, num_tiles, d_num_runs_out); @@ -463,7 +470,10 @@ struct DispatchReduceByKey start_tile, scan_grid_size, reduce_by_key_config.block_threads, (long long) stream, reduce_by_key_config.items_per_thread, reduce_by_key_sm_occupancy); // Invoke reduce_by_key_kernel - reduce_by_key_kernel<<>>( + thrust::cuda_cub::launcher::triple_chevron( + scan_grid_size, reduce_by_key_config.block_threads, 0, + stream + ).doit(reduce_by_key_kernel, d_keys_in, d_unique_out, d_values_in, @@ -513,11 +523,11 @@ struct DispatchReduceByKey { // Get PTX version int ptx_version = 0; - #if (CUB_PTX_ARCH == 0) - if (CubDebug(error = PtxVersion(ptx_version))) break; - #else - ptx_version = CUB_PTX_ARCH; - #endif + if (CUB_IS_HOST_CODE) { + if (CubDebug(error = PtxVersion(ptx_version))) break; + } else { + ptx_version = CUB_PTX_ARCH; + } // Get kernel kernel dispatch configurations KernelConfig reduce_by_key_config; diff --git a/cub/device/dispatch/dispatch_rle.cuh b/cub/device/dispatch/dispatch_rle.cuh index 0a094f6c6a..16cddef6ce 100644 --- a/cub/device/dispatch/dispatch_rle.cuh +++ b/cub/device/dispatch/dispatch_rle.cuh @@ -44,6 +44,8 @@ #include "../../util_device.cuh" #include "../../util_namespace.cuh" +#include + /// Optional outer namespace(s) CUB_NS_PREFIX @@ -274,36 +276,36 @@ struct DeviceRleDispatch int ptx_version, KernelConfig& device_rle_config) { - #if (CUB_PTX_ARCH > 0) - - // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy - device_rle_config.template Init(); - - #else - - // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version - if (ptx_version >= 350) - { - device_rle_config.template Init(); - } - else if (ptx_version >= 300) - { - device_rle_config.template Init(); + if
(CUB_IS_DEVICE_CODE) { + #if CUB_INCLUDE_DEVICE_CODE + // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy + device_rle_config.template Init(); + #endif + } else { + #if CUB_INCLUDE_HOST_CODE + // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version + if (ptx_version >= 350) + { + device_rle_config.template Init(); + } + else if (ptx_version >= 300) + { + device_rle_config.template Init(); + } + else if (ptx_version >= 200) + { + device_rle_config.template Init(); + } + else if (ptx_version >= 130) + { + device_rle_config.template Init(); + } + else + { + device_rle_config.template Init(); + } + #endif } - else if (ptx_version >= 200) - { - device_rle_config.template Init(); - } - else if (ptx_version >= 130) - { - device_rle_config.template Init(); - } - else - { - device_rle_config.template Init(); - } - - #endif } @@ -415,7 +417,9 @@ struct DeviceRleDispatch if (debug_synchronous) _CubLog("Invoking device_scan_init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); // Invoke device_scan_init_kernel to initialize tile descriptors and queue descriptors - device_scan_init_kernel<<>>( + thrust::cuda_cub::launcher::triple_chevron( + init_grid_size, INIT_KERNEL_THREADS, 0, stream + ).doit(device_scan_init_kernel, tile_status, num_tiles, d_num_runs_out); @@ -452,7 +456,9 @@ struct DeviceRleDispatch scan_grid_size.x, scan_grid_size.y, scan_grid_size.z, device_rle_config.block_threads, (long long) stream, device_rle_config.items_per_thread, device_rle_kernel_sm_occupancy); // Invoke device_rle_sweep_kernel - device_rle_sweep_kernel<<>>( + thrust::cuda_cub::launcher::triple_chevron( + scan_grid_size, device_rle_config.block_threads, 0, stream + ).doit(device_rle_sweep_kernel, d_in, d_offsets_out, d_lengths_out, @@ -498,11 +504,11 @@ struct DeviceRleDispatch { // Get PTX version int ptx_version = 0; - #if 
(CUB_PTX_ARCH == 0) - if (CubDebug(error = PtxVersion(ptx_version))) break; - #else - ptx_version = CUB_PTX_ARCH; - #endif + if (CUB_IS_HOST_CODE) { + if (CubDebug(error = PtxVersion(ptx_version))) break; + } else { + ptx_version = CUB_PTX_ARCH; + } // Get kernel kernel dispatch configurations KernelConfig device_rle_config; diff --git a/cub/device/dispatch/dispatch_scan.cuh b/cub/device/dispatch/dispatch_scan.cuh index dc5bbaadcb..d86d98257e 100644 --- a/cub/device/dispatch/dispatch_scan.cuh +++ b/cub/device/dispatch/dispatch_scan.cuh @@ -45,6 +45,8 @@ #include "../../util_device.cuh" #include "../../util_namespace.cuh" +#include + /// Optional outer namespace(s) CUB_NS_PREFIX @@ -311,45 +313,48 @@ struct DispatchScan int ptx_version, KernelConfig &scan_kernel_config) { - #if (CUB_PTX_ARCH > 0) - (void)ptx_version; - - // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy - scan_kernel_config.template Init(); - - #else - - // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version - if (ptx_version >= 600) - { - scan_kernel_config.template Init(); - } - else if (ptx_version >= 520) - { - scan_kernel_config.template Init(); - } - else if (ptx_version >= 350) + if (CUB_IS_DEVICE_CODE) { - scan_kernel_config.template Init(); - } - else if (ptx_version >= 300) - { - scan_kernel_config.template Init(); - } - else if (ptx_version >= 200) - { - scan_kernel_config.template Init(); - } - else if (ptx_version >= 130) - { - scan_kernel_config.template Init(); + #if CUB_INCLUDE_DEVICE_CODE + (void)ptx_version; + // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy + scan_kernel_config.template Init(); + #endif } else { - scan_kernel_config.template Init(); + #if CUB_INCLUDE_HOST_CODE + // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's 
PTX version + if (ptx_version >= 600) + { + scan_kernel_config.template Init(); + } + else if (ptx_version >= 520) + { + scan_kernel_config.template Init(); + } + else if (ptx_version >= 350) + { + scan_kernel_config.template Init(); + } + else if (ptx_version >= 300) + { + scan_kernel_config.template Init(); + } + else if (ptx_version >= 200) + { + scan_kernel_config.template Init(); + } + else if (ptx_version >= 130) + { + scan_kernel_config.template Init(); + } + else + { + scan_kernel_config.template Init(); + } + #endif } - - #endif } @@ -460,7 +465,9 @@ struct DispatchScan if (debug_synchronous) _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); // Invoke init_kernel to initialize tile descriptors - init_kernel<<>>( + thrust::cuda_cub::launcher::triple_chevron( + init_grid_size, INIT_KERNEL_THREADS, 0, stream + ).doit(init_kernel, tile_state, num_tiles); @@ -490,7 +497,9 @@ struct DispatchScan start_tile, scan_grid_size, scan_kernel_config.block_threads, (long long) stream, scan_kernel_config.items_per_thread, scan_sm_occupancy); // Invoke scan_kernel - scan_kernel<<>>( + thrust::cuda_cub::launcher::triple_chevron( + scan_grid_size, scan_kernel_config.block_threads, 0, stream + ).doit(scan_kernel, d_in, d_out, tile_state, diff --git a/cub/device/dispatch/dispatch_select_if.cuh b/cub/device/dispatch/dispatch_select_if.cuh index 0fd622a35e..f42eed804d 100644 --- a/cub/device/dispatch/dispatch_select_if.cuh +++ b/cub/device/dispatch/dispatch_select_if.cuh @@ -44,6 +44,8 @@ #include "../../util_device.cuh" #include "../../util_namespace.cuh" +#include + /// Optional outer namespace(s) CUB_NS_PREFIX @@ -275,37 +277,39 @@ struct DispatchSelectIf int ptx_version, KernelConfig &select_if_config) { - #if (CUB_PTX_ARCH > 0) - (void)ptx_version; - - // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy - select_if_config.template Init(); - - #else - - // We're 
on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version - if (ptx_version >= 350) - { - select_if_config.template Init(); - } - else if (ptx_version >= 300) - { - select_if_config.template Init(); - } - else if (ptx_version >= 200) - { - select_if_config.template Init(); - } - else if (ptx_version >= 130) - { - select_if_config.template Init(); + if (CUB_IS_DEVICE_CODE) { + #if CUB_INCLUDE_DEVICE_CODE + (void)ptx_version; + // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy + select_if_config.template Init(); + #endif } else { - select_if_config.template Init(); + #if CUB_INCLUDE_HOST_CODE + // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version + if (ptx_version >= 350) + { + select_if_config.template Init(); + } + else if (ptx_version >= 300) + { + select_if_config.template Init(); + } + else if (ptx_version >= 200) + { + select_if_config.template Init(); + } + else if (ptx_version >= 130) + { + select_if_config.template Init(); + } + else + { + select_if_config.template Init(); + } + #endif } - - #endif } @@ -417,7 +421,9 @@ struct DispatchSelectIf if (debug_synchronous) _CubLog("Invoking scan_init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); // Invoke scan_init_kernel to initialize tile descriptors - scan_init_kernel<<>>( + thrust::cuda_cub::launcher::triple_chevron( + init_grid_size, INIT_KERNEL_THREADS, 0, stream + ).doit(scan_init_kernel, tile_status, num_tiles, d_num_selected_out); @@ -454,7 +460,9 @@ struct DispatchSelectIf scan_grid_size.x, scan_grid_size.y, scan_grid_size.z, select_if_config.block_threads, (long long) stream, select_if_config.items_per_thread, range_select_sm_occupancy); // Invoke select_if_kernel - select_if_kernel<<>>( + thrust::cuda_cub::launcher::triple_chevron( + scan_grid_size, 
select_if_config.block_threads, 0, stream + ).doit(select_if_kernel, d_in, d_flags, d_selected_out, @@ -501,11 +509,11 @@ struct DispatchSelectIf { // Get PTX version int ptx_version = 0; - #if (CUB_PTX_ARCH == 0) - if (CubDebug(error = PtxVersion(ptx_version))) break; - #else - ptx_version = CUB_PTX_ARCH; - #endif + if (CUB_IS_HOST_CODE) { + if (CubDebug(error = PtxVersion(ptx_version))) break; + } else { + ptx_version = CUB_PTX_ARCH; + } // Get kernel kernel dispatch configurations KernelConfig select_if_config; diff --git a/cub/device/dispatch/dispatch_spmv_orig.cuh b/cub/device/dispatch/dispatch_spmv_orig.cuh index b43190980d..85acde4c18 100644 --- a/cub/device/dispatch/dispatch_spmv_orig.cuh +++ b/cub/device/dispatch/dispatch_spmv_orig.cuh @@ -47,6 +47,8 @@ #include "../../grid/grid_queue.cuh" #include "../../util_namespace.cuh" +#include + /// Optional outer namespace(s) CUB_NS_PREFIX @@ -488,53 +490,55 @@ struct DispatchSpmv KernelConfig &spmv_config, KernelConfig &segment_fixup_config) { - #if (CUB_PTX_ARCH > 0) - - // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy - spmv_config.template Init(); - segment_fixup_config.template Init(); - - #else - - // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version - if (ptx_version >= 600) - { - spmv_config.template Init(); - segment_fixup_config.template Init(); - } - else if (ptx_version >= 500) - { - spmv_config.template Init(); - segment_fixup_config.template Init(); - } - else if (ptx_version >= 370) - { - spmv_config.template Init(); - segment_fixup_config.template Init(); - } - else if (ptx_version >= 350) + if (CUB_IS_DEVICE_CODE) { - spmv_config.template Init(); - segment_fixup_config.template Init(); - } - else if (ptx_version >= 300) - { - spmv_config.template Init(); - segment_fixup_config.template Init(); - - } - else if (ptx_version >= 200) - { - spmv_config.template Init(); 
- segment_fixup_config.template Init(); + #if CUB_INCLUDE_DEVICE_CODE + // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy + spmv_config.template Init(); + segment_fixup_config.template Init(); + #endif } else { - spmv_config.template Init(); - segment_fixup_config.template Init(); + #if CUB_INCLUDE_HOST_CODE + // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version + if (ptx_version >= 600) + { + spmv_config.template Init(); + segment_fixup_config.template Init(); + } + else if (ptx_version >= 500) + { + spmv_config.template Init(); + segment_fixup_config.template Init(); + } + else if (ptx_version >= 370) + { + spmv_config.template Init(); + segment_fixup_config.template Init(); + } + else if (ptx_version >= 350) + { + spmv_config.template Init(); + segment_fixup_config.template Init(); + } + else if (ptx_version >= 300) + { + spmv_config.template Init(); + segment_fixup_config.template Init(); + } + else if (ptx_version >= 200) + { + spmv_config.template Init(); + segment_fixup_config.template Init(); + } + else + { + spmv_config.template Init(); + segment_fixup_config.template Init(); + } + #endif } - - #endif } @@ -614,7 +618,10 @@ struct DispatchSpmv degen_col_kernel_grid_size, degen_col_kernel_block_size, (long long) stream); // Invoke spmv_search_kernel - spmv_1col_kernel<<>>( + thrust::cuda_cub::launcher::triple_chevron( + degen_col_kernel_grid_size, degen_col_kernel_block_size, 0, + stream + ).doit(spmv_1col_kernel, spmv_params); // Check for failure to launch @@ -700,10 +707,13 @@ struct DispatchSpmv int search_block_size = INIT_KERNEL_THREADS; int search_grid_size = (num_merge_tiles + 1 + search_block_size - 1) / search_block_size; -#if (CUB_PTX_ARCH == 0) - // Init textures - if (CubDebug(error = spmv_params.t_vector_x.BindTexture(spmv_params.d_vector_x))) break; -#endif + #if CUB_INCLUDE_HOST_CODE + if (CUB_IS_HOST_CODE) + 
{ + // Init textures + if (CubDebug(error = spmv_params.t_vector_x.BindTexture(spmv_params.d_vector_x))) break; + } + #endif if (search_grid_size < sm_count) // if (num_merge_tiles < spmv_sm_occupancy * sm_count) @@ -720,7 +730,9 @@ struct DispatchSpmv search_grid_size, search_block_size, (long long) stream); // Invoke spmv_search_kernel - spmv_search_kernel<<>>( + thrust::cuda_cub::launcher::triple_chevron( + search_grid_size, search_block_size, 0, stream + ).doit(spmv_search_kernel, num_merge_tiles, d_tile_coordinates, spmv_params); @@ -737,7 +749,9 @@ struct DispatchSpmv spmv_grid_size.x, spmv_grid_size.y, spmv_grid_size.z, spmv_config.block_threads, (long long) stream, spmv_config.items_per_thread, spmv_sm_occupancy); // Invoke spmv_kernel - spmv_kernel<<>>( + thrust::cuda_cub::launcher::triple_chevron( + spmv_grid_size, spmv_config.block_threads, 0, stream + ).doit(spmv_kernel, spmv_params, d_tile_coordinates, d_tile_carry_pairs, @@ -759,7 +773,10 @@ struct DispatchSpmv segment_fixup_grid_size.x, segment_fixup_grid_size.y, segment_fixup_grid_size.z, segment_fixup_config.block_threads, (long long) stream, segment_fixup_config.items_per_thread, segment_fixup_sm_occupancy); // Invoke segment_fixup_kernel - segment_fixup_kernel<<>>( + thrust::cuda_cub::launcher::triple_chevron( + segment_fixup_grid_size, segment_fixup_config.block_threads, + 0, stream + ).doit(segment_fixup_kernel, d_tile_carry_pairs, spmv_params.d_vector_y, num_merge_tiles, @@ -773,10 +790,13 @@ struct DispatchSpmv if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; } -#if (CUB_PTX_ARCH == 0) - // Free textures - if (CubDebug(error = spmv_params.t_vector_x.UnbindTexture())) break; -#endif + #if CUB_INCLUDE_HOST_CODE + if (CUB_IS_HOST_CODE) + { + // Free textures + if (CubDebug(error = spmv_params.t_vector_x.UnbindTexture())) break; + } + #endif } while (0); @@ -802,11 +822,11 @@ struct DispatchSpmv { // Get PTX version int ptx_version = 0; - #if (CUB_PTX_ARCH == 0) - if 
(CubDebug(error = PtxVersion(ptx_version))) break; - #else - ptx_version = CUB_PTX_ARCH; - #endif + if (CUB_IS_HOST_CODE) { + if (CubDebug(error = PtxVersion(ptx_version))) break; + } else { + ptx_version = CUB_PTX_ARCH; + } // Get kernel kernel dispatch configurations KernelConfig spmv_config, segment_fixup_config; diff --git a/cub/iterator/tex_obj_input_iterator.cuh b/cub/iterator/tex_obj_input_iterator.cuh index aed3c7e779..d7da4b6925 100644 --- a/cub/iterator/tex_obj_input_iterator.cuh +++ b/cub/iterator/tex_obj_input_iterator.cuh @@ -204,24 +204,32 @@ public: /// Indirection __host__ __device__ __forceinline__ reference operator*() const { -#if (CUB_PTX_ARCH == 0) - // Simply dereference the pointer on the host - return ptr[tex_offset]; -#else - // Move array of uninitialized words, then alias and assign to return value - TextureWord words[TEXTURE_MULTIPLE]; - - #pragma unroll - for (int i = 0; i < TEXTURE_MULTIPLE; ++i) - { - words[i] = tex1Dfetch( - tex_obj, - (tex_offset * TEXTURE_MULTIPLE) + i); + if (CUB_IS_HOST_CODE) { + #if CUB_INCLUDE_HOST_CODE + // Simply dereference the pointer on the host + return ptr[tex_offset]; + #endif + } else { + #if CUB_INCLUDE_DEVICE_CODE + // Move array of uninitialized words, then alias and assign to return value + TextureWord words[TEXTURE_MULTIPLE]; + + #pragma unroll + for (int i = 0; i < TEXTURE_MULTIPLE; ++i) + { + words[i] = tex1Dfetch( + tex_obj, + (tex_offset * TEXTURE_MULTIPLE) + i); + } + + // Load from words + return *reinterpret_cast(words); + #else + // This is dead code which will never be executed. It is here + // only to avoid warnings about missing return statements. 
+ return *ptr; + #endif } - - // Load from words - return *reinterpret_cast(words); -#endif } /// Addition diff --git a/cub/iterator/tex_ref_input_iterator.cuh b/cub/iterator/tex_ref_input_iterator.cuh index 1479e41a9d..9e0c6204a4 100644 --- a/cub/iterator/tex_ref_input_iterator.cuh +++ b/cub/iterator/tex_ref_input_iterator.cuh @@ -279,13 +279,19 @@ public: /// Indirection __host__ __device__ __forceinline__ reference operator*() const { -#if (CUB_PTX_ARCH == 0) - // Simply dereference the pointer on the host - return ptr[tex_offset]; -#else - // Use the texture reference - return TexId::Fetch(tex_offset); -#endif + if (CUB_IS_HOST_CODE) { + // Simply dereference the pointer on the host + return ptr[tex_offset]; + } else { + #if CUB_INCLUDE_DEVICE_CODE + // Use the texture reference + return TexId::Fetch(tex_offset); + #else + // This is dead code that will never be executed. It is here + // only to avoid warnings about missing returns. + return *ptr; + #endif + } } /// Addition diff --git a/cub/util_arch.cuh b/cub/util_arch.cuh index fb2df4fde2..d250457305 100644 --- a/cub/util_arch.cuh +++ b/cub/util_arch.cuh @@ -44,19 +44,42 @@ namespace cub { #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document -#if (__CUDACC_VER_MAJOR__ >= 9) && !defined(CUB_USE_COOPERATIVE_GROUPS) +#if ((__CUDACC_VER_MAJOR__ >= 9) || defined(__PGI_CUDA__)) && \ + !defined(CUB_USE_COOPERATIVE_GROUPS) #define CUB_USE_COOPERATIVE_GROUPS #endif /// CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host pass). #ifndef CUB_PTX_ARCH - #ifndef __CUDA_ARCH__ + #if defined(__PGI_CUDA__) + // __PGI_CUDA_ARCH__ is the minimum supported SM target. It is defined + // with the same value when compiling both host code and device code. 
+ #define CUB_PTX_ARCH __PGI_CUDA_ARCH__ + #elif !defined(__CUDA_ARCH__) #define CUB_PTX_ARCH 0 #else #define CUB_PTX_ARCH __CUDA_ARCH__ #endif #endif +#ifndef CUB_IS_DEVICE_CODE + #if defined(__PGI_CUDA__) + #define CUB_IS_DEVICE_CODE __builtin_is_device_code() + #define CUB_IS_HOST_CODE (!__builtin_is_device_code()) + #define CUB_INCLUDE_DEVICE_CODE 1 + #define CUB_INCLUDE_HOST_CODE 1 + #elif CUB_PTX_ARCH > 0 + #define CUB_IS_DEVICE_CODE 1 + #define CUB_IS_HOST_CODE 0 + #define CUB_INCLUDE_DEVICE_CODE 1 + #define CUB_INCLUDE_HOST_CODE 0 + #else + #define CUB_IS_DEVICE_CODE 0 + #define CUB_IS_HOST_CODE 1 + #define CUB_INCLUDE_DEVICE_CODE 0 + #define CUB_INCLUDE_HOST_CODE 1 + #endif +#endif /// Maximum number of devices supported. #ifndef CUB_MAX_DEVICES diff --git a/cub/util_debug.cuh b/cub/util_debug.cuh index acbc1df10e..c79b4f0953 100644 --- a/cub/util_debug.cuh +++ b/cub/util_debug.cuh @@ -82,12 +82,16 @@ __host__ __device__ __forceinline__ cudaError_t Debug( #ifdef CUB_STDERR if (error) { - #if (CUB_PTX_ARCH == 0) - fprintf(stderr, "CUDA error %d [%s, %d]: %s\n", error, filename, line, cudaGetErrorString(error)); - fflush(stderr); - #elif (CUB_PTX_ARCH >= 200) - printf("CUDA error %d [block (%d,%d,%d) thread (%d,%d,%d), %s, %d]\n", error, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, filename, line); - #endif + if (CUB_IS_HOST_CODE) { + #if CUB_INCLUDE_HOST_CODE + fprintf(stderr, "CUDA error %d [%s, %d]: %s\n", error, filename, line, cudaGetErrorString(error)); + fflush(stderr); + #endif + } else { + #if CUB_INCLUDE_DEVICE_CODE + printf("CUDA error %d [block (%d,%d,%d) thread (%d,%d,%d), %s, %d]\n", error, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, filename, line); + #endif + } } #endif return error; @@ -114,7 +118,13 @@ __host__ __device__ __forceinline__ cudaError_t Debug( * \brief Log macro for printf statements. 
*/ #if !defined(_CubLog) - #if !(defined(__clang__) && defined(__CUDA__)) + #if defined(__PGI_CUDA__) + #define _CubLog(format, ...) (__builtin_is_device_code() \ + ? printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, \ + blockIdx.z, blockIdx.y, blockIdx.x, \ + threadIdx.z, threadIdx.y, threadIdx.x, __VA_ARGS__) \ + : printf(format,__VA_ARGS__)); + #elif !(defined(__clang__) && defined(__CUDA__)) #if (CUB_PTX_ARCH == 0) #define _CubLog(format, ...) printf(format,__VA_ARGS__); #elif (CUB_PTX_ARCH >= 200) diff --git a/cub/util_device.cuh b/cub/util_device.cuh index 82b50312a5..b318a91399 100644 --- a/cub/util_device.cuh +++ b/cub/util_device.cuh @@ -208,40 +208,40 @@ CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t PtxVersionUncached(int &ptx_ver (void)(empty_kernel); -#if (CUB_PTX_ARCH == 0) // Host code. - - cudaError_t error = cudaSuccess; - cudaFuncAttributes empty_kernel_attrs; - - do { - // We do not `CubDebug` here because failure is not a hard error. - // We may be querying a device that we do not have code for but - // never use. - if (error = cudaFuncGetAttributes(&empty_kernel_attrs, empty_kernel)) { - // Clear the global CUDA error state which may have been set by - // the last call. Otherwise, errors may "leak" to unrelated - // kernel launches. - cudaGetLastError(); - break; - } - } - while(0); - - ptx_version = empty_kernel_attrs.ptxVersion * 10; - - return error; - -#else // Device code. - - // The `reinterpret_cast` is necessary to suppress a set-but-unused warnings. - // This is a meme now: https://twitter.com/blelbach/status/1222391615576100864 - (void)reinterpret_cast(empty_kernel); + cudaError_t result; + if (CUB_IS_HOST_CODE) { + #if CUB_INCLUDE_HOST_CODE + result = cudaSuccess; + cudaFuncAttributes empty_kernel_attrs; + + do { + // We do not `CubDebug` here because failure is not a hard error. + // We may be querying a device that we do not have code for but + // never use. 
+ if (result = cudaFuncGetAttributes(&empty_kernel_attrs, empty_kernel)) { + // Clear the global CUDA error state which may have been set by + // the last call. Otherwise, errors may "leak" to unrelated + // kernel launches. + cudaGetLastError(); + break; + } + } + while(0); - ptx_version = CUB_PTX_ARCH; + ptx_version = empty_kernel_attrs.ptxVersion * 10; + #endif + } else { + #if CUB_INCLUDE_DEVICE_CODE + // The `reinterpret_cast` is necessary to suppress a set-but-unused warnings. + // This is a meme now: https://twitter.com/blelbach/status/1222391615576100864 + (void)reinterpret_cast(empty_kernel); - return cudaSuccess; + ptx_version = CUB_PTX_ARCH; -#endif + result = cudaSuccess; + #endif + } + return result; } /** @@ -291,16 +291,25 @@ __host__ __forceinline__ cudaError_t PtxVersion(int &ptx_version, int device) */ CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t PtxVersion(int &ptx_version) { -#if __cplusplus >= 201103L && (CUB_PTX_ARCH == 0) // Host code and C++11. - - return PtxVersion(ptx_version, CurrentDevice()); - -#else // Device code or host code before C++11. - - // Avoid an unnecessary set/reset of the CUDA current device. - return CubDebug(PtxVersionUncached(ptx_version)); - -#endif + cudaError_t result; + if (CUB_IS_HOST_CODE) { + #if CUB_INCLUDE_HOST_CODE + #if __cplusplus >= 201103L + // Host code and C++11 + result = PtxVersion(ptx_version, CurrentDevice()); + #else + // Host code and C++98 + // Avoid an unnecessary set/reset of the CUDA current device. + result = CubDebug(PtxVersionUncached(ptx_version)); + #endif + #endif + } else { + #if CUB_INCLUDE_DEVICE_CODE + // Avoid an unnecessary set/reset of the CUDA current device. 
+ result = CubDebug(PtxVersionUncached(ptx_version)); + #endif + } + return result; } /** @@ -340,24 +349,32 @@ CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t SmVersionUncached(int &sm_versi */ CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t SmVersion(int &sm_version, int device = CurrentDevice()) { -#if __cplusplus >= 201103L && (CUB_PTX_ARCH == 0) // Host code and C++11. - - using FunctionPointer = cudaError_t(*)(int &, int); - FunctionPointer fun_ptr = SmVersionUncached; - - // C++11 guarantees that initialization of static locals is thread safe. - static const PerDeviceAttributeCache cache(fun_ptr); - - if (!CubDebug(cache.error[device])) - sm_version = cache.attribute[device]; - - return cache.error[device]; - -#else // Device code or host code before C++11. - - return SmVersionUncached(sm_version, device); - -#endif + cudaError_t result; + if (CUB_IS_HOST_CODE) { + #if CUB_INCLUDE_HOST_CODE + #if __cplusplus >= 201103L + // Host code and C++11 + using FunctionPointer = cudaError_t(*)(int &, int); + FunctionPointer fun_ptr = SmVersionUncached; + + // C++11 guarantees that initialization of static locals is thread safe. + static const PerDeviceAttributeCache cache(fun_ptr); + + if (!CubDebug(cache.error[device])) + sm_version = cache.attribute[device]; + + result = cache.error[device]; + #else + // Host code and C++98 + result = SmVersionUncached(sm_version, device); + #endif + #endif + } else { + #if CUB_INCLUDE_DEVICE_CODE + result = SmVersionUncached(sm_version, device); + #endif + } + return result; } /** @@ -365,23 +382,25 @@ CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t SmVersion(int &sm_version, int */ CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t SyncStream(cudaStream_t stream) { -#if (CUB_PTX_ARCH == 0) // Host code. - - return CubDebug(cudaStreamSynchronize(stream)); - -#elif defined(CUB_RUNTIME_ENABLED) // Device code with the CUDA runtime. 
- - (void)stream; - // Device can't yet sync on a specific stream - return CubDebug(cudaDeviceSynchronize()); - -#else // Device code without the CUDA runtime. - - (void)stream; - // CUDA API calls are not supported from this device. - return CubDebug(cudaErrorInvalidConfiguration); - -#endif + cudaError_t result; + if (CUB_IS_HOST_CODE) { + #if CUB_INCLUDE_HOST_CODE + result = CubDebug(cudaStreamSynchronize(stream)); + #endif + } else { + #if CUB_INCLUDE_DEVICE_CODE + #if defined(CUB_RUNTIME_ENABLED) // Device code with the CUDA runtime. + (void)stream; + // Device can't yet sync on a specific stream + result = CubDebug(cudaDeviceSynchronize()); + #else // Device code without the CUDA runtime. + (void)stream; + // CUDA API calls are not supported from this device. + result = CubDebug(cudaErrorInvalidConfiguration); + #endif + #endif + } + return result; } diff --git a/cub/util_ptx.cuh b/cub/util_ptx.cuh index 582ca0d8b8..3f20c11beb 100644 --- a/cub/util_ptx.cuh +++ b/cub/util_ptx.cuh @@ -90,12 +90,8 @@ __device__ __forceinline__ unsigned int SHR_ADD( unsigned int addend) { unsigned int ret; -#if CUB_PTX_ARCH >= 200 asm ("vshr.u32.u32.u32.clamp.add %0, %1, %2, %3;" : "=r"(ret) : "r"(x), "r"(shift), "r"(addend)); -#else - ret = (x >> shift) + addend; -#endif return ret; } @@ -109,12 +105,8 @@ __device__ __forceinline__ unsigned int SHL_ADD( unsigned int addend) { unsigned int ret; -#if CUB_PTX_ARCH >= 200 asm ("vshl.u32.u32.u32.clamp.add %0, %1, %2, %3;" : "=r"(ret) : "r"(x), "r"(shift), "r"(addend)); -#else - ret = (x << shift) + addend; -#endif return ret; } @@ -131,12 +123,7 @@ __device__ __forceinline__ unsigned int BFE( Int2Type /*byte_len*/) { unsigned int bits; -#if CUB_PTX_ARCH >= 200 asm ("bfe.u32 %0, %1, %2, %3;" : "=r"(bits) : "r"((unsigned int) source), "r"(bit_start), "r"(num_bits)); -#else - const unsigned int MASK = (1 << num_bits) - 1; - bits = (source >> bit_start) & MASK; -#endif return bits; } @@ -180,15 +167,8 @@ __device__ __forceinline__ void 
BFI( unsigned int bit_start, unsigned int num_bits) { -#if CUB_PTX_ARCH >= 200 asm ("bfi.b32 %0, %1, %2, %3, %4;" : "=r"(ret) : "r"(y), "r"(x), "r"(bit_start), "r"(num_bits)); -#else - x <<= bit_start; - unsigned int MASK_X = ((1 << num_bits) - 1) << bit_start; - unsigned int MASK_Y = ~MASK_X; - ret = (y & MASK_Y) | (x & MASK_X); -#endif } @@ -197,11 +177,7 @@ __device__ __forceinline__ void BFI( */ __device__ __forceinline__ unsigned int IADD3(unsigned int x, unsigned int y, unsigned int z) { -#if CUB_PTX_ARCH >= 200 asm ("vadd.u32.u32.u32.add %0, %1, %2, %3;" : "=r"(x) : "r"(x), "r"(y), "r"(z)); -#else - x = x + y + z; -#endif return x; }