diff --git a/cub/device/dispatch/dispatch_histogram.cuh b/cub/device/dispatch/dispatch_histogram.cuh
index d9740fb1cb..339b3d67b4 100644
--- a/cub/device/dispatch/dispatch_histogram.cuh
+++ b/cub/device/dispatch/dispatch_histogram.cuh
@@ -45,6 +45,8 @@
 #include "../../grid/grid_queue.cuh"
 #include "../../config.cuh"

+#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
+
 /// Optional outer namespace(s)
 CUB_NS_PREFIX

@@ -455,41 +457,46 @@ struct DipatchHistogram
         int             ptx_version,
         KernelConfig    &histogram_sweep_config)
     {
-    #if (CUB_PTX_ARCH > 0)
-
-        // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
-        return histogram_sweep_config.template Init();
-
-    #else
-
-        // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
-        if (ptx_version >= 500)
-        {
-            return histogram_sweep_config.template Init();
-        }
-        else if (ptx_version >= 350)
-        {
-            return histogram_sweep_config.template Init();
-        }
-        else if (ptx_version >= 300)
+        cudaError_t result = cudaErrorNotSupported;
+        if (CUB_IS_DEVICE_CODE)
         {
-            return histogram_sweep_config.template Init();
-        }
-        else if (ptx_version >= 200)
-        {
-            return histogram_sweep_config.template Init();
-        }
-        else if (ptx_version >= 110)
-        {
-            return histogram_sweep_config.template Init();
+            #if CUB_INCLUDE_DEVICE_CODE
+                // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
+                result = histogram_sweep_config.template Init();
+            #endif
         }
         else
         {
-            // No global atomic support
-            return cudaErrorNotSupported;
+            #if CUB_INCLUDE_HOST_CODE
+                // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
+                if (ptx_version >= 500)
+                {
+                    result = histogram_sweep_config.template Init();
+                }
+                else if (ptx_version >= 350)
+                {
+                    result = histogram_sweep_config.template Init();
+                }
+                else if (ptx_version >= 300)
+                {
+                    result = histogram_sweep_config.template Init();
+                }
+                else if (ptx_version >= 200)
+                {
+                    result = histogram_sweep_config.template Init();
+                }
+                else if (ptx_version >= 110)
+                {
+                    result = histogram_sweep_config.template Init();
+                }
+                else
+                {
+                    // No global atomic support
+                    result = cudaErrorNotSupported;
+                }
+            #endif
         }
-
-    #endif
+        return result;
     }

@@ -654,7 +661,10 @@ struct DipatchHistogram
                 histogram_init_grid_dims, histogram_init_block_threads, (long long) stream);

             // Invoke histogram_init_kernel
-            histogram_init_kernel<<<histogram_init_grid_dims, histogram_init_block_threads, 0, stream>>>(
+            thrust::cuda_cub::launcher::triple_chevron(
+                histogram_init_grid_dims, histogram_init_block_threads, 0,
+                stream
+            ).doit(histogram_init_kernel,
                 num_output_bins_wrapper,
                 d_output_histograms_wrapper,
                 tile_queue);
@@ -669,7 +679,9 @@ struct DipatchHistogram
                 histogram_sweep_config.block_threads, (long long) stream, histogram_sweep_config.pixels_per_thread, histogram_sweep_sm_occupancy);

             // Invoke histogram_sweep_kernel
-            histogram_sweep_kernel<<<sweep_grid_dims, histogram_sweep_config.block_threads, 0, stream>>>(
+            thrust::cuda_cub::launcher::triple_chevron(
+                sweep_grid_dims, histogram_sweep_config.block_threads, 0, stream
+            ).doit(histogram_sweep_kernel,
                 d_samples,
                 num_output_bins_wrapper,
                 num_privatized_bins_wrapper,
@@ -722,11 +734,7 @@ struct DipatchHistogram
         {
             // Get PTX version
             int ptx_version = 0;
-    #if (CUB_PTX_ARCH == 0)
             if (CubDebug(error = PtxVersion(ptx_version))) break;
-    #else
-            ptx_version = CUB_PTX_ARCH;
-    #endif

             // Get kernel dispatch configurations
             KernelConfig histogram_sweep_config;
@@ -830,11 +838,7 @@ struct DipatchHistogram
         {
             // Get PTX version
             int ptx_version = 0;
-    #if (CUB_PTX_ARCH == 0)
             if (CubDebug(error = PtxVersion(ptx_version))) break;
-    #else
-            ptx_version = CUB_PTX_ARCH;
-    #endif

             // Get kernel dispatch configurations
             KernelConfig histogram_sweep_config;
@@ -913,11 +917,7 @@ struct DipatchHistogram
         {
             // Get PTX version
             int ptx_version = 0;
-    #if (CUB_PTX_ARCH == 0)
             if (CubDebug(error = PtxVersion(ptx_version))) break;
-    #else
-            ptx_version = CUB_PTX_ARCH;
-    #endif

             // Get kernel dispatch configurations
             KernelConfig histogram_sweep_config;
@@ -1025,11 +1025,7 @@ struct DipatchHistogram
         {
             // Get PTX version
             int ptx_version = 0;
-    #if (CUB_PTX_ARCH == 0)
             if (CubDebug(error = PtxVersion(ptx_version))) break;
-    #else
-            ptx_version = CUB_PTX_ARCH;
-    #endif

             // Get kernel dispatch configurations
             KernelConfig histogram_sweep_config;
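The `InitConfigs` rewrite above establishes the pattern every dispatch file in this patch follows: the host/device split moves from the preprocessor into a run-time `if` on `CUB_IS_DEVICE_CODE`, while each branch body stays wrapped in `#if CUB_INCLUDE_DEVICE_CODE` / `#if CUB_INCLUDE_HOST_CODE` so a compilation pass that cannot run one side never has to emit it. A minimal sketch of that shape; `Config`, `InitDevicePolicy`, and `InitHostPolicy` are hypothetical stand-ins for the `KernelConfig::Init` calls, not part of the patch:

    template <typename Config>
    __host__ __device__ cudaError_t InitConfigsShape(int ptx_version, Config &config)
    {
        cudaError_t result = cudaErrorNotSupported; // sentinel if neither body is compiled in
        if (CUB_IS_DEVICE_CODE)
        {
            #if CUB_INCLUDE_DEVICE_CODE
                result = config.InitDevicePolicy();          // current PTX policy
            #endif
        }
        else
        {
            #if CUB_INCLUDE_HOST_CODE
                result = config.InitHostPolicy(ptx_version); // policy matching the device
            #endif
        }
        return result;
    }

Under nvcc the condition is a compile-time constant, so each pass still dead-strips the other branch; under NVC++ both bodies are compiled and the branch resolves via `__builtin_is_device_code()`.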
diff --git a/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/device/dispatch/dispatch_radix_sort.cuh
index 1837ddc198..2b0919fa1c 100644
--- a/cub/device/dispatch/dispatch_radix_sort.cuh
+++ b/cub/device/dispatch/dispatch_radix_sort.cuh
@@ -47,6 +47,8 @@
 #include "../../util_debug.cuh"
 #include "../../util_device.cuh"

+#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
+
 /// Optional outer namespace(s)
 CUB_NS_PREFIX

@@ -922,7 +924,9 @@ struct DispatchRadixSort :
                 ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD, 1, begin_bit, ActivePolicyT::SingleTilePolicy::RADIX_BITS);

             // Invoke upsweep_kernel with same grid size as downsweep_kernel
-            single_tile_kernel<<<1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream>>>(
+            thrust::cuda_cub::launcher::triple_chevron(
+                1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream
+            ).doit(single_tile_kernel,
                 d_keys.Current(),
                 d_keys.Alternate(),
                 d_values.Current(),
@@ -980,7 +984,10 @@ struct DispatchRadixSort :
                 pass_config.upsweep_config.items_per_thread, pass_config.upsweep_config.sm_occupancy, current_bit, pass_bits);

             // Invoke upsweep_kernel with same grid size as downsweep_kernel
-            pass_config.upsweep_kernel<<<pass_config.even_share.grid_size, pass_config.upsweep_config.block_threads, 0, stream>>>(
+            thrust::cuda_cub::launcher::triple_chevron(
+                pass_config.even_share.grid_size,
+                pass_config.upsweep_config.block_threads, 0, stream
+            ).doit(pass_config.upsweep_kernel,
                 d_keys_in,
                 d_spine,
                 num_items,
@@ -999,7 +1006,9 @@ struct DispatchRadixSort :
                 1, pass_config.scan_config.block_threads, (long long) stream, pass_config.scan_config.items_per_thread);

             // Invoke scan_kernel
-            pass_config.scan_kernel<<<1, pass_config.scan_config.block_threads, 0, stream>>>(
+            thrust::cuda_cub::launcher::triple_chevron(
+                1, pass_config.scan_config.block_threads, 0, stream
+            ).doit(pass_config.scan_kernel,
                 d_spine,
                 spine_length);

@@ -1015,7 +1024,10 @@ struct DispatchRadixSort :
                 pass_config.downsweep_config.items_per_thread, pass_config.downsweep_config.sm_occupancy);

             // Invoke downsweep_kernel
-            pass_config.downsweep_kernel<<<pass_config.even_share.grid_size, pass_config.downsweep_config.block_threads, 0, stream>>>(
+            thrust::cuda_cub::launcher::triple_chevron(
+                pass_config.even_share.grid_size,
+                pass_config.downsweep_config.block_threads, 0, stream
+            ).doit(pass_config.downsweep_kernel,
                 d_keys_in,
                 d_keys_out,
                 d_values_in,
@@ -1429,7 +1441,10 @@ struct DispatchSegmentedRadixSort :
                     pass_bits);
             }

-            pass_config.segmented_kernel<<<num_segments, pass_config.segmented_config.block_threads, 0, stream>>>(
+            thrust::cuda_cub::launcher::triple_chevron(
+                num_segments, pass_config.segmented_config.block_threads, 0,
+                stream
+            ).doit(pass_config.segmented_kernel,
                 d_keys_in, d_keys_out,
                 d_values_in, d_values_out,
                 d_begin_offsets, d_end_offsets, num_segments,
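Every kernel launch in the patch is rewritten the same mechanical way: the four launch parameters keep their order but move into the `triple_chevron` constructor, and the kernel itself becomes the first argument of `doit()`, followed by the original kernel arguments. A before/after sketch with a hypothetical `my_kernel` and placeholder arguments:

    // Before (triple-chevron syntax, which not every CUDA front end accepts):
    //     my_kernel<<<grid_dims, block_threads, smem_bytes, stream>>>(arg0, arg1);
    // After:
    thrust::cuda_cub::launcher::triple_chevron(grid_dims, block_threads, smem_bytes, stream)
        .doit(my_kernel, arg0, arg1);

Because `doit()` is an ordinary variadic function call, the launch compiles under any CUDA compiler that ships Thrust, which is what makes the NVC++ build possible.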
"../../util_debug.cuh" #include "../../util_device.cuh" +#include + /// Optional outer namespace(s) CUB_NS_PREFIX @@ -450,7 +452,9 @@ struct DispatchReduce : ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD); // Invoke single_reduce_sweep_kernel - single_tile_kernel<<<1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream>>>( + thrust::cuda_cub::launcher::triple_chevron( + 1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream + ).doit(single_tile_kernel, d_in, d_out, num_items, @@ -544,7 +548,10 @@ struct DispatchReduce : reduce_config.sm_occupancy); // Invoke DeviceReduceKernel - reduce_kernel<<>>( + thrust::cuda_cub::launcher::triple_chevron( + reduce_grid_size, ActivePolicyT::ReducePolicy::BLOCK_THREADS, + 0, stream + ).doit(reduce_kernel, d_in, d_block_reductions, num_items, @@ -564,7 +571,9 @@ struct DispatchReduce : ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD); // Invoke DeviceReduceSingleTileKernel - single_tile_kernel<<<1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream>>>( + thrust::cuda_cub::launcher::triple_chevron( + 1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream + ).doit(single_tile_kernel, d_block_reductions, d_out, reduce_grid_size, @@ -778,7 +787,10 @@ struct DispatchSegmentedReduce : segmented_reduce_config.sm_occupancy); // Invoke DeviceReduceKernel - segmented_reduce_kernel<<>>( + thrust::cuda_cub::launcher::triple_chevron( + num_segments, + ActivePolicyT::SegmentedReducePolicy::BLOCK_THREADS, 0, stream + ).doit(segmented_reduce_kernel, d_in, d_out, d_begin_offsets, diff --git a/cub/device/dispatch/dispatch_reduce_by_key.cuh b/cub/device/dispatch/dispatch_reduce_by_key.cuh index 9d826c928f..d8d8dcac41 100644 --- a/cub/device/dispatch/dispatch_reduce_by_key.cuh +++ b/cub/device/dispatch/dispatch_reduce_by_key.cuh @@ -44,6 +44,8 @@ #include "../../grid/grid_queue.cuh" #include "../../util_device.cuh" +#include + /// Optional outer namespace(s) CUB_NS_PREFIX @@ -284,37 +286,40 @@ struct DispatchReduceByKey int ptx_version, KernelConfig &reduce_by_key_config) { - #if (CUB_PTX_ARCH > 0) - (void)ptx_version; - - // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy - reduce_by_key_config.template Init(); - - #else - - // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version - if (ptx_version >= 350) - { - reduce_by_key_config.template Init(); - } - else if (ptx_version >= 300) + if (CUB_IS_DEVICE_CODE) { - reduce_by_key_config.template Init(); - } - else if (ptx_version >= 200) - { - reduce_by_key_config.template Init(); - } - else if (ptx_version >= 130) - { - reduce_by_key_config.template Init(); + #if CUB_INCLUDE_DEVICE_CODE + (void)ptx_version; + // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy + reduce_by_key_config.template Init(); + #endif } else { - reduce_by_key_config.template Init(); + #if CUB_INCLUDE_HOST_CODE + // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version + if (ptx_version >= 350) + { + reduce_by_key_config.template Init(); + } + else if (ptx_version >= 300) + { + reduce_by_key_config.template Init(); + } + else if (ptx_version >= 200) + { + reduce_by_key_config.template Init(); + } + else if (ptx_version >= 130) + { + reduce_by_key_config.template Init(); + } + else + { + reduce_by_key_config.template Init(); + } + #endif } - - #endif } @@ -428,7 +433,9 @@ 
diff --git a/cub/device/dispatch/dispatch_reduce_by_key.cuh b/cub/device/dispatch/dispatch_reduce_by_key.cuh
index 9d826c928f..d8d8dcac41 100644
--- a/cub/device/dispatch/dispatch_reduce_by_key.cuh
+++ b/cub/device/dispatch/dispatch_reduce_by_key.cuh
@@ -44,6 +44,8 @@
 #include "../../grid/grid_queue.cuh"
 #include "../../util_device.cuh"

+#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
+
 /// Optional outer namespace(s)
 CUB_NS_PREFIX

@@ -284,37 +286,40 @@ struct DispatchReduceByKey
         int             ptx_version,
         KernelConfig    &reduce_by_key_config)
     {
-    #if (CUB_PTX_ARCH > 0)
-        (void)ptx_version;
-
-        // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
-        reduce_by_key_config.template Init();
-
-    #else
-
-        // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
-        if (ptx_version >= 350)
-        {
-            reduce_by_key_config.template Init();
-        }
-        else if (ptx_version >= 300)
+        if (CUB_IS_DEVICE_CODE)
         {
-            reduce_by_key_config.template Init();
-        }
-        else if (ptx_version >= 200)
-        {
-            reduce_by_key_config.template Init();
-        }
-        else if (ptx_version >= 130)
-        {
-            reduce_by_key_config.template Init();
+            #if CUB_INCLUDE_DEVICE_CODE
+                (void)ptx_version;
+                // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
+                reduce_by_key_config.template Init();
+            #endif
         }
         else
         {
-            reduce_by_key_config.template Init();
+            #if CUB_INCLUDE_HOST_CODE
+                // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
+                if (ptx_version >= 350)
+                {
+                    reduce_by_key_config.template Init();
+                }
+                else if (ptx_version >= 300)
+                {
+                    reduce_by_key_config.template Init();
+                }
+                else if (ptx_version >= 200)
+                {
+                    reduce_by_key_config.template Init();
+                }
+                else if (ptx_version >= 130)
+                {
+                    reduce_by_key_config.template Init();
+                }
+                else
+                {
+                    reduce_by_key_config.template Init();
+                }
+            #endif
         }
-
-    #endif
     }

@@ -428,7 +433,9 @@ struct DispatchReduceByKey
             if (debug_synchronous) _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);

             // Invoke init_kernel to initialize tile descriptors
-            init_kernel<<<init_grid_size, INIT_KERNEL_THREADS, 0, stream>>>(
+            thrust::cuda_cub::launcher::triple_chevron(
+                init_grid_size, INIT_KERNEL_THREADS, 0, stream
+            ).doit(init_kernel,
                 tile_state,
                 num_tiles,
                 d_num_runs_out);
@@ -463,7 +470,10 @@ struct DispatchReduceByKey
                 start_tile, scan_grid_size, reduce_by_key_config.block_threads, (long long) stream, reduce_by_key_config.items_per_thread, reduce_by_key_sm_occupancy);

             // Invoke reduce_by_key_kernel
-            reduce_by_key_kernel<<<scan_grid_size, reduce_by_key_config.block_threads, 0, stream>>>(
+            thrust::cuda_cub::launcher::triple_chevron(
+                scan_grid_size, reduce_by_key_config.block_threads, 0,
+                stream
+            ).doit(reduce_by_key_kernel,
                 d_keys_in,
                 d_unique_out,
                 d_values_in,
@@ -513,11 +523,7 @@ struct DispatchReduceByKey
         {
             // Get PTX version
             int ptx_version = 0;
-    #if (CUB_PTX_ARCH == 0)
             if (CubDebug(error = PtxVersion(ptx_version))) break;
-    #else
-            ptx_version = CUB_PTX_ARCH;
-    #endif

             // Get kernel kernel dispatch configurations
             KernelConfig reduce_by_key_config;
diff --git a/cub/device/dispatch/dispatch_rle.cuh b/cub/device/dispatch/dispatch_rle.cuh
index fa1a86f7cf..b68f166dec 100644
--- a/cub/device/dispatch/dispatch_rle.cuh
+++ b/cub/device/dispatch/dispatch_rle.cuh
@@ -44,6 +44,8 @@
 #include "../../grid/grid_queue.cuh"
 #include "../../util_device.cuh"

+#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
+
 /// Optional outer namespace(s)
 CUB_NS_PREFIX

@@ -274,36 +276,38 @@ struct DeviceRleDispatch
         int             ptx_version,
         KernelConfig&   device_rle_config)
     {
-    #if (CUB_PTX_ARCH > 0)
-
-        // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
-        device_rle_config.template Init();
-
-    #else
-
-        // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
-        if (ptx_version >= 350)
-        {
-            device_rle_config.template Init();
-        }
-        else if (ptx_version >= 300)
-        {
-            device_rle_config.template Init();
-        }
-        else if (ptx_version >= 200)
-        {
-            device_rle_config.template Init();
-        }
-        else if (ptx_version >= 130)
-        {
-            device_rle_config.template Init();
+        if (CUB_IS_DEVICE_CODE) {
+            #if CUB_INCLUDE_DEVICE_CODE
+                // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
+                device_rle_config.template Init();
+            #endif
         }
         else
         {
-            device_rle_config.template Init();
+            #if CUB_INCLUDE_HOST_CODE
+                // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
+                if (ptx_version >= 350)
+                {
+                    device_rle_config.template Init();
+                }
+                else if (ptx_version >= 300)
+                {
+                    device_rle_config.template Init();
+                }
+                else if (ptx_version >= 200)
+                {
+                    device_rle_config.template Init();
+                }
+                else if (ptx_version >= 130)
+                {
+                    device_rle_config.template Init();
+                }
+                else
+                {
+                    device_rle_config.template Init();
+                }
+            #endif
         }
-
-    #endif
     }

@@ -415,7 +419,9 @@ struct DeviceRleDispatch
             if (debug_synchronous) _CubLog("Invoking device_scan_init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);

             // Invoke device_scan_init_kernel to initialize tile descriptors and queue descriptors
-            device_scan_init_kernel<<<init_grid_size, INIT_KERNEL_THREADS, 0, stream>>>(
+            thrust::cuda_cub::launcher::triple_chevron(
+                init_grid_size, INIT_KERNEL_THREADS, 0, stream
+            ).doit(device_scan_init_kernel,
                 tile_status,
                 num_tiles,
                 d_num_runs_out);
@@ -452,7 +458,9 @@ struct DeviceRleDispatch
                 scan_grid_size.x, scan_grid_size.y, scan_grid_size.z, device_rle_config.block_threads, (long long) stream, device_rle_config.items_per_thread, device_rle_kernel_sm_occupancy);

             // Invoke device_rle_sweep_kernel
-            device_rle_sweep_kernel<<<scan_grid_size, device_rle_config.block_threads, 0, stream>>>(
+            thrust::cuda_cub::launcher::triple_chevron(
+                scan_grid_size, device_rle_config.block_threads, 0, stream
+            ).doit(device_rle_sweep_kernel,
                 d_in,
                 d_offsets_out,
                 d_lengths_out,
@@ -498,11 +506,7 @@ struct DeviceRleDispatch
         {
             // Get PTX version
             int ptx_version = 0;
-    #if (CUB_PTX_ARCH == 0)
             if (CubDebug(error = PtxVersion(ptx_version))) break;
-    #else
-            ptx_version = CUB_PTX_ARCH;
-    #endif

             // Get kernel kernel dispatch configurations
             KernelConfig device_rle_config;
diff --git a/cub/device/dispatch/dispatch_scan.cuh b/cub/device/dispatch/dispatch_scan.cuh
index 667c631cd0..24b30f102c 100644
--- a/cub/device/dispatch/dispatch_scan.cuh
+++ b/cub/device/dispatch/dispatch_scan.cuh
@@ -44,6 +44,8 @@
 #include "../../util_debug.cuh"
 #include "../../util_device.cuh"

+#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
+
 /// Optional outer namespace(s)
 CUB_NS_PREFIX

@@ -367,7 +369,9 @@ struct DispatchScan:
             if (debug_synchronous) _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);

             // Invoke init_kernel to initialize tile descriptors
-            init_kernel<<<init_grid_size, INIT_KERNEL_THREADS, 0, stream>>>(
+            thrust::cuda_cub::launcher::triple_chevron(
+                init_grid_size, INIT_KERNEL_THREADS, 0, stream
+            ).doit(init_kernel,
                 tile_state,
                 num_tiles);

@@ -398,7 +402,9 @@ struct DispatchScan:
                 start_tile, scan_grid_size, Policy::BLOCK_THREADS, (long long) stream, Policy::ITEMS_PER_THREAD, scan_sm_occupancy);

             // Invoke scan_kernel
-            scan_kernel<<<scan_grid_size, Policy::BLOCK_THREADS, 0, stream>>>(
+            thrust::cuda_cub::launcher::triple_chevron(
+                scan_grid_size, Policy::BLOCK_THREADS, 0, stream
+            ).doit(scan_kernel,
                 d_in,
                 d_out,
                 tile_state,
diff --git a/cub/device/dispatch/dispatch_select_if.cuh b/cub/device/dispatch/dispatch_select_if.cuh
index ad80fb69b4..5fec4cff72 100644
--- a/cub/device/dispatch/dispatch_select_if.cuh
+++ b/cub/device/dispatch/dispatch_select_if.cuh
@@ -44,6 +44,8 @@
 #include "../../grid/grid_queue.cuh"
 #include "../../util_device.cuh"

+#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
+
 /// Optional outer namespace(s)
 CUB_NS_PREFIX

@@ -275,37 +277,39 @@ struct DispatchSelectIf
         int             ptx_version,
         KernelConfig    &select_if_config)
     {
-    #if (CUB_PTX_ARCH > 0)
-        (void)ptx_version;
-
-        // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
-        select_if_config.template Init();
-
-    #else
-
-        // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
-        if (ptx_version >= 350)
-        {
-            select_if_config.template Init();
-        }
-        else if (ptx_version >= 300)
-        {
-            select_if_config.template Init();
-        }
-        else if (ptx_version >= 200)
-        {
-            select_if_config.template Init();
-        }
-        else if (ptx_version >= 130)
-        {
-            select_if_config.template Init();
+        if (CUB_IS_DEVICE_CODE) {
+            #if CUB_INCLUDE_DEVICE_CODE
+                (void)ptx_version;
+                // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
+                select_if_config.template Init();
+            #endif
         }
         else
         {
-            select_if_config.template Init();
+            #if CUB_INCLUDE_HOST_CODE
+                // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
+                if (ptx_version >= 350)
+                {
+                    select_if_config.template Init();
+                }
+                else if (ptx_version >= 300)
+                {
+                    select_if_config.template Init();
+                }
+                else if (ptx_version >= 200)
+                {
+                    select_if_config.template Init();
+                }
+                else if (ptx_version >= 130)
+                {
+                    select_if_config.template Init();
+                }
+                else
+                {
+                    select_if_config.template Init();
+                }
+            #endif
         }
-
-    #endif
     }

@@ -417,7 +421,9 @@ struct DispatchSelectIf
             if (debug_synchronous) _CubLog("Invoking scan_init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);

             // Invoke scan_init_kernel to initialize tile descriptors
-            scan_init_kernel<<<init_grid_size, INIT_KERNEL_THREADS, 0, stream>>>(
+            thrust::cuda_cub::launcher::triple_chevron(
+                init_grid_size, INIT_KERNEL_THREADS, 0, stream
+            ).doit(scan_init_kernel,
                 tile_status,
                 num_tiles,
                 d_num_selected_out);
@@ -454,7 +460,9 @@ struct DispatchSelectIf
                 scan_grid_size.x, scan_grid_size.y, scan_grid_size.z, select_if_config.block_threads, (long long) stream, select_if_config.items_per_thread, range_select_sm_occupancy);

             // Invoke select_if_kernel
-            select_if_kernel<<<scan_grid_size, select_if_config.block_threads, 0, stream>>>(
+            thrust::cuda_cub::launcher::triple_chevron(
+                scan_grid_size, select_if_config.block_threads, 0, stream
+            ).doit(select_if_kernel,
                 d_in,
                 d_flags,
                 d_selected_out,
@@ -501,11 +509,7 @@ struct DispatchSelectIf
         {
             // Get PTX version
             int ptx_version = 0;
-    #if (CUB_PTX_ARCH == 0)
             if (CubDebug(error = PtxVersion(ptx_version))) break;
-    #else
-            ptx_version = CUB_PTX_ARCH;
-    #endif

             // Get kernel kernel dispatch configurations
             KernelConfig select_if_config;
diff --git a/cub/device/dispatch/dispatch_spmv_orig.cuh b/cub/device/dispatch/dispatch_spmv_orig.cuh
index 71d67c5f2e..4b7c5e0cf3 100644
--- a/cub/device/dispatch/dispatch_spmv_orig.cuh
+++ b/cub/device/dispatch/dispatch_spmv_orig.cuh
@@ -47,6 +47,8 @@
 #include "../../grid/grid_queue.cuh"
 #include "../../config.cuh"

+#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
+
 /// Optional outer namespace(s)
 CUB_NS_PREFIX

@@ -287,7 +289,7 @@ struct DispatchSpmv
     };

     /// SM20
-    struct Policy200
+    struct Policy200
     {
         typedef AgentSpmvPolicy<
                 96,
@@ -314,7 +316,7 @@ struct DispatchSpmv


     /// SM30
-    struct Policy300
+    struct Policy300
     {
         typedef AgentSpmvPolicy<
                 96,
@@ -376,7 +378,7 @@ struct DispatchSpmv
                 LOAD_LDG,
                 LOAD_LDG,
                 LOAD_LDG,
-                false,
+                false,
                 BLOCK_SCAN_WARP_SCANS>
             SpmvPolicyT;

@@ -488,53 +490,55 @@ struct DispatchSpmv
         KernelConfig    &spmv_config,
         KernelConfig    &segment_fixup_config)
     {
-    #if (CUB_PTX_ARCH > 0)
-
-        // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
-        spmv_config.template Init();
-        segment_fixup_config.template Init();
-
-    #else
-
-        // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
-        if (ptx_version >= 600)
+        if (CUB_IS_DEVICE_CODE)
         {
-            spmv_config.template Init();
-            segment_fixup_config.template Init();
-        }
-        else if (ptx_version >= 500)
-        {
-            spmv_config.template Init();
-            segment_fixup_config.template Init();
-        }
-        else if (ptx_version >= 370)
-        {
-            spmv_config.template Init();
-            segment_fixup_config.template Init();
-        }
-        else if (ptx_version >= 350)
-        {
-            spmv_config.template Init();
-            segment_fixup_config.template Init();
-        }
-        else if (ptx_version >= 300)
-        {
-            spmv_config.template Init();
-            segment_fixup_config.template Init();
-
-        }
-        else if (ptx_version >= 200)
-        {
-            spmv_config.template Init();
-            segment_fixup_config.template Init();
+            #if CUB_INCLUDE_DEVICE_CODE
+                // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
+                spmv_config.template Init();
+                segment_fixup_config.template Init();
+            #endif
         }
         else
         {
-            spmv_config.template Init();
-            segment_fixup_config.template Init();
+            #if CUB_INCLUDE_HOST_CODE
+                // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
+                if (ptx_version >= 600)
+                {
+                    spmv_config.template Init();
+                    segment_fixup_config.template Init();
+                }
+                else if (ptx_version >= 500)
+                {
+                    spmv_config.template Init();
+                    segment_fixup_config.template Init();
+                }
+                else if (ptx_version >= 370)
+                {
+                    spmv_config.template Init();
+                    segment_fixup_config.template Init();
+                }
+                else if (ptx_version >= 350)
+                {
+                    spmv_config.template Init();
+                    segment_fixup_config.template Init();
+                }
+                else if (ptx_version >= 300)
+                {
+                    spmv_config.template Init();
+                    segment_fixup_config.template Init();
+                }
+                else if (ptx_version >= 200)
+                {
+                    spmv_config.template Init();
+                    segment_fixup_config.template Init();
+                }
+                else
+                {
+                    spmv_config.template Init();
+                    segment_fixup_config.template Init();
+                }
+            #endif
         }
-
-    #endif
     }

@@ -614,7 +618,10 @@ struct DispatchSpmv
                 degen_col_kernel_grid_size, degen_col_kernel_block_size, (long long) stream);

             // Invoke spmv_search_kernel
-            spmv_1col_kernel<<<degen_col_kernel_grid_size, degen_col_kernel_block_size, 0, stream>>>(
+            thrust::cuda_cub::launcher::triple_chevron(
+                degen_col_kernel_grid_size, degen_col_kernel_block_size, 0,
+                stream
+            ).doit(spmv_1col_kernel,
                 spmv_params);

             // Check for failure to launch
@@ -700,10 +707,13 @@ struct DispatchSpmv
             int search_block_size   = INIT_KERNEL_THREADS;
             int search_grid_size    = (num_merge_tiles + 1 + search_block_size - 1) / search_block_size;

-#if (CUB_PTX_ARCH == 0)
-            // Init textures
-            if (CubDebug(error = spmv_params.t_vector_x.BindTexture(spmv_params.d_vector_x))) break;
-#endif
+            #if CUB_INCLUDE_HOST_CODE
+                if (CUB_IS_HOST_CODE)
+                {
+                    // Init textures
+                    if (CubDebug(error = spmv_params.t_vector_x.BindTexture(spmv_params.d_vector_x))) break;
+                }
+            #endif

             if (search_grid_size < sm_count)
 //            if (num_merge_tiles < spmv_sm_occupancy * sm_count)
@@ -720,7 +730,9 @@ struct DispatchSpmv
                     search_grid_size, search_block_size, (long long) stream);

                 // Invoke spmv_search_kernel
-                spmv_search_kernel<<<search_grid_size, search_block_size, 0, stream>>>(
+                thrust::cuda_cub::launcher::triple_chevron(
+                    search_grid_size, search_block_size, 0, stream
+                ).doit(spmv_search_kernel,
                     num_merge_tiles,
                     d_tile_coordinates,
                     spmv_params);
@@ -737,7 +749,9 @@ struct DispatchSpmv
                 spmv_grid_size.x, spmv_grid_size.y, spmv_grid_size.z, spmv_config.block_threads, (long long) stream, spmv_config.items_per_thread, spmv_sm_occupancy);

             // Invoke spmv_kernel
-            spmv_kernel<<<spmv_grid_size, spmv_config.block_threads, 0, stream>>>(
+            thrust::cuda_cub::launcher::triple_chevron(
+                spmv_grid_size, spmv_config.block_threads, 0, stream
+            ).doit(spmv_kernel,
                 spmv_params,
                 d_tile_coordinates,
                 d_tile_carry_pairs,
@@ -759,7 +773,10 @@ struct DispatchSpmv
                 segment_fixup_grid_size.x, segment_fixup_grid_size.y, segment_fixup_grid_size.z, segment_fixup_config.block_threads, (long long) stream, segment_fixup_config.items_per_thread, segment_fixup_sm_occupancy);

             // Invoke segment_fixup_kernel
-            segment_fixup_kernel<<<segment_fixup_grid_size, segment_fixup_config.block_threads, 0, stream>>>(
+            thrust::cuda_cub::launcher::triple_chevron(
+                segment_fixup_grid_size, segment_fixup_config.block_threads,
+                0, stream
+            ).doit(segment_fixup_kernel,
                 d_tile_carry_pairs,
                 spmv_params.d_vector_y,
                 num_merge_tiles,
@@ -773,10 +790,13 @@ struct DispatchSpmv
                 if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
             }

-#if (CUB_PTX_ARCH == 0)
-            // Free textures
-            if (CubDebug(error = spmv_params.t_vector_x.UnbindTexture())) break;
-#endif
+            #if CUB_INCLUDE_HOST_CODE
+                if (CUB_IS_HOST_CODE)
+                {
+                    // Free textures
+                    if (CubDebug(error = spmv_params.t_vector_x.UnbindTexture())) break;
+                }
+            #endif
         }
         while (0);

@@ -802,11 +822,7 @@ struct DispatchSpmv
         {
             // Get PTX version
             int ptx_version = 0;
-    #if (CUB_PTX_ARCH == 0)
             if (CubDebug(error = PtxVersion(ptx_version))) break;
-    #else
-            ptx_version = CUB_PTX_ARCH;
-    #endif

             // Get kernel kernel dispatch configurations
             KernelConfig spmv_config, segment_fixup_config;
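Note that the two texture bind/unbind sites above invert the usual nesting: `#if CUB_INCLUDE_HOST_CODE` is outermost because `BindTexture`/`UnbindTexture` are host-only calls that must be preprocessed away from a device-only pass entirely, and the `CUB_IS_HOST_CODE` test inside keeps the single-pass NVC++ build, where both sides are always included, from reaching them during device execution. The shape, with a hypothetical `host_only_call` standing in for the texture members:

    #if CUB_INCLUDE_HOST_CODE       // strip from device-only compilation passes
        if (CUB_IS_HOST_CODE)       // and skip at run time under single-pass compilers
        {
            error = host_only_call();
        }
    #endif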
diff --git a/cub/grid/grid_queue.cuh b/cub/grid/grid_queue.cuh
index 3d09e6e494..6b5f676b03 100644
--- a/cub/grid/grid_queue.cuh
+++ b/cub/grid/grid_queue.cuh
@@ -1,7 +1,7 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
  * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- *
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *     * Redistributions of source code must retain the above copyright
@@ -12,7 +12,7 @@
  *     * Neither the name of the NVIDIA CORPORATION nor the
  *       names of its contributors may be used to endorse or promote products
  *       derived from this software without specific prior written permission.
- *
+ *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
@@ -123,43 +123,61 @@ public:
         OffsetT fill_size,
         cudaStream_t stream = 0)
     {
-#if (CUB_PTX_ARCH > 0)
-        (void)stream;
-        d_counters[FILL] = fill_size;
-        d_counters[DRAIN] = 0;
-        return cudaSuccess;
-#else
-        OffsetT counters[2];
-        counters[FILL] = fill_size;
-        counters[DRAIN] = 0;
-        return CubDebug(cudaMemcpyAsync(d_counters, counters, sizeof(OffsetT) * 2, cudaMemcpyHostToDevice, stream));
-#endif
+        cudaError_t result = cudaErrorUnknown;
+        if (CUB_IS_DEVICE_CODE) {
+            #if CUB_INCLUDE_DEVICE_CODE
+                (void)stream;
+                d_counters[FILL] = fill_size;
+                d_counters[DRAIN] = 0;
+                result = cudaSuccess;
+            #endif
+        } else {
+            #if CUB_INCLUDE_HOST_CODE
+                OffsetT counters[2];
+                counters[FILL] = fill_size;
+                counters[DRAIN] = 0;
+                result = CubDebug(cudaMemcpyAsync(d_counters, counters, sizeof(OffsetT) * 2, cudaMemcpyHostToDevice, stream));
+            #endif
+        }
+        return result;
     }


     /// This operation resets the drain so that it may advance to meet the existing fill-size.  To be called by the host or by a kernel prior to that which will be draining.
     __host__ __device__ __forceinline__ cudaError_t ResetDrain(cudaStream_t stream = 0)
     {
-#if (CUB_PTX_ARCH > 0)
-        (void)stream;
-        d_counters[DRAIN] = 0;
-        return cudaSuccess;
-#else
-        return CubDebug(cudaMemsetAsync(d_counters + DRAIN, 0, sizeof(OffsetT), stream));
-#endif
+        cudaError_t result = cudaErrorUnknown;
+        if (CUB_IS_DEVICE_CODE) {
+            #if CUB_INCLUDE_DEVICE_CODE
+                (void)stream;
+                d_counters[DRAIN] = 0;
+                result = cudaSuccess;
+            #endif
+        } else {
+            #if CUB_INCLUDE_HOST_CODE
+                result = CubDebug(cudaMemsetAsync(d_counters + DRAIN, 0, sizeof(OffsetT), stream));
+            #endif
+        }
+        return result;
     }


     /// This operation resets the fill counter.  To be called by the host or by a kernel prior to that which will be filling.
     __host__ __device__ __forceinline__ cudaError_t ResetFill(cudaStream_t stream = 0)
     {
-#if (CUB_PTX_ARCH > 0)
-        (void)stream;
-        d_counters[FILL] = 0;
-        return cudaSuccess;
-#else
-        return CubDebug(cudaMemsetAsync(d_counters + FILL, 0, sizeof(OffsetT), stream));
-#endif
+        cudaError_t result = cudaErrorUnknown;
+        if (CUB_IS_DEVICE_CODE) {
+            #if CUB_INCLUDE_DEVICE_CODE
+                (void)stream;
+                d_counters[FILL] = 0;
+                result = cudaSuccess;
+            #endif
+        } else {
+            #if CUB_INCLUDE_HOST_CODE
+                result = CubDebug(cudaMemsetAsync(d_counters + FILL, 0, sizeof(OffsetT), stream));
+            #endif
+        }
+        return result;
     }


@@ -168,13 +186,19 @@ public:
         OffsetT &fill_size,
         cudaStream_t stream = 0)
     {
-#if (CUB_PTX_ARCH > 0)
-        (void)stream;
-        fill_size = d_counters[FILL];
-        return cudaSuccess;
-#else
-        return CubDebug(cudaMemcpyAsync(&fill_size, d_counters + FILL, sizeof(OffsetT), cudaMemcpyDeviceToHost, stream));
-#endif
+        cudaError_t result = cudaErrorUnknown;
+        if (CUB_IS_DEVICE_CODE) {
+            #if CUB_INCLUDE_DEVICE_CODE
+                (void)stream;
+                fill_size = d_counters[FILL];
+                result = cudaSuccess;
+            #endif
+        } else {
+            #if CUB_INCLUDE_HOST_CODE
+                result = CubDebug(cudaMemcpyAsync(&fill_size, d_counters + FILL, sizeof(OffsetT), cudaMemcpyDeviceToHost, stream));
+            #endif
+        }
+        return result;
     }
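The `cudaErrorUnknown` initializer in these `GridQueue` methods is deliberate: if a build configuration compiled neither guarded body, the call would report an error instead of returning an uninitialized value. Host-side usage is unchanged; a sketch, assuming the caller owns a device allocation of `GridQueue<int>::AllocationSize()` bytes:

    // Sketch: host-side priming of a GridQueue after the rewrite.
    cudaError_t PrimeQueue(void *d_storage, int num_items, cudaStream_t stream)
    {
        cub::GridQueue<int> queue(d_storage);
        // Host path: one cudaMemcpyAsync covering both counters. In device
        // code the same call writes d_counters[FILL] and d_counters[DRAIN]
        // directly.
        return queue.FillAndResetDrain(num_items, stream);
    }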
diff --git a/cub/iterator/tex_obj_input_iterator.cuh b/cub/iterator/tex_obj_input_iterator.cuh
index 60c7aa0adc..aae069edfa 100644
--- a/cub/iterator/tex_obj_input_iterator.cuh
+++ b/cub/iterator/tex_obj_input_iterator.cuh
@@ -204,24 +204,32 @@ public:
     /// Indirection
     __host__ __device__ __forceinline__ reference operator*() const
     {
-#if (CUB_PTX_ARCH == 0)
-        // Simply dereference the pointer on the host
-        return ptr[tex_offset];
-#else
-        // Move array of uninitialized words, then alias and assign to return value
-        TextureWord words[TEXTURE_MULTIPLE];
-
-        #pragma unroll
-        for (int i = 0; i < TEXTURE_MULTIPLE; ++i)
-        {
-            words[i] = tex1Dfetch<TextureWord>(
-                tex_obj,
-                (tex_offset * TEXTURE_MULTIPLE) + i);
+        if (CUB_IS_HOST_CODE) {
+            #if CUB_INCLUDE_HOST_CODE
+                // Simply dereference the pointer on the host
+                return ptr[tex_offset];
+            #endif
+        } else {
+            #if CUB_INCLUDE_DEVICE_CODE
+                // Move array of uninitialized words, then alias and assign to return value
+                TextureWord words[TEXTURE_MULTIPLE];
+
+                #pragma unroll
+                for (int i = 0; i < TEXTURE_MULTIPLE; ++i)
+                {
+                    words[i] = tex1Dfetch<TextureWord>(
+                        tex_obj,
+                        (tex_offset * TEXTURE_MULTIPLE) + i);
+                }
+
+                // Load from words
+                return *reinterpret_cast<T*>(words);
+            #else
+                // This is dead code which will never be executed.  It is here
+                // only to avoid warnings about missing return statements.
+                return ptr[tex_offset];
+            #endif
         }
-
-        // Load from words
-        return *reinterpret_cast<T*>(words);
-#endif
     }

     /// Addition
diff --git a/cub/iterator/tex_ref_input_iterator.cuh b/cub/iterator/tex_ref_input_iterator.cuh
index 947aff4e23..e63650aae3 100644
--- a/cub/iterator/tex_ref_input_iterator.cuh
+++ b/cub/iterator/tex_ref_input_iterator.cuh
@@ -279,13 +279,19 @@ public:
     /// Indirection
     __host__ __device__ __forceinline__ reference operator*() const
     {
-#if (CUB_PTX_ARCH == 0)
-        // Simply dereference the pointer on the host
-        return ptr[tex_offset];
-#else
-        // Use the texture reference
-        return TexId::Fetch(tex_offset);
-#endif
+        if (CUB_IS_HOST_CODE) {
+            // Simply dereference the pointer on the host
+            return ptr[tex_offset];
+        } else {
+            #if CUB_INCLUDE_DEVICE_CODE
+                // Use the texture reference
+                return TexId::Fetch(tex_offset);
+            #else
+                // This is dead code that will never be executed.  It is here
+                // only to avoid warnings about missing returns.
+                return ptr[tex_offset];
+            #endif
+        }
     }

     /// Addition
diff --git a/cub/util_arch.cuh b/cub/util_arch.cuh
index fb2df4fde2..00ce46d265 100644
--- a/cub/util_arch.cuh
+++ b/cub/util_arch.cuh
@@ -44,19 +44,44 @@ namespace cub {

 #ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document

-#if (__CUDACC_VER_MAJOR__ >= 9) && !defined(CUB_USE_COOPERATIVE_GROUPS)
+#if ((__CUDACC_VER_MAJOR__ >= 9) || defined(__NVCOMPILER_CUDA__)) && \
+    !defined(CUB_USE_COOPERATIVE_GROUPS)
     #define CUB_USE_COOPERATIVE_GROUPS
 #endif

-/// CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host pass).
+/// In device code, CUB_PTX_ARCH expands to the PTX version for which we are
+/// compiling. In host code, CUB_PTX_ARCH's value is implementation defined.
 #ifndef CUB_PTX_ARCH
-    #ifndef __CUDA_ARCH__
+    #if defined(__NVCOMPILER_CUDA__)
+        // __NVCOMPILER_CUDA_ARCH__ is the target PTX version, and is defined
+        // when compiling both host code and device code. Currently, only one
+        // PTX version can be targeted.
+        #define CUB_PTX_ARCH __NVCOMPILER_CUDA_ARCH__
+    #elif !defined(__CUDA_ARCH__)
         #define CUB_PTX_ARCH 0
     #else
         #define CUB_PTX_ARCH __CUDA_ARCH__
     #endif
 #endif

+#ifndef CUB_IS_DEVICE_CODE
+    #if defined(__NVCOMPILER_CUDA__)
+        #define CUB_IS_DEVICE_CODE __builtin_is_device_code()
+        #define CUB_IS_HOST_CODE (!__builtin_is_device_code())
+        #define CUB_INCLUDE_DEVICE_CODE 1
+        #define CUB_INCLUDE_HOST_CODE 1
+    #elif CUB_PTX_ARCH > 0
+        #define CUB_IS_DEVICE_CODE 1
+        #define CUB_IS_HOST_CODE 0
+        #define CUB_INCLUDE_DEVICE_CODE 1
+        #define CUB_INCLUDE_HOST_CODE 0
+    #else
+        #define CUB_IS_DEVICE_CODE 0
+        #define CUB_IS_HOST_CODE 1
+        #define CUB_INCLUDE_DEVICE_CODE 0
+        #define CUB_INCLUDE_HOST_CODE 1
+    #endif
+#endif

 /// Maximum number of devices supported.
 #ifndef CUB_MAX_DEVICES
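Taken together, the four new macros encode the following matrix, read directly off the definitions above. `CUB_IS_*_CODE` answers "which side is executing?" and may be a run-time expression, so it can only appear inside an `if`; `CUB_INCLUDE_*_CODE` answers "may this side's code exist in this compilation?" and is always a preprocessor constant, so it can guard declarations and host-only or device-only constructs:

    //                           nvcc host pass | nvcc device pass | NVC++ single pass
    // CUB_IS_DEVICE_CODE              0        |        1         | __builtin_is_device_code()
    // CUB_IS_HOST_CODE                1        |        0         | (!__builtin_is_device_code())
    // CUB_INCLUDE_DEVICE_CODE         0        |        1         | 1
    // CUB_INCLUDE_HOST_CODE           1        |        0         | 1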
diff --git a/cub/util_debug.cuh b/cub/util_debug.cuh
index acbc1df10e..354eab6cb6 100644
--- a/cub/util_debug.cuh
+++ b/cub/util_debug.cuh
@@ -82,12 +82,16 @@ __host__ __device__ __forceinline__ cudaError_t Debug(
 #ifdef CUB_STDERR
     if (error)
     {
-    #if (CUB_PTX_ARCH == 0)
-        fprintf(stderr, "CUDA error %d [%s, %d]: %s\n", error, filename, line, cudaGetErrorString(error));
-        fflush(stderr);
-    #elif (CUB_PTX_ARCH >= 200)
-        printf("CUDA error %d [block (%d,%d,%d) thread (%d,%d,%d), %s, %d]\n", error, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, filename, line);
-    #endif
+        if (CUB_IS_HOST_CODE) {
+            #if CUB_INCLUDE_HOST_CODE
+                fprintf(stderr, "CUDA error %d [%s, %d]: %s\n", error, filename, line, cudaGetErrorString(error));
+                fflush(stderr);
+            #endif
+        } else {
+            #if CUB_INCLUDE_DEVICE_CODE
+                printf("CUDA error %d [block (%d,%d,%d) thread (%d,%d,%d), %s, %d]\n", error, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, filename, line);
+            #endif
+        }
     }
 #endif
     return error;
@@ -114,7 +118,13 @@ __host__ __device__ __forceinline__ cudaError_t Debug(
  * \brief Log macro for printf statements.
  */
 #if !defined(_CubLog)
-    #if !(defined(__clang__) && defined(__CUDA__))
+    #if defined(__NVCOMPILER_CUDA__)
+        #define _CubLog(format, ...) (__builtin_is_device_code() \
+            ? printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, \
+                     blockIdx.z, blockIdx.y, blockIdx.x, \
+                     threadIdx.z, threadIdx.y, threadIdx.x, __VA_ARGS__) \
+            : printf(format, __VA_ARGS__));
+    #elif !(defined(__clang__) && defined(__CUDA__))
         #if (CUB_PTX_ARCH == 0)
             #define _CubLog(format, ...) printf(format,__VA_ARGS__);
         #elif (CUB_PTX_ARCH >= 200)
diff --git a/cub/util_device.cuh b/cub/util_device.cuh
index bfef8b7a76..0cbdca00b2 100644
--- a/cub/util_device.cuh
+++ b/cub/util_device.cuh
@@ -348,32 +348,29 @@ CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t PtxVersionUncached(int& ptx_ver
     (void)(empty_kernel);

-#if (CUB_PTX_ARCH == 0) // Host code.
-
-    cudaError_t error = cudaSuccess;
-    cudaFuncAttributes empty_kernel_attrs;
-
-    do {
-        if (CubDebug(error = cudaFuncGetAttributes(&empty_kernel_attrs, empty_kernel)))
-            break;
+    cudaError_t result = cudaSuccess;
+    if (CUB_IS_HOST_CODE) {
+        #if CUB_INCLUDE_HOST_CODE
+            cudaFuncAttributes empty_kernel_attrs;
+
+            do {
+                if (CubDebug(result = cudaFuncGetAttributes(&empty_kernel_attrs, empty_kernel)))
+                    break;
+            }
+            while(0);
+
+            ptx_version = empty_kernel_attrs.ptxVersion * 10;
+        #endif
+    } else {
+        #if CUB_INCLUDE_DEVICE_CODE
+            // The `reinterpret_cast` is necessary to suppress a set-but-unused warnings.
+            // This is a meme now: https://twitter.com/blelbach/status/1222391615576100864
+            (void)reinterpret_cast<EmptyKernelPtr>(empty_kernel);
+
+            ptx_version = CUB_PTX_ARCH;
+        #endif
     }
-    while (0);
-
-    ptx_version = empty_kernel_attrs.ptxVersion * 10;
-
-    return error;
-
-#else // Device code.
-
-    // The `reinterpret_cast` is necessary to suppress a set-but-unused warnings.
-    // This is a meme now: https://twitter.com/blelbach/status/1222391615576100864
-    (void)reinterpret_cast<EmptyKernelPtr>(empty_kernel);
-
-    ptx_version = CUB_PTX_ARCH;
-
-    return cudaSuccess;
-
-#endif
+    return result;
 }

 /**
@@ -436,26 +433,35 @@ __host__ __forceinline__ cudaError_t PtxVersion(int& ptx_version, int device)
  */
 CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t PtxVersion(int& ptx_version)
 {
-#if __cplusplus >= 201103L && (CUB_PTX_ARCH == 0) // Host code and C++11.
-
-    auto const device = CurrentDevice();
-
-    auto const payload = GetPerDeviceAttributeCache()(
-        // If this call fails, then we get the error code back in the payload,
-        // which we check with `CubDebug` below.
-        [=] (int& pv) { return PtxVersionUncached(pv, device); },
-        device);
-
-    if (!CubDebug(payload.error))
-        ptx_version = payload.attribute;
-
-    return payload.error;
-
-#else // Device code or host code before C++11.
-
-    return PtxVersionUncached(ptx_version);
-
-#endif
+    cudaError_t result = cudaErrorUnknown;
+    if (CUB_IS_HOST_CODE) {
+        #if CUB_INCLUDE_HOST_CODE
+            #if __cplusplus >= 201103L
+                // Host code and C++11.
+                auto const device = CurrentDevice();
+
+                auto const payload = GetPerDeviceAttributeCache()(
+                    // If this call fails, then we get the error code back in the payload,
+                    // which we check with `CubDebug` below.
+                    [=] (int& pv) { return PtxVersionUncached(pv, device); },
+                    device);
+
+                if (!CubDebug(payload.error))
+                    ptx_version = payload.attribute;
+
+                result = payload.error;
+            #else
+                // Host code and C++98.
+                result = PtxVersionUncached(ptx_version);
+            #endif
+        #endif
+    } else {
+        #if CUB_INCLUDE_DEVICE_CODE
+            // Device code.
+            result = PtxVersionUncached(ptx_version);
+        #endif
+    }
+    return result;
 }
 /**
@@ -497,24 +503,32 @@ CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t SmVersionUncached(int& sm_versi
  */
 CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t SmVersion(int& sm_version, int device = CurrentDevice())
 {
-#if __cplusplus >= 201103L && (CUB_PTX_ARCH == 0) // Host code and C++11.
-
-    auto const payload = GetPerDeviceAttributeCache()(
-        // If this call fails, then we get the error code back in the payload,
-        // which we check with `CubDebug` below.
-        [=] (int& pv) { return SmVersionUncached(pv, device); },
-        device);
-
-    if (!CubDebug(payload.error))
-        sm_version = payload.attribute;
-
-    return payload.error;
-
-#else // Device code or host code before C++11.
-
-    return SmVersionUncached(sm_version, device);
-
-#endif
+    cudaError_t result = cudaErrorUnknown;
+    if (CUB_IS_HOST_CODE) {
+        #if CUB_INCLUDE_HOST_CODE
+            #if __cplusplus >= 201103L
+                // Host code and C++11
+                auto const payload = GetPerDeviceAttributeCache()(
+                    // If this call fails, then we get the error code back in the payload,
+                    // which we check with `CubDebug` below.
+                    [=] (int& pv) { return SmVersionUncached(pv, device); },
+                    device);
+
+                if (!CubDebug(payload.error))
+                    sm_version = payload.attribute;
+
+                result = payload.error;
+            #else
+                // Host code and C++98
+                result = SmVersionUncached(sm_version, device);
+            #endif
+        #endif
+    } else {
+        #if CUB_INCLUDE_DEVICE_CODE
+            result = SmVersionUncached(sm_version, device);
+        #endif
+    }
+    return result;
 }
 /**
@@ -522,23 +536,25 @@ CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t SmVersion(int& sm_version, int
  */
 CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t SyncStream(cudaStream_t stream)
 {
-#if (CUB_PTX_ARCH == 0) // Host code.
-
-    return CubDebug(cudaStreamSynchronize(stream));
-
-#elif defined(CUB_RUNTIME_ENABLED) // Device code with the CUDA runtime.
-
-    (void)stream;
-    // Device can't yet sync on a specific stream
-    return CubDebug(cudaDeviceSynchronize());
-
-#else // Device code without the CUDA runtime.
-
-    (void)stream;
-    // CUDA API calls are not supported from this device.
-    return CubDebug(cudaErrorInvalidConfiguration);
-
-#endif
+    cudaError_t result = cudaErrorUnknown;
+    if (CUB_IS_HOST_CODE) {
+        #if CUB_INCLUDE_HOST_CODE
+            result = CubDebug(cudaStreamSynchronize(stream));
+        #endif
+    } else {
+        #if CUB_INCLUDE_DEVICE_CODE
+            #if defined(CUB_RUNTIME_ENABLED) // Device code with the CUDA runtime.
+                (void)stream;
+                // Device can't yet sync on a specific stream
+                result = CubDebug(cudaDeviceSynchronize());
+            #else // Device code without the CUDA runtime.
+                (void)stream;
+                // CUDA API calls are not supported from this device.
+                result = CubDebug(cudaErrorInvalidConfiguration);
+            #endif
+        #endif
+    }
+    return result;
 }
diff --git a/cub/util_ptx.cuh b/cub/util_ptx.cuh
index 582ca0d8b8..3f20c11beb 100644
--- a/cub/util_ptx.cuh
+++ b/cub/util_ptx.cuh
@@ -90,12 +90,8 @@ __device__ __forceinline__ unsigned int SHR_ADD(
     unsigned int addend)
 {
     unsigned int ret;
-#if CUB_PTX_ARCH >= 200
     asm ("vshr.u32.u32.u32.clamp.add %0, %1, %2, %3;" : "=r"(ret) : "r"(x), "r"(shift), "r"(addend));
-#else
-    ret = (x >> shift) + addend;
-#endif
     return ret;
 }

@@ -109,12 +105,8 @@ __device__ __forceinline__ unsigned int SHL_ADD(
     unsigned int addend)
 {
     unsigned int ret;
-#if CUB_PTX_ARCH >= 200
     asm ("vshl.u32.u32.u32.clamp.add %0, %1, %2, %3;" : "=r"(ret) : "r"(x), "r"(shift), "r"(addend));
-#else
-    ret = (x << shift) + addend;
-#endif
     return ret;
 }

@@ -131,12 +123,7 @@ __device__ __forceinline__ unsigned int BFE(
     Int2Type<BYTE_LEN>  /*byte_len*/)
 {
     unsigned int bits;
-#if CUB_PTX_ARCH >= 200
     asm ("bfe.u32 %0, %1, %2, %3;" : "=r"(bits) : "r"((unsigned int) source), "r"(bit_start), "r"(num_bits));
-#else
-    const unsigned int MASK = (1 << num_bits) - 1;
-    bits = (source >> bit_start) & MASK;
-#endif
     return bits;
 }

@@ -180,15 +167,8 @@ __device__ __forceinline__ void BFI(
     unsigned int bit_start,
     unsigned int num_bits)
 {
-#if CUB_PTX_ARCH >= 200
     asm ("bfi.b32 %0, %1, %2, %3, %4;" : "=r"(ret) : "r"(y), "r"(x), "r"(bit_start), "r"(num_bits));
-#else
-    x <<= bit_start;
-    unsigned int MASK_X = ((1 << num_bits) - 1) << bit_start;
-    unsigned int MASK_Y = ~MASK_X;
-    ret = (y & MASK_Y) | (x & MASK_X);
-#endif
 }


@@ -197,11 +177,7 @@ __device__ __forceinline__ void BFI(
  */
 __device__ __forceinline__ unsigned int IADD3(unsigned int x, unsigned int y, unsigned int z)
 {
-#if CUB_PTX_ARCH >= 200
     asm ("vadd.u32.u32.u32.add %0, %1, %2, %3;" : "=r"(x) : "r"(x), "r"(y), "r"(z));
-#else
-    x = x + y + z;
-#endif
     return x;
 }
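The `util_ptx.cuh` hunks simply drop the pre-SM20 fallback implementations, which current CUDA toolchains can no longer target, leaving the inline-PTX paths unconditional. For callers, the observable behavior of the reworked utilities is unchanged; only the selection mechanism moved from the preprocessor into guarded run-time branches. A small host-side sketch exercising the reworked entry points:

    // Sketch: host-side use; version values are cached per device when the
    // host code is compiled as C++11 or newer, per the rewrite above.
    cudaError_t ReportAndSync(cudaStream_t stream)
    {
        int ptx_version = 0; // empty_kernel's ptxVersion * 10, e.g. 650
        int sm_version  = 0; // for the current device by default
        cudaError_t error = cub::PtxVersion(ptx_version);
        if (error != cudaSuccess) return error;
        if ((error = cub::SmVersion(sm_version)) != cudaSuccess) return error;
        return cub::SyncStream(stream); // cudaStreamSynchronize on the host path
    }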