NVIDIA · alliepiper · May 17, 2022 · Mar 23, 2022 · Mar 26, 2021 · Mar 26, 2021
diff --git a/cub/agent/agent_histogram.cuh b/cub/agent/agent_histogram.cuh
@@ -103,7 +103,7 @@ template <
     typename    PrivatizedDecodeOpT,            ///< The transform operator type for determining privatized counter indices from samples, one for each channel
     typename    OutputDecodeOpT,                ///< The transform operator type for determining output bin-ids from privatized counter indices, one for each channel
     typename    OffsetT,                        ///< Signed integer type for global offsets
-    int         PTX_ARCH = CUB_PTX_ARCH>        ///< PTX compute capability
+    int         LEGACY_PTX_ARCH = 0>            ///< PTX compute capability (unused)
 struct AgentHistogram
 {
     //---------------------------------------------------------------------
@@ -562,15 +562,10 @@ struct AgentHistogram
             is_valid[PIXEL] = IS_FULL_TILE || (((threadIdx.x * PIXELS_PER_THREAD + PIXEL) * NUM_CHANNELS) < valid_samples);
 
         // Accumulate samples
-#if CUB_PTX_ARCH >= 120
         if (prefer_smem)
             AccumulateSmemPixels(samples, is_valid);
         else
             AccumulateGmemPixels(samples, is_valid);
-#else
-        AccumulateGmemPixels(samples, is_valid);
-#endif
-
     }
 
 

diff --git a/cub/agent/agent_rle.cuh b/cub/agent/agent_rle.cuh
@@ -117,7 +117,7 @@ struct AgentRle
     // Constants
     enum
     {
-        WARP_THREADS            = CUB_WARP_THREADS(PTX_ARCH),
+        WARP_THREADS            = CUB_WARP_THREADS(0),
         BLOCK_THREADS           = AgentRlePolicyT::BLOCK_THREADS,
         ITEMS_PER_THREAD        = AgentRlePolicyT::ITEMS_PER_THREAD,
         WARP_ITEMS              = WARP_THREADS * ITEMS_PER_THREAD,

diff --git a/cub/agent/agent_segment_fixup.cuh b/cub/agent/agent_segment_fixup.cuh
@@ -111,8 +111,7 @@ struct AgentSegmentFixup
       TILE_ITEMS       = BLOCK_THREADS * ITEMS_PER_THREAD,
 
       // Whether or not do fixup using RLE + global atomics
-      USE_ATOMIC_FIXUP = (CUB_PTX_ARCH >= 350) &&
-                         (std::is_same<ValueT, float>::value ||
+      USE_ATOMIC_FIXUP = (std::is_same<ValueT, float>::value ||
                           std::is_same<ValueT, int>::value ||
                           std::is_same<ValueT, unsigned int>::value ||
                           std::is_same<ValueT, unsigned long long>::value),

diff --git a/cub/agent/agent_spmv_orig.cuh b/cub/agent/agent_spmv_orig.cuh
@@ -115,7 +115,7 @@ template <
     typename    OffsetT,                    ///< Signed integer type for sequence offsets
     bool        HAS_ALPHA,                  ///< Whether the input parameter \p alpha is 1
     bool        HAS_BETA,                   ///< Whether the input parameter \p beta is 0
-    int         PTX_ARCH = CUB_PTX_ARCH>    ///< PTX compute capability
+    int         LEGACY_PTX_ARCH = 0>        ///< PTX compute capability (unused)
 struct AgentSpmv
 {
     //---------------------------------------------------------------------

diff --git a/cub/agent/agent_sub_warp_merge_sort.cuh b/cub/agent/agent_sub_warp_merge_sort.cuh
@@ -33,6 +33,8 @@
 #include <cub/warp/warp_merge_sort.cuh>
 #include <cub/warp/warp_store.cuh>
 
+#include <nv/target>
+
 #include <thrust/system/cuda/detail/core/util.h>
 
 
@@ -108,6 +110,23 @@ class AgentSubWarpSort
   {
     template <typename T>
     __device__ bool operator()(T lhs, T rhs)
+    {
+      return this->impl(lhs, rhs);
+    }
+
+#if defined(__CUDA_FP16_TYPES_EXIST__)
+    __device__ bool operator()(__half lhs, __half rhs)
+    {
+      // Need to explicitly cast to float for SM <= 52.
+      NV_IF_TARGET(NV_PROVIDES_SM_53,
+                   (return this->impl(lhs, rhs);),
+                   (return this->impl(__half2float(lhs), __half2float(rhs));));
+    }
+#endif
+
+  private:
+    template <typename T>
+    __device__ bool impl(T lhs, T rhs)
     {
       if (IS_DESCENDING)
       {
@@ -118,19 +137,15 @@ class AgentSubWarpSort
         return lhs < rhs;
       }
     }
-
-#if defined(__CUDA_FP16_TYPES_EXIST__) && (CUB_PTX_ARCH < 530)
-    __device__ bool operator()(__half lhs, __half rhs)
-    {
-      return (*this)(__half2float(lhs), __half2float(rhs));
-    }
-#endif
   };
 
-#if defined(__CUDA_FP16_TYPES_EXIST__) && (CUB_PTX_ARCH < 530)
+#if defined(__CUDA_FP16_TYPES_EXIST__)
   __device__ static bool equal(__half lhs, __half rhs)
   {
-    return __half2float(lhs) == __half2float(rhs);
+    // Need to explicitly cast to float for SM <= 52.
+    NV_IF_TARGET(NV_PROVIDES_SM_53,
+                 (return lhs == rhs;),
+                 (return __half2float(lhs) == __half2float(rhs);));
   }
 #endif
 

diff --git a/cub/agent/single_pass_scan_operators.cuh b/cub/agent/single_pass_scan_operators.cuh
@@ -666,11 +666,11 @@ template <
     typename    T,
     typename    ScanOpT,
     typename    ScanTileStateT,
-    int         PTX_ARCH = CUB_PTX_ARCH>
+    int         LEGACY_PTX_ARCH = 0>
 struct TilePrefixCallbackOp
 {
     // Parameterized warp reduce
-    typedef WarpReduce<T, CUB_PTX_WARP_THREADS, PTX_ARCH> WarpReduceT;
+    typedef WarpReduce<T, CUB_PTX_WARP_THREADS> WarpReduceT;
 
     // Temporary storage type
     struct _TempStorage

diff --git a/cub/block/block_adjacent_difference.cuh b/cub/block/block_adjacent_difference.cuh
@@ -41,7 +41,6 @@
 
 CUB_NAMESPACE_BEGIN
 
-
 /**
  * @brief BlockAdjacentDifference provides
  *        [<em>collective</em>](index.html#sec0) methods for computing the
@@ -125,9 +124,9 @@ CUB_NAMESPACE_BEGIN
  */
 template <typename T,
           int BLOCK_DIM_X,
-          int BLOCK_DIM_Y = 1,
-          int BLOCK_DIM_Z = 1,
-          int PTX_ARCH    = CUB_PTX_ARCH>
+          int BLOCK_DIM_Y     = 1,
+          int BLOCK_DIM_Z     = 1,
+          int LEGACY_PTX_ARCH = 0>
 class BlockAdjacentDifference
 {
 private:

diff --git a/cub/block/block_discontinuity.cuh b/cub/block/block_discontinuity.cuh
@@ -47,7 +47,7 @@ CUB_NAMESPACE_BEGIN
  * \tparam BLOCK_DIM_X      The thread block length in threads along the X dimension
  * \tparam BLOCK_DIM_Y      <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
  * \tparam BLOCK_DIM_Z      <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
- * \tparam PTX_ARCH         <b>[optional]</b> \ptxversion
+ * \tparam LEGACY_PTX_ARCH  <b>[optional]</b> Unused.
  *
  * \par Overview
  * - A set of "head flags" (or "tail flags") is often used to indicate corresponding items
@@ -107,7 +107,7 @@ template <
     int         BLOCK_DIM_X,
     int         BLOCK_DIM_Y     = 1,
     int         BLOCK_DIM_Z     = 1,
-    int         PTX_ARCH        = CUB_PTX_ARCH>
+    int         LEGACY_PTX_ARCH = 0>
 class BlockDiscontinuity
 {
 private:

diff --git a/cub/block/block_exchange.cuh b/cub/block/block_exchange.cuh
@@ -50,7 +50,7 @@ CUB_NAMESPACE_BEGIN
  * \tparam WARP_TIME_SLICING    <b>[optional]</b> When \p true, only use enough shared memory for a single warp's worth of tile data, time-slicing the block-wide exchange over multiple synchronized rounds.  Yields a smaller memory footprint at the expense of decreased parallelism.  (Default: false)
  * \tparam BLOCK_DIM_Y          <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
  * \tparam BLOCK_DIM_Z          <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
- * \tparam PTX_ARCH             <b>[optional]</b> \ptxversion
+ * \tparam LEGACY_PTX_ARCH      <b>[optional]</b> Unused.
  *
  * \par Overview
  * - It is commonplace for blocks of threads to rearrange data items between
@@ -114,7 +114,7 @@ template <
     bool        WARP_TIME_SLICING   = false,
     int         BLOCK_DIM_Y         = 1,
     int         BLOCK_DIM_Z         = 1,
-    int         PTX_ARCH            = CUB_PTX_ARCH>
+    int         LEGACY_PTX_ARCH     = 0>
 class BlockExchange
 {
 private:
@@ -129,11 +129,11 @@ private:
         /// The thread block size in threads
         BLOCK_THREADS               = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
 
-        LOG_WARP_THREADS            = CUB_LOG_WARP_THREADS(PTX_ARCH),
+        LOG_WARP_THREADS            = CUB_LOG_WARP_THREADS(0),
         WARP_THREADS                = 1 << LOG_WARP_THREADS,
         WARPS                       = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
 
-        LOG_SMEM_BANKS              = CUB_LOG_SMEM_BANKS(PTX_ARCH),
+        LOG_SMEM_BANKS              = CUB_LOG_SMEM_BANKS(0),
         SMEM_BANKS                  = 1 << LOG_SMEM_BANKS,
 
         TILE_ITEMS                  = BLOCK_THREADS * ITEMS_PER_THREAD,
@@ -1126,4 +1126,3 @@ public:
 
 
 CUB_NAMESPACE_END
-
diff --git a/cub/block/block_histogram.cuh b/cub/block/block_histogram.cuh
@@ -94,7 +94,7 @@ enum BlockHistogramAlgorithm
  * \tparam ALGORITHM            <b>[optional]</b> cub::BlockHistogramAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_HISTO_SORT)
  * \tparam BLOCK_DIM_Y          <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
  * \tparam BLOCK_DIM_Z          <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
- * \tparam PTX_ARCH             <b>[optional]</b> \ptxversion
+ * \tparam LEGACY_PTX_ARCH      <b>[optional]</b> Unused.
  *
  * \par Overview
  * - A <a href="http://en.wikipedia.org/wiki/Histogram"><em>histogram</em></a>
@@ -160,7 +160,7 @@ template <
     BlockHistogramAlgorithm ALGORITHM           = BLOCK_HISTO_SORT,
     int                     BLOCK_DIM_Y         = 1,
     int                     BLOCK_DIM_Z         = 1,
-    int                     PTX_ARCH            = CUB_PTX_ARCH>
+    int                     LEGACY_PTX_ARCH     = 0>
 class BlockHistogram
 {
 private:
@@ -176,27 +176,15 @@ private:
         BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
     };
 
-    /**
-     * Ensure the template parameterization meets the requirements of the
-     * targeted device architecture.  BLOCK_HISTO_ATOMIC can only be used
-     * on version SM120 or later.  Otherwise BLOCK_HISTO_SORT is used
-     * regardless.
-     */
-    static const BlockHistogramAlgorithm SAFE_ALGORITHM =
-        ((ALGORITHM == BLOCK_HISTO_ATOMIC) && (PTX_ARCH < 120)) ?
-            BLOCK_HISTO_SORT :
-            ALGORITHM;
-
     /// Internal specialization.
     using InternalBlockHistogram =
-      cub::detail::conditional_t<SAFE_ALGORITHM == BLOCK_HISTO_SORT,
+      cub::detail::conditional_t<ALGORITHM == BLOCK_HISTO_SORT,
                                  BlockHistogramSort<T,
                                                     BLOCK_DIM_X,
                                                     ITEMS_PER_THREAD,
                                                     BINS,
                                                     BLOCK_DIM_Y,
-                                                    BLOCK_DIM_Z,
-                                                    PTX_ARCH>,
+                                                    BLOCK_DIM_Z>,
                                  BlockHistogramAtomic<BINS>>;
 
     /// Shared memory storage layout type for BlockHistogram

diff --git a/cub/block/block_load.cuh b/cub/block/block_load.cuh
@@ -568,7 +568,7 @@ enum BlockLoadAlgorithm
  * \tparam WARP_TIME_SLICING    <b>[optional]</b> Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any load-related data transpositions (versus each warp having its own storage). (default: false)
  * \tparam BLOCK_DIM_Y          <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
  * \tparam BLOCK_DIM_Z          <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
- * \tparam PTX_ARCH             <b>[optional]</b> \ptxversion
+ * \tparam LEGACY_PTX_ARCH      <b>[optional]</b> Unused.
  *
  * \par Overview
  * - The BlockLoad class provides a single data movement abstraction that can be specialized
@@ -638,7 +638,7 @@ template <
     BlockLoadAlgorithm  ALGORITHM           = BLOCK_LOAD_DIRECT,
     int                 BLOCK_DIM_Y         = 1,
     int                 BLOCK_DIM_Z         = 1,
-    int                 PTX_ARCH            = CUB_PTX_ARCH>
+    int                 LEGACY_PTX_ARCH     = 0>
 class BlockLoad
 {
 private:
@@ -860,7 +860,7 @@ private:
     struct LoadInternal<BLOCK_LOAD_TRANSPOSE, DUMMY>
     {
         // BlockExchange utility type for keys
-        typedef BlockExchange<InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;
+        typedef BlockExchange<InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z> BlockExchange;
 
         /// Shared memory storage layout type
         struct _TempStorage : BlockExchange::TempStorage
@@ -928,14 +928,14 @@ private:
     {
         enum
         {
-            WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH)
+            WARP_THREADS = CUB_WARP_THREADS(0)
         };
 
         // Assert BLOCK_THREADS must be a multiple of WARP_THREADS
         CUB_STATIC_ASSERT((int(BLOCK_THREADS) % int(WARP_THREADS) == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS");
 
         // BlockExchange utility type for keys
-        typedef BlockExchange<InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;
+        typedef BlockExchange<InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z> BlockExchange;
 
         /// Shared memory storage layout type
         struct _TempStorage : BlockExchange::TempStorage
@@ -1003,14 +1003,14 @@ private:
     {
         enum
         {
-            WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH)
+            WARP_THREADS = CUB_WARP_THREADS(0)
         };
 
         // Assert BLOCK_THREADS must be a multiple of WARP_THREADS
         CUB_STATIC_ASSERT((int(BLOCK_THREADS) % int(WARP_THREADS) == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS");
 
         // BlockExchange utility type for keys
-        typedef BlockExchange<InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, true, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;
+        typedef BlockExchange<InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, true, BLOCK_DIM_Y, BLOCK_DIM_Z> BlockExchange;
 
         /// Shared memory storage layout type
         struct _TempStorage : BlockExchange::TempStorage

diff --git a/cub/block/block_radix_rank.cuh b/cub/block/block_radix_rank.cuh
@@ -104,7 +104,7 @@ struct BlockRadixRankEmptyCallback
  * \tparam SMEM_CONFIG          <b>[optional]</b> Shared memory bank mode (default: \p cudaSharedMemBankSizeFourByte)
  * \tparam BLOCK_DIM_Y          <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
  * \tparam BLOCK_DIM_Z          <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
- * \tparam PTX_ARCH             <b>[optional]</b> \ptxversion
+ * \tparam LEGACY_PTX_ARCH      <b>[optional]</b> Unused.
  *
  * \par Overview
  * Blah...
@@ -138,12 +138,12 @@ template <
     int                     BLOCK_DIM_X,
     int                     RADIX_BITS,
     bool                    IS_DESCENDING,
-    bool                    MEMOIZE_OUTER_SCAN      = (CUB_PTX_ARCH >= 350) ? true : false,
+    bool                    MEMOIZE_OUTER_SCAN      = true,
     BlockScanAlgorithm      INNER_SCAN_ALGORITHM    = BLOCK_SCAN_WARP_SCANS,
     cudaSharedMemConfig     SMEM_CONFIG             = cudaSharedMemBankSizeFourByte,
     int                     BLOCK_DIM_Y             = 1,
     int                     BLOCK_DIM_Z             = 1,
-    int                     PTX_ARCH                = CUB_PTX_ARCH>
+    int                     LEGACY_PTX_ARCH         = 0>
 class BlockRadixRank
 {
 private:
@@ -168,7 +168,7 @@ private:
 
         RADIX_DIGITS                = 1 << RADIX_BITS,
 
-        LOG_WARP_THREADS            = CUB_LOG_WARP_THREADS(PTX_ARCH),
+        LOG_WARP_THREADS            = CUB_LOG_WARP_THREADS(0),
         WARP_THREADS                = 1 << LOG_WARP_THREADS,
         WARPS                       = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
 
@@ -203,8 +203,7 @@ private:
             BLOCK_DIM_X,
             INNER_SCAN_ALGORITHM,
             BLOCK_DIM_Y,
-            BLOCK_DIM_Z,
-            PTX_ARCH>
+            BLOCK_DIM_Z>
         BlockScan;
 
 
@@ -508,7 +507,7 @@ template <
     BlockScanAlgorithm      INNER_SCAN_ALGORITHM    = BLOCK_SCAN_WARP_SCANS,
     int                     BLOCK_DIM_Y             = 1,
     int                     BLOCK_DIM_Z             = 1,
-    int                     PTX_ARCH                = CUB_PTX_ARCH>
+    int                     LEGACY_PTX_ARCH         = 0>
 class BlockRadixRankMatch
 {
 private:
@@ -527,7 +526,7 @@ private:
 
         RADIX_DIGITS                = 1 << RADIX_BITS,
 
-        LOG_WARP_THREADS            = CUB_LOG_WARP_THREADS(PTX_ARCH),
+        LOG_WARP_THREADS            = CUB_LOG_WARP_THREADS(0),
         WARP_THREADS                = 1 << LOG_WARP_THREADS,
         WARPS                       = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
 
@@ -558,8 +557,7 @@ private:
             BLOCK_THREADS,
             INNER_SCAN_ALGORITHM,
             BLOCK_DIM_Y,
-            BLOCK_DIM_Z,
-            PTX_ARCH>
+            BLOCK_DIM_Z>
         BlockScanT;