diff --git a/cub/block/block_adjacent_difference.cuh b/cub/block/block_adjacent_difference.cuh index e33ffd0dc7..14dab12881 100644 --- a/cub/block/block_adjacent_difference.cuh +++ b/cub/block/block_adjacent_difference.cuh @@ -550,7 +550,7 @@ public: * @param[in] difference_op * Binary difference operator * - * @param[in] + * @param[in] valid_items * Number of valid items in thread block */ template thread(i+distance>)% (may be aliased to \p input). This value is not updated for threadBLOCK_THREADS-1 + T& output, ///< [out] The \p input item from thread thread(i+distance>)%BLOCK_THREADS (may be aliased to \p input). This value is not updated for threadBLOCK_THREADS-1 unsigned int distance = 1) ///< [in] Offset distance (0 < \p distance < BLOCK_THREADS) { temp_storage[linear_tid] = input; diff --git a/cub/device/device_partition.cuh b/cub/device/device_partition.cuh index 6276d776c3..a0b044b3d1 100644 --- a/cub/device/device_partition.cuh +++ b/cub/device/device_partition.cuh @@ -548,6 +548,24 @@ struct DevicePartition * `d_num_selected_out[0]` and total number of items selected by * @p select_second_part_op is stored as `d_num_selected_out[1]`, * respectively + * + * @param[in] num_items + * Total number of items to select from + * + * @param[in] select_first_part_op + * Unary selection operator to select @p d_first_part_out + * + * @param[in] select_second_part_op + * Unary selection operator to select @p d_second_part_out + * + * @param[in] stream + * **[optional]** CUDA stream to launch kernels within. + * Default is stream0. + * + * @param[in] debug_synchronous + * **[optional]** Whether or not to synchronize the stream after every + * kernel launch to check for errors. May cause significant slowdown. + * Default is @p false. */ template [inferred] Random-access input iterator type for reading scan keys inputs \iterator * \tparam ValuesInputIteratorT [inferred] Random-access input iterator type for reading scan values inputs \iterator * \tparam ValuesOutputIteratorT [inferred] Random-access output iterator type for writing scan values outputs \iterator - * \tparam EqualityOpT [inferred][/b] Functor type having member T operator()(const T &a, const T &b) for binary operations that defines the equality of keys + * \tparam EqualityOpT [inferred] Functor type having member T operator()(const T &a, const T &b) for binary operations that defines the equality of keys * * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back */ @@ -625,7 +625,7 @@ struct DeviceScan * \tparam ValuesOutputIteratorT [inferred] Random-access output iterator type for writing scan values outputs \iterator * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) * \tparam InitValueT [inferred] Type of the \p init_value value used in Binary scan functor type having member T operator()(const T &a, const T &b) - * \tparam EqualityOpT [inferred][/b] Functor type having member T operator()(const T &a, const T &b) for binary operations that defines the equality of keys + * \tparam EqualityOpT [inferred] Functor type having member T operator()(const T &a, const T &b) for binary operations that defines the equality of keys * * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back */ @@ -711,7 +711,7 @@ struct DeviceScan * \tparam KeysInputIteratorT [inferred] Random-access input iterator type for reading scan keys inputs \iterator * \tparam ValuesInputIteratorT [inferred] Random-access input iterator type for reading scan values inputs \iterator * \tparam ValuesOutputIteratorT [inferred] Random-access output iterator type for writing scan values outputs \iterator - * \tparam EqualityOpT [inferred][/b] Functor type having member T operator()(const T &a, const T &b) for binary operations that defines the equality of keys + * \tparam EqualityOpT [inferred] Functor type having member T operator()(const T &a, const T &b) for binary operations that defines the equality of keys * * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back */ @@ -818,7 +818,7 @@ struct DeviceScan * \tparam ValuesInputIteratorT [inferred] Random-access input iterator type for reading scan values inputs \iterator * \tparam ValuesOutputIteratorT [inferred] Random-access output iterator type for writing scan values outputs \iterator * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) - * \tparam EqualityOpT [inferred][/b] Functor type having member T operator()(const T &a, const T &b) for binary operations that defines the equality of keys + * \tparam EqualityOpT [inferred] Functor type having member T operator()(const T &a, const T &b) for binary operations that defines the equality of keys * * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back */ diff --git a/cub/util_device.cuh b/cub/util_device.cuh index 2e86cc05b1..403c2f0d40 100644 --- a/cub/util_device.cuh +++ b/cub/util_device.cuh @@ -613,7 +613,7 @@ cudaError_t MaxSmOccupancy( int& max_sm_occupancy, ///< [out] maximum number of thread blocks that can reside on a single SM KernelPtr kernel_ptr, ///< [in] Kernel pointer for which to compute SM occupancy int block_threads, ///< [in] Number of threads per thread block - int dynamic_smem_bytes = 0) + int dynamic_smem_bytes = 0) ///< [in] Dynamically allocated shared memory in bytes. Default is 0. { #ifndef CUB_RUNTIME_ENABLED diff --git a/cub/version.cuh b/cub/version.cuh index bcda06ae60..08d6ccbc88 100644 --- a/cub/version.cuh +++ b/cub/version.cuh @@ -25,7 +25,7 @@ * ******************************************************************************/ -/*! \file version.h +/*! \file version.cuh * \brief Compile-time macros encoding CUB release version * * is the only CUB header that is guaranteed to diff --git a/cub/warp/warp_load.cuh b/cub/warp/warp_load.cuh index b7a507ca34..28a31f407a 100644 --- a/cub/warp/warp_load.cuh +++ b/cub/warp/warp_load.cuh @@ -658,7 +658,7 @@ public: * thread_data, * valid_items, * -1); - * @endcod + * @endcode * @par * Suppose the input @p d_data is 0, 1, 2, 3, 4, 5, ..., @p valid_items * is @p 5, and the out-of-bounds default is @p -1.