diff --git a/thrust/system/cuda/config.h b/thrust/system/cuda/config.h
index a0da41624..059e16627 100644
--- a/thrust/system/cuda/config.h
+++ b/thrust/system/cuda/config.h
@@ -28,6 +28,10 @@
 #include
+// We don't directly include <cub/version.cuh> since it doesn't exist in
+// older releases. This header will always pull in version info:
+#include <cub/util_namespace.cuh>
+
 #if defined(__CUDACC__) || defined(__NVCOMPILER_CUDA__)
 # if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__>= 350 && defined(__CUDACC_RDC__))
 # define __THRUST_HAS_CUDART__ 1
@@ -69,9 +73,37 @@
 #endif
 #ifndef THRUST_IGNORE_CUB_VERSION_CHECK
+
 #include
-#include // This includes in newer releases.
 #if THRUST_VERSION != CUB_VERSION
 #error The version of CUB in your include path is not compatible with this release of Thrust. CUB is now included in the CUDA Toolkit, so you no longer need to use your own checkout of CUB. Define THRUST_IGNORE_CUB_VERSION_CHECK to ignore this.
 #endif
+
+// Make sure the CUB namespace has been declared using the modern macros:
+CUB_NAMESPACE_BEGIN
+CUB_NAMESPACE_END
+
+#else // THRUST_IGNORE_CUB_VERSION_CHECK
+
+// Make sure the CUB namespace has been declared. Use the old macros for compat
+// with older CUB:
+CUB_NS_PREFIX
+namespace cub {}
+CUB_NS_POSTFIX
+
+// Older versions of CUB do not define this. Set it to a reasonable default if
+// not provided.
+#ifndef CUB_NS_QUALIFIER
+#define CUB_NS_QUALIFIER ::cub
 #endif
+
+#endif // THRUST_IGNORE_CUB_VERSION_CHECK
+
+// Pull the fully qualified cub:: namespace into the thrust:: namespace so we
+// don't have to use CUB_NS_QUALIFIER as long as we're in thrust::.
+THRUST_NAMESPACE_BEGIN
+namespace cub
+{
+using namespace CUB_NS_QUALIFIER;
+}
+THRUST_NAMESPACE_END
diff --git a/thrust/system/cuda/detail/adjacent_difference.h b/thrust/system/cuda/detail/adjacent_difference.h
index f942e3a5b..a23390e6c 100644
--- a/thrust/system/cuda/detail/adjacent_difference.h
+++ b/thrust/system/cuda/detail/adjacent_difference.h
@@ -65,9 +65,9 @@ namespace __adjacent_difference {
   template
+            cub::BlockLoadAlgorithm _LOAD_ALGORITHM = cub::BLOCK_LOAD_DIRECT,
+            cub::CacheLoadModifier _LOAD_MODIFIER = cub::LOAD_DEFAULT,
+            cub::BlockStoreAlgorithm _STORE_ALGORITHM = cub::BLOCK_STORE_DIRECT>
   struct PtxPolicy
   {
     enum
@@ -77,9 +77,9 @@ namespace __adjacent_difference {
       ITEMS_PER_TILE = BLOCK_THREADS * ITEMS_PER_THREAD
     };
-    static const CUB_NS_QUALIFIER::BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM;
-    static const CUB_NS_QUALIFIER::CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER;
-    static const CUB_NS_QUALIFIER::BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM;
+    static const cub::BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM;
+    static const cub::CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER;
+    static const cub::BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM;
   };
   template
@@ -115,9 +115,9 @@ namespace __adjacent_difference {
     };
     typedef PtxPolicy<128,
                       ITEMS_PER_THREAD,
-                      CUB_NS_QUALIFIER::BLOCK_LOAD_WARP_TRANSPOSE,
-                      CUB_NS_QUALIFIER::LOAD_DEFAULT,
-                      CUB_NS_QUALIFIER::BLOCK_STORE_WARP_TRANSPOSE>
+                      cub::BLOCK_LOAD_WARP_TRANSPOSE,
+                      cub::LOAD_DEFAULT,
+                      cub::BLOCK_STORE_WARP_TRANSPOSE>
         type;
   };
   template
@@ -131,9 +131,9 @@ namespace __adjacent_difference {
     };
     typedef PtxPolicy<128,
                       ITEMS_PER_THREAD,
-                      CUB_NS_QUALIFIER::BLOCK_LOAD_WARP_TRANSPOSE,
-                      CUB_NS_QUALIFIER::LOAD_LDG,
-                      CUB_NS_QUALIFIER::BLOCK_STORE_WARP_TRANSPOSE>
+                      cub::BLOCK_LOAD_WARP_TRANSPOSE,
+                      cub::LOAD_LDG,
+                      cub::BLOCK_STORE_WARP_TRANSPOSE>
         type;
   };
@@ -159,11 +159,11 @@ namespace __adjacent_difference {
    typedef typename
core::BlockStore::type BlockStore; - typedef CUB_NS_QUALIFIER::BlockAdjacentDifference + typedef cub::BlockAdjacentDifference BlockAdjacentDifference; union TempStorage @@ -396,7 +396,7 @@ namespace __adjacent_difference { Size tile_size = difference_plan.items_per_tile; - Size num_tiles = CUB_NS_QUALIFIER::DivideAndRoundUp(num_items, tile_size); + Size num_tiles = cub::DivideAndRoundUp(num_items, tile_size); size_t tmp1 = num_tiles * sizeof(input_type); size_t vshmem_size = core::vshmem_size(difference_plan.shared_memory_size, diff --git a/thrust/system/cuda/detail/async/exclusive_scan.h b/thrust/system/cuda/detail/async/exclusive_scan.h index 0f35249b6..8735f7419 100644 --- a/thrust/system/cuda/detail/async/exclusive_scan.h +++ b/thrust/system/cuda/detail/async/exclusive_scan.h @@ -74,16 +74,16 @@ async_exclusive_scan_n(execution_policy& policy, InitialValueType init, BinaryOp op) { - using Dispatch32 = CUB_NS_QUALIFIER::DispatchScan; - using Dispatch64 = CUB_NS_QUALIFIER::DispatchScan; + using Dispatch32 = cub::DispatchScan; + using Dispatch64 = cub::DispatchScan; auto const device_alloc = get_async_device_allocator(policy); unique_eager_event ev; diff --git a/thrust/system/cuda/detail/async/inclusive_scan.h b/thrust/system/cuda/detail/async/inclusive_scan.h index 8321141a4..4b916be5b 100644 --- a/thrust/system/cuda/detail/async/inclusive_scan.h +++ b/thrust/system/cuda/detail/async/inclusive_scan.h @@ -72,15 +72,15 @@ async_inclusive_scan_n(execution_policy& policy, OutputIt out, BinaryOp op) { - using Dispatch32 = CUB_NS_QUALIFIER::DispatchScan; - using Dispatch64 = CUB_NS_QUALIFIER::DispatchScan; auto const device_alloc = get_async_device_allocator(policy); @@ -99,7 +99,7 @@ async_inclusive_scan_n(execution_policy& policy, first, out, op, - CUB_NS_QUALIFIER::NullType{}, + cub::NullType{}, n_fixed, nullptr, THRUST_DEBUG_SYNC_FLAG)); @@ -146,7 +146,7 @@ async_inclusive_scan_n(execution_policy& policy, first, out, op, - CUB_NS_QUALIFIER::NullType{}, + cub::NullType{}, n_fixed, user_raw_stream, THRUST_DEBUG_SYNC_FLAG)); diff --git a/thrust/system/cuda/detail/async/reduce.h b/thrust/system/cuda/detail/async/reduce.h index efd08b743..03e3dfd1a 100644 --- a/thrust/system/cuda/detail/async/reduce.h +++ b/thrust/system/cuda/detail/async/reduce.h @@ -80,7 +80,7 @@ auto async_reduce_n( size_t tmp_size = 0; thrust::cuda_cub::throw_on_error( - CUB_NS_QUALIFIER::DeviceReduce::Reduce( + cub::DeviceReduce::Reduce( nullptr , tmp_size , first @@ -162,7 +162,7 @@ auto async_reduce_n( // Run reduction. thrust::cuda_cub::throw_on_error( - CUB_NS_QUALIFIER::DeviceReduce::Reduce( + cub::DeviceReduce::Reduce( tmp_ptr , tmp_size , first @@ -233,7 +233,7 @@ auto async_reduce_into_n( size_t tmp_size = 0; thrust::cuda_cub::throw_on_error( - CUB_NS_QUALIFIER::DeviceReduce::Reduce( + cub::DeviceReduce::Reduce( nullptr , tmp_size , first @@ -297,7 +297,7 @@ auto async_reduce_into_n( // Run reduction. 
thrust::cuda_cub::throw_on_error( - CUB_NS_QUALIFIER::DeviceReduce::Reduce( + cub::DeviceReduce::Reduce( tmp_ptr , tmp_size , first diff --git a/thrust/system/cuda/detail/async/sort.h b/thrust/system/cuda/detail/async/sort.h index 12c78292a..e8f92d7f7 100644 --- a/thrust/system/cuda/detail/async/sort.h +++ b/thrust/system/cuda/detail/async/sort.h @@ -293,12 +293,12 @@ invoke_radix_sort( cudaStream_t stream , void* tmp_ptr , std::size_t& tmp_size -, CUB_NS_QUALIFIER::DoubleBuffer& keys +, cub::DoubleBuffer& keys , Size& n , StrictWeakOrdering ) { - return CUB_NS_QUALIFIER::DeviceRadixSort::SortKeys( + return cub::DeviceRadixSort::SortKeys( tmp_ptr , tmp_size , keys @@ -319,12 +319,12 @@ invoke_radix_sort( cudaStream_t stream , void* tmp_ptr , std::size_t& tmp_size -, CUB_NS_QUALIFIER::DoubleBuffer& keys +, cub::DoubleBuffer& keys , Size& n , StrictWeakOrdering ) { - return CUB_NS_QUALIFIER::DeviceRadixSort::SortKeysDescending( + return cub::DeviceRadixSort::SortKeysDescending( tmp_ptr , tmp_size , keys @@ -366,7 +366,7 @@ auto async_stable_sort_n( unique_eager_event e; - CUB_NS_QUALIFIER::DoubleBuffer keys( + cub::DoubleBuffer keys( raw_pointer_cast(&*first), nullptr ); diff --git a/thrust/system/cuda/detail/binary_search.h b/thrust/system/cuda/detail/binary_search.h index 41ee6cd60..3400515dc 100644 --- a/thrust/system/cuda/detail/binary_search.h +++ b/thrust/system/cuda/detail/binary_search.h @@ -187,9 +187,9 @@ namespace __binary_search { template + cub::BlockLoadAlgorithm _LOAD_ALGORITHM = cub::BLOCK_LOAD_DIRECT, + cub::CacheLoadModifier _LOAD_MODIFIER = cub::LOAD_LDG, + cub::BlockStoreAlgorithm _STORE_ALGORITHM = cub::BLOCK_STORE_DIRECT> struct PtxPolicy { enum @@ -199,9 +199,9 @@ namespace __binary_search { ITEMS_PER_TILE = _BLOCK_THREADS * _ITEMS_PER_THREAD }; - static const CUB_NS_QUALIFIER::BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; - static const CUB_NS_QUALIFIER::CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; - static const CUB_NS_QUALIFIER::BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM; + static const cub::BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; + static const cub::CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; + static const cub::BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM; }; // PtxPolicy template @@ -218,9 +218,9 @@ namespace __binary_search { typedef PtxPolicy<128, ITEMS_PER_THREAD, - CUB_NS_QUALIFIER::BLOCK_LOAD_WARP_TRANSPOSE, - CUB_NS_QUALIFIER::LOAD_LDG, - CUB_NS_QUALIFIER::BLOCK_STORE_TRANSPOSE> + cub::BLOCK_LOAD_WARP_TRANSPOSE, + cub::LOAD_LDG, + cub::BLOCK_STORE_TRANSPOSE> type; }; @@ -237,9 +237,9 @@ namespace __binary_search { typedef PtxPolicy<128, ITEMS_PER_THREAD, - CUB_NS_QUALIFIER::BLOCK_LOAD_WARP_TRANSPOSE, - CUB_NS_QUALIFIER::LOAD_LDG, - CUB_NS_QUALIFIER::BLOCK_STORE_WARP_TRANSPOSE> + cub::BLOCK_LOAD_WARP_TRANSPOSE, + cub::LOAD_LDG, + cub::BLOCK_STORE_WARP_TRANSPOSE> type; }; diff --git a/thrust/system/cuda/detail/copy_if.h b/thrust/system/cuda/detail/copy_if.h index b3000a928..cd20b296a 100644 --- a/thrust/system/cuda/detail/copy_if.h +++ b/thrust/system/cuda/detail/copy_if.h @@ -72,9 +72,9 @@ namespace __copy_if { template + cub::BlockLoadAlgorithm _LOAD_ALGORITHM = cub::BLOCK_LOAD_DIRECT, + cub::CacheLoadModifier _LOAD_MODIFIER = cub::LOAD_LDG, + cub::BlockScanAlgorithm _SCAN_ALGORITHM = cub::BLOCK_SCAN_WARP_SCANS> struct PtxPolicy { enum @@ -83,9 +83,9 @@ namespace __copy_if { ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ITEMS_PER_TILE = _BLOCK_THREADS * _ITEMS_PER_THREAD, }; - static const 
CUB_NS_QUALIFIER::BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; - static const CUB_NS_QUALIFIER::CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; - static const CUB_NS_QUALIFIER::BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; + static const cub::BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; + static const cub::CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; + static const cub::BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; }; // struct PtxPolicy template @@ -104,9 +104,9 @@ namespace __copy_if { typedef PtxPolicy<128, ITEMS_PER_THREAD, - CUB_NS_QUALIFIER::BLOCK_LOAD_WARP_TRANSPOSE, - CUB_NS_QUALIFIER::LOAD_LDG, - CUB_NS_QUALIFIER::BLOCK_SCAN_WARP_SCANS> + cub::BLOCK_LOAD_WARP_TRANSPOSE, + cub::LOAD_LDG, + cub::BLOCK_SCAN_WARP_SCANS> type; }; // Tuning<350> @@ -124,9 +124,9 @@ namespace __copy_if { typedef PtxPolicy<128, ITEMS_PER_THREAD, - CUB_NS_QUALIFIER::BLOCK_LOAD_WARP_TRANSPOSE, - CUB_NS_QUALIFIER::LOAD_LDG, - CUB_NS_QUALIFIER::BLOCK_SCAN_WARP_SCANS> + cub::BLOCK_LOAD_WARP_TRANSPOSE, + cub::LOAD_LDG, + cub::BLOCK_SCAN_WARP_SCANS> type; }; // Tuning<350> @@ -143,9 +143,9 @@ namespace __copy_if { typedef PtxPolicy<128, ITEMS_PER_THREAD, - CUB_NS_QUALIFIER::BLOCK_LOAD_WARP_TRANSPOSE, - CUB_NS_QUALIFIER::LOAD_DEFAULT, - CUB_NS_QUALIFIER::BLOCK_SCAN_WARP_SCANS> + cub::BLOCK_LOAD_WARP_TRANSPOSE, + cub::LOAD_DEFAULT, + cub::BLOCK_SCAN_WARP_SCANS> type; }; // Tuning<300> @@ -162,7 +162,7 @@ namespace __copy_if { typedef typename iterator_traits::value_type item_type; typedef typename iterator_traits::value_type stencil_type; - typedef CUB_NS_QUALIFIER::ScanTileState ScanTileState; + typedef cub::ScanTileState ScanTileState; template struct PtxPlan : Tuning::type @@ -175,13 +175,13 @@ namespace __copy_if { typedef typename core::BlockLoad::type BlockLoadItems; typedef typename core::BlockLoad::type BlockLoadStencil; - typedef CUB_NS_QUALIFIER::TilePrefixCallbackOp TilePrefixCallback; - typedef CUB_NS_QUALIFIER::BlockScan::type copy_if_plan = copy_if_agent::get_plan(stream); int tile_size = copy_if_plan.items_per_tile; - size_t num_tiles = CUB_NS_QUALIFIER::DivideAndRoundUp(num_items, tile_size); + size_t num_tiles = cub::DivideAndRoundUp(num_items, tile_size); size_t vshmem_size = core::vshmem_size(copy_if_plan.shared_memory_size, num_tiles); @@ -653,7 +653,7 @@ namespace __copy_if { void* allocations[2] = {NULL, NULL}; - status = CUB_NS_QUALIFIER::AliasTemporaries(d_temp_storage, + status = cub::AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes); diff --git a/thrust/system/cuda/detail/core/agent_launcher.h b/thrust/system/cuda/detail/core/agent_launcher.h index f7243a6ba..836f05872 100644 --- a/thrust/system/cuda/detail/core/agent_launcher.h +++ b/thrust/system/cuda/detail/core/agent_launcher.h @@ -536,7 +536,7 @@ namespace core { max_blocks_per_sm_impl(K k, int block_threads) { int occ; - cudaError_t status = CUB_NS_QUALIFIER::MaxSmOccupancy(occ, k, block_threads); + cudaError_t status = cub::MaxSmOccupancy(occ, k, block_threads); return cuda_optional(status == cudaSuccess ? 
occ : -1, status); } diff --git a/thrust/system/cuda/detail/core/util.h b/thrust/system/cuda/detail/core/util.h index 201cec31f..cb4154aec 100644 --- a/thrust/system/cuda/detail/core/util.h +++ b/thrust/system/cuda/detail/core/util.h @@ -418,7 +418,7 @@ namespace core { #ifdef __CUDA_ARCH__ plan = get_agent_plan_dev(); #else - static CUB_NS_QUALIFIER::Mutex mutex; + static cub::Mutex mutex; bool lock = false; if (d_ptr == 0) { @@ -531,10 +531,10 @@ namespace core { typedef typename thrust::detail::conditional< is_contiguous_iterator::value, - CUB_NS_QUALIFIER::CacheModifiedInputIterator, - It>::type type; + cub::CacheModifiedInputIterator, + It>::type type; }; // struct Iterator template @@ -573,13 +573,13 @@ namespace core { class T = typename iterator_traits::value_type> struct BlockLoad { - using type = CUB_NS_QUALIFIER::BlockLoad::type::ver>; + using type = cub::BlockLoad::type::ver>; }; // BlockStore @@ -590,13 +590,13 @@ namespace core { class T = typename iterator_traits::value_type> struct BlockStore { - using type = CUB_NS_QUALIFIER::BlockStore::type::ver>; + using type = cub::BlockStore::type::ver>; }; // cuda_optional @@ -632,25 +632,25 @@ namespace core { get_ptx_version() { int ptx_version = 0; - cudaError_t status = CUB_NS_QUALIFIER::PtxVersion(ptx_version); + cudaError_t status = cub::PtxVersion(ptx_version); return cuda_optional(ptx_version, status); } cudaError_t THRUST_RUNTIME_FUNCTION sync_stream(cudaStream_t stream) { - return CUB_NS_QUALIFIER::SyncStream(stream); + return cub::SyncStream(stream); } inline void __device__ sync_threadblock() { - CUB_NS_QUALIFIER::CTA_SYNC(); + cub::CTA_SYNC(); } #define CUDA_CUB_RET_IF_FAIL(e) \ { \ auto const error = (e); \ - if (CUB_NS_QUALIFIER::Debug(error, __FILE__, __LINE__)) return error; \ + if (cub::Debug(error, __FILE__, __LINE__)) return error; \ } // uninitialized @@ -660,7 +660,7 @@ namespace core { template struct uninitialized { - typedef typename CUB_NS_QUALIFIER::UnitWord::DeviceWord DeviceWord; + typedef typename cub::UnitWord::DeviceWord DeviceWord; enum { @@ -752,10 +752,10 @@ namespace core { void* (&allocations)[ALLOCATIONS], size_t (&allocation_sizes)[ALLOCATIONS]) { - return CUB_NS_QUALIFIER::AliasTemporaries(storage_ptr, - storage_size, - allocations, - allocation_sizes); + return cub::AliasTemporaries(storage_ptr, + storage_size, + allocations, + allocation_sizes); } diff --git a/thrust/system/cuda/detail/extrema.h b/thrust/system/cuda/detail/extrema.h index 499046f9b..0937beb8b 100644 --- a/thrust/system/cuda/detail/extrema.h +++ b/thrust/system/cuda/detail/extrema.h @@ -206,8 +206,8 @@ namespace __extrema { template get_max_blocks_per_sm, - CUB_NS_QUALIFIER::GridQueue, + cub::GridEvenShare, + cub::GridQueue, ReductionOp>(reduce_plan); CUDA_CUB_RET_IF_FAIL(max_blocks_per_sm.status()); @@ -218,7 +218,7 @@ namespace __extrema { int sm_oversubscription = 5; int max_blocks = reduce_device_occupancy * sm_oversubscription; - CUB_NS_QUALIFIER::GridEvenShare even_share; + cub::GridEvenShare even_share; even_share.DispatchInit(num_items, max_blocks, reduce_plan.items_per_tile); @@ -233,13 +233,13 @@ namespace __extrema { size_t allocation_sizes[3] = { max_blocks * sizeof(T), // bytes needed for privatized block reductions - CUB_NS_QUALIFIER::GridQueue::AllocationSize(), // bytes needed for grid queue descriptor0 + cub::GridQueue::AllocationSize(), // bytes needed for grid queue descriptor0 vshmem_size // size of virtualized shared memory storage }; - status = CUB_NS_QUALIFIER::AliasTemporaries(d_temp_storage, - 
temp_storage_bytes, - allocations, - allocation_sizes); + status = cub::AliasTemporaries(d_temp_storage, + temp_storage_bytes, + allocations, + allocation_sizes); CUDA_CUB_RET_IF_FAIL(status); if (d_temp_storage == NULL) { @@ -247,21 +247,21 @@ namespace __extrema { } T *d_block_reductions = (T*) allocations[0]; - CUB_NS_QUALIFIER::GridQueue queue(allocations[1]); + cub::GridQueue queue(allocations[1]); char *vshmem_ptr = vshmem_size > 0 ? (char *)allocations[2] : NULL; // Get grid size for device_reduce_sweep_kernel int reduce_grid_size = 0; - if (reduce_plan.grid_mapping == CUB_NS_QUALIFIER::GRID_MAPPING_RAKE) + if (reduce_plan.grid_mapping == cub::GRID_MAPPING_RAKE) { // Work is distributed evenly reduce_grid_size = even_share.grid_size; } - else if (reduce_plan.grid_mapping == CUB_NS_QUALIFIER::GRID_MAPPING_DYNAMIC) + else if (reduce_plan.grid_mapping == cub::GRID_MAPPING_DYNAMIC) { // Work is distributed dynamically - size_t num_tiles = CUB_NS_QUALIFIER::DivideAndRoundUp(num_items, reduce_plan.items_per_tile); + size_t num_tiles = cub::DivideAndRoundUp(num_items, reduce_plan.items_per_tile); // if not enough to fill the device with threadblocks // then fill the device with threadblocks diff --git a/thrust/system/cuda/detail/malloc_and_free.h b/thrust/system/cuda/detail/malloc_and_free.h index 121a76637..ac5b0f871 100644 --- a/thrust/system/cuda/detail/malloc_and_free.h +++ b/thrust/system/cuda/detail/malloc_and_free.h @@ -36,9 +36,9 @@ namespace cuda_cub { #ifdef THRUST_CACHING_DEVICE_MALLOC #define __CUB_CACHING_MALLOC #ifndef __CUDA_ARCH__ -inline CUB_NS_QUALIFIER::CachingDeviceAllocator &get_allocator() +inline cub::CachingDeviceAllocator &get_allocator() { - static CUB_NS_QUALIFIER::CachingDeviceAllocator g_allocator(true); + static cub::CachingDeviceAllocator g_allocator(true); return g_allocator; } #endif @@ -56,7 +56,7 @@ void *malloc(execution_policy &, std::size_t n) if (THRUST_IS_HOST_CODE) { #if THRUST_INCLUDE_HOST_CODE #ifdef __CUB_CACHING_MALLOC - CUB_NS_QUALIFIER::CachingDeviceAllocator &alloc = get_allocator(); + cub::CachingDeviceAllocator &alloc = get_allocator(); cudaError_t status = alloc.DeviceAllocate(&result, n); #else cudaError_t status = cudaMalloc(&result, n); @@ -85,7 +85,7 @@ void free(execution_policy &, Pointer ptr) if (THRUST_IS_HOST_CODE) { #if THRUST_INCLUDE_HOST_CODE #ifdef __CUB_CACHING_MALLOC - CUB_NS_QUALIFIER::CachingDeviceAllocator &alloc = get_allocator(); + cub::CachingDeviceAllocator &alloc = get_allocator(); cudaError_t status = alloc.DeviceFree(thrust::raw_pointer_cast(ptr)); #else cudaError_t status = cudaFree(thrust::raw_pointer_cast(ptr)); diff --git a/thrust/system/cuda/detail/merge.h b/thrust/system/cuda/detail/merge.h index 0cb3a20fe..7f49f4522 100644 --- a/thrust/system/cuda/detail/merge.h +++ b/thrust/system/cuda/detail/merge.h @@ -129,9 +129,9 @@ namespace __merge { template + cub::BlockLoadAlgorithm _LOAD_ALGORITHM = cub::BLOCK_LOAD_DIRECT, + cub::CacheLoadModifier _LOAD_MODIFIER = cub::LOAD_LDG, + cub::BlockStoreAlgorithm _STORE_ALGORITHM = cub::BLOCK_STORE_DIRECT> struct PtxPolicy { enum @@ -141,9 +141,9 @@ namespace __merge { ITEMS_PER_TILE = _BLOCK_THREADS * _ITEMS_PER_THREAD, }; - static const CUB_NS_QUALIFIER::BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; - static const CUB_NS_QUALIFIER::CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; - static const CUB_NS_QUALIFIER::BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM; + static const cub::BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; + static const 
cub::CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; + static const cub::BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM; }; // PtxPolicy template + cub::BLOCK_LOAD_WARP_TRANSPOSE, + cub::LOAD_DEFAULT, + cub::BLOCK_STORE_WARP_TRANSPOSE> type; }; // Tuning sm300 @@ -242,9 +242,9 @@ namespace __merge { typedef PtxPolicy<512, ITEMS_PER_THREAD, - CUB_NS_QUALIFIER::BLOCK_LOAD_WARP_TRANSPOSE, - CUB_NS_QUALIFIER::LOAD_DEFAULT, - CUB_NS_QUALIFIER::BLOCK_STORE_WARP_TRANSPOSE> + cub::BLOCK_LOAD_WARP_TRANSPOSE, + cub::LOAD_DEFAULT, + cub::BLOCK_STORE_WARP_TRANSPOSE> type; }; // Tuning sm52 @@ -260,9 +260,9 @@ namespace __merge { typedef PtxPolicy<512, ITEMS_PER_THREAD, - CUB_NS_QUALIFIER::BLOCK_LOAD_WARP_TRANSPOSE, - CUB_NS_QUALIFIER::LOAD_LDG, - CUB_NS_QUALIFIER::BLOCK_STORE_WARP_TRANSPOSE> + cub::BLOCK_LOAD_WARP_TRANSPOSE, + cub::LOAD_LDG, + cub::BLOCK_STORE_WARP_TRANSPOSE> type; }; // Tuning sm52 @@ -280,9 +280,9 @@ namespace __merge { typedef PtxPolicy<256, ITEMS_PER_THREAD, - CUB_NS_QUALIFIER::BLOCK_LOAD_WARP_TRANSPOSE, - CUB_NS_QUALIFIER::LOAD_LDG, - CUB_NS_QUALIFIER::BLOCK_STORE_WARP_TRANSPOSE> + cub::BLOCK_LOAD_WARP_TRANSPOSE, + cub::LOAD_LDG, + cub::BLOCK_STORE_WARP_TRANSPOSE> type; }; // Tuning sm350 diff --git a/thrust/system/cuda/detail/partition.h b/thrust/system/cuda/detail/partition.h index 8065f0fd4..85d9bb813 100644 --- a/thrust/system/cuda/detail/partition.h +++ b/thrust/system/cuda/detail/partition.h @@ -53,9 +53,9 @@ namespace __partition { template + cub::BlockLoadAlgorithm _LOAD_ALGORITHM = cub::BLOCK_LOAD_DIRECT, + cub::CacheLoadModifier _LOAD_MODIFIER = cub::LOAD_LDG, + cub::BlockScanAlgorithm _SCAN_ALGORITHM = cub::BLOCK_SCAN_WARP_SCANS> struct PtxPolicy { enum @@ -64,9 +64,9 @@ namespace __partition { ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ITEMS_PER_TILE = _BLOCK_THREADS * _ITEMS_PER_THREAD }; - static const CUB_NS_QUALIFIER::BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; - static const CUB_NS_QUALIFIER::CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; - static const CUB_NS_QUALIFIER::BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; + static const cub::BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; + static const cub::CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; + static const cub::BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; }; // struct PtxPolicy template @@ -85,9 +85,9 @@ namespace __partition { typedef PtxPolicy<128, ITEMS_PER_THREAD, - CUB_NS_QUALIFIER::BLOCK_LOAD_WARP_TRANSPOSE, - CUB_NS_QUALIFIER::LOAD_LDG, - CUB_NS_QUALIFIER::BLOCK_SCAN_WARP_SCANS> + cub::BLOCK_LOAD_WARP_TRANSPOSE, + cub::LOAD_LDG, + cub::BLOCK_SCAN_WARP_SCANS> type; }; // Tuning<350> @@ -104,9 +104,9 @@ namespace __partition { typedef PtxPolicy<128, ITEMS_PER_THREAD, - CUB_NS_QUALIFIER::BLOCK_LOAD_WARP_TRANSPOSE, - CUB_NS_QUALIFIER::LOAD_DEFAULT, - CUB_NS_QUALIFIER::BLOCK_SCAN_WARP_SCANS> + cub::BLOCK_LOAD_WARP_TRANSPOSE, + cub::LOAD_DEFAULT, + cub::BLOCK_SCAN_WARP_SCANS> type; }; // Tuning<300> @@ -137,7 +137,7 @@ namespace __partition { typedef typename iterator_traits::value_type stencil_type; - typedef CUB_NS_QUALIFIER::ScanTileState ScanTileState; + typedef cub::ScanTileState ScanTileState; template struct PtxPlan : Tuning::type @@ -150,17 +150,17 @@ namespace __partition { typedef typename core::BlockLoad::type BlockLoadItems; typedef typename core::BlockLoad::type BlockLoadStencil; - typedef CUB_NS_QUALIFIER::TilePrefixCallbackOp TilePrefixCallback; - typedef CUB_NS_QUALIFIER::BlockScan + typedef cub::BlockScan BlockScan; @@ -441,7 +441,7 @@ namespace 
__partition { { TilePrefixCallback prefix_cb(tile_state, temp_storage.scan_storage.prefix, - CUB_NS_QUALIFIER::Sum(), + cub::Sum(), tile_idx); BlockScan(temp_storage.scan_storage.scan) .ExclusiveSum(selection_flags, @@ -647,7 +647,7 @@ namespace __partition { typename get_plan::type partition_plan = partition_agent::get_plan(stream); int tile_size = partition_plan.items_per_tile; - size_t num_tiles = CUB_NS_QUALIFIER::DivideAndRoundUp(num_items, tile_size); + size_t num_tiles = cub::DivideAndRoundUp(num_items, tile_size); size_t vshmem_storage = core::vshmem_size(partition_plan.shared_memory_size, num_tiles); @@ -662,7 +662,7 @@ namespace __partition { void* allocations[2] = {NULL, NULL}; - status = CUB_NS_QUALIFIER::AliasTemporaries(d_temp_storage, + status = cub::AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes); diff --git a/thrust/system/cuda/detail/reduce.h b/thrust/system/cuda/detail/reduce.h index a238baf21..43c85bd0b 100644 --- a/thrust/system/cuda/detail/reduce.h +++ b/thrust/system/cuda/detail/reduce.h @@ -76,9 +76,9 @@ namespace __reduce { template + cub::BlockReduceAlgorithm _BLOCK_ALGORITHM = cub::BLOCK_REDUCE_RAKING, + cub::CacheLoadModifier _LOAD_MODIFIER = cub::LOAD_DEFAULT, + cub::GridMappingStrategy _GRID_MAPPING = cub::GRID_MAPPING_DYNAMIC> struct PtxPolicy { enum @@ -89,9 +89,9 @@ namespace __reduce { ITEMS_PER_TILE = _BLOCK_THREADS * _ITEMS_PER_THREAD }; - static const CUB_NS_QUALIFIER::BlockReduceAlgorithm BLOCK_ALGORITHM = _BLOCK_ALGORITHM; - static const CUB_NS_QUALIFIER::CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; - static const CUB_NS_QUALIFIER::GridMappingStrategy GRID_MAPPING = _GRID_MAPPING; + static const cub::BlockReduceAlgorithm BLOCK_ALGORITHM = _BLOCK_ALGORITHM; + static const cub::CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; + static const cub::GridMappingStrategy GRID_MAPPING = _GRID_MAPPING; }; // struct PtxPolicy template @@ -111,9 +111,9 @@ namespace __reduce { typedef PtxPolicy<256, CUB_MAX(1, 20 / SCALE_FACTOR_4B), 2, - CUB_NS_QUALIFIER::BLOCK_REDUCE_WARP_REDUCTIONS, - CUB_NS_QUALIFIER::LOAD_DEFAULT, - CUB_NS_QUALIFIER::GRID_MAPPING_RAKE> + cub::BLOCK_REDUCE_WARP_REDUCTIONS, + cub::LOAD_DEFAULT, + cub::GRID_MAPPING_RAKE> type; }; // Tuning sm30 @@ -124,18 +124,18 @@ namespace __reduce { typedef PtxPolicy<128, CUB_MAX(1, 24 / Tuning::SCALE_FACTOR_1B), 4, - CUB_NS_QUALIFIER::BLOCK_REDUCE_WARP_REDUCTIONS, - CUB_NS_QUALIFIER::LOAD_LDG, - CUB_NS_QUALIFIER::GRID_MAPPING_DYNAMIC> + cub::BLOCK_REDUCE_WARP_REDUCTIONS, + cub::LOAD_LDG, + cub::GRID_MAPPING_DYNAMIC> ReducePolicy1B; // ReducePolicy4B types (GTX Titan: 255.1 GB/s @ 48M 4B items) typedef PtxPolicy<256, CUB_MAX(1, 20 / Tuning::SCALE_FACTOR_4B), 4, - CUB_NS_QUALIFIER::BLOCK_REDUCE_WARP_REDUCTIONS, - CUB_NS_QUALIFIER::LOAD_LDG, - CUB_NS_QUALIFIER::GRID_MAPPING_DYNAMIC> + cub::BLOCK_REDUCE_WARP_REDUCTIONS, + cub::LOAD_LDG, + cub::GRID_MAPPING_DYNAMIC> ReducePolicy4B; typedef typename thrust::detail::conditional<(sizeof(T) < 4), @@ -161,9 +161,9 @@ namespace __reduce { // typedef Tuning tuning; - typedef typename CUB_NS_QUALIFIER::CubVector Vector; + typedef typename cub::CubVector Vector; typedef typename core::LoadIterator::type LoadIt; - typedef CUB_NS_QUALIFIER::BlockReduce BlockReduce; - typedef CUB_NS_QUALIFIER::CacheModifiedInputIterator VectorLoadIt; @@ -194,7 +194,7 @@ namespace __reduce { // struct Plan : core::AgentPlan { - CUB_NS_QUALIFIER::GridMappingStrategy grid_mapping; + cub::GridMappingStrategy grid_mapping; template THRUST_RUNTIME_FUNCTION 
@@ -297,14 +297,14 @@ namespace __reduce { T items[ITEMS_PER_THREAD]; // Load items in striped fashion - CUB_NS_QUALIFIER::LoadDirectStriped(threadIdx.x, + cub::LoadDirectStriped(threadIdx.x, load_it + block_offset, items); // Reduce items within each thread stripe thread_aggregate = - (IS_FIRST_TILE) ? CUB_NS_QUALIFIER::internal::ThreadReduce(items, reduction_op) - : CUB_NS_QUALIFIER::internal::ThreadReduce(items, reduction_op, + (IS_FIRST_TILE) ? cub::internal::ThreadReduce(items, reduction_op) + : cub::internal::ThreadReduce(items, reduction_op, thread_aggregate); } @@ -343,8 +343,8 @@ namespace __reduce { // Reduce items within each thread stripe thread_aggregate = - (IS_FIRST_TILE) ? CUB_NS_QUALIFIER::internal::ThreadReduce(items, reduction_op) - : CUB_NS_QUALIFIER::internal::ThreadReduce(items, reduction_op, + (IS_FIRST_TILE) ? cub::internal::ThreadReduce(items, reduction_op) + : cub::internal::ThreadReduce(items, reduction_op, thread_aggregate); } @@ -460,9 +460,9 @@ namespace __reduce { // THRUST_DEVICE_FUNCTION T consume_tiles(Size /*num_items*/, - CUB_NS_QUALIFIER::GridEvenShare &even_share, - CUB_NS_QUALIFIER::GridQueue & /*queue*/, - thrust::detail::integral_constant /*is_rake*/) + cub::GridEvenShare &even_share, + cub::GridQueue & /*queue*/, + thrust::detail::integral_constant /*is_rake*/) { typedef is_true attempt_vec; typedef is_true path_a; @@ -470,7 +470,7 @@ namespace __reduce { // Initialize even-share descriptor for this thread block even_share - .template BlockInit(); + .template BlockInit(); return is_aligned(input_it, attempt_vec()) ? consume_range_impl(even_share.block_offset, @@ -491,7 +491,7 @@ namespace __reduce { template THRUST_DEVICE_FUNCTION T consume_tiles_impl(Size num_items, - CUB_NS_QUALIFIER::GridQueue queue, + cub::GridQueue queue, CAN_VECTORIZE can_vectorize) { using core::sync_threadblock; @@ -578,9 +578,9 @@ namespace __reduce { THRUST_DEVICE_FUNCTION T consume_tiles( Size num_items, - CUB_NS_QUALIFIER::GridEvenShare &/*even_share*/, - CUB_NS_QUALIFIER::GridQueue & queue, - thrust::detail::integral_constant) + cub::GridEvenShare &/*even_share*/, + cub::GridQueue & queue, + thrust::detail::integral_constant) { typedef is_true attempt_vec; typedef is_true path_a; @@ -646,14 +646,14 @@ namespace __reduce { THRUST_AGENT_ENTRY(InputIt input_it, OutputIt output_it, Size num_items, - CUB_NS_QUALIFIER::GridEvenShare even_share, - CUB_NS_QUALIFIER::GridQueue queue, + cub::GridEvenShare even_share, + cub::GridQueue queue, ReductionOp reduction_op, char * shmem) { TempStorage& storage = *reinterpret_cast(shmem); - typedef thrust::detail::integral_constant grid_mapping; + typedef thrust::detail::integral_constant grid_mapping; T block_aggregate = impl(storage, input_it, reduction_op) @@ -677,7 +677,7 @@ namespace __reduce { // Agent entry point //--------------------------------------------------------------------- - THRUST_AGENT_ENTRY(CUB_NS_QUALIFIER::GridQueue grid_queue, + THRUST_AGENT_ENTRY(cub::GridQueue grid_queue, Size num_items, char * /*shmem*/) { @@ -749,8 +749,8 @@ namespace __reduce { template get_max_blocks_per_sm, - CUB_NS_QUALIFIER::GridQueue, + cub::GridEvenShare, + cub::GridQueue, ReductionOp>(reduce_plan); CUDA_CUB_RET_IF_FAIL(max_blocks_per_sm.status()); @@ -761,7 +761,7 @@ namespace __reduce { int sm_oversubscription = 5; int max_blocks = reduce_device_occupancy * sm_oversubscription; - CUB_NS_QUALIFIER::GridEvenShare even_share; + cub::GridEvenShare even_share; even_share.DispatchInit(static_cast(num_items), max_blocks, 
reduce_plan.items_per_tile); @@ -776,10 +776,10 @@ namespace __reduce { size_t allocation_sizes[3] = { max_blocks * sizeof(T), // bytes needed for privatized block reductions - CUB_NS_QUALIFIER::GridQueue::AllocationSize(), // bytes needed for grid queue descriptor0 + cub::GridQueue::AllocationSize(), // bytes needed for grid queue descriptor0 vshmem_size // size of virtualized shared memory storage }; - status = CUB_NS_QUALIFIER::AliasTemporaries(d_temp_storage, + status = cub::AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes); @@ -790,21 +790,21 @@ namespace __reduce { } T *d_block_reductions = (T*) allocations[0]; - CUB_NS_QUALIFIER::GridQueue queue(allocations[1]); + cub::GridQueue queue(allocations[1]); char *vshmem_ptr = vshmem_size > 0 ? (char *)allocations[2] : NULL; // Get grid size for device_reduce_sweep_kernel int reduce_grid_size = 0; - if (reduce_plan.grid_mapping == CUB_NS_QUALIFIER::GRID_MAPPING_RAKE) + if (reduce_plan.grid_mapping == cub::GRID_MAPPING_RAKE) { // Work is distributed evenly reduce_grid_size = even_share.grid_size; } - else if (reduce_plan.grid_mapping == CUB_NS_QUALIFIER::GRID_MAPPING_DYNAMIC) + else if (reduce_plan.grid_mapping == cub::GRID_MAPPING_DYNAMIC) { // Work is distributed dynamically - size_t num_tiles = CUB_NS_QUALIFIER::DivideAndRoundUp(num_items, reduce_plan.items_per_tile); + size_t num_tiles = cub::DivideAndRoundUp(num_items, reduce_plan.items_per_tile); // if not enough to fill the device with threadblocks // then fill the device with threadblocks @@ -944,8 +944,8 @@ T reduce_n_impl(execution_policy& policy, size_t tmp_size = 0; THRUST_INDEX_TYPE_DISPATCH2(status, - CUB_NS_QUALIFIER::DeviceReduce::Reduce, - (CUB_NS_QUALIFIER::DispatchReduce< + cub::DeviceReduce::Reduce, + (cub::DispatchReduce< InputIt, T*, Size, BinaryOp >::Dispatch), num_items, @@ -972,8 +972,8 @@ T reduce_n_impl(execution_policy& policy, T* ret_ptr = thrust::detail::aligned_reinterpret_cast(tmp.data().get()); void* tmp_ptr = static_cast((tmp.data() + sizeof(T)).get()); THRUST_INDEX_TYPE_DISPATCH2(status, - CUB_NS_QUALIFIER::DeviceReduce::Reduce, - (CUB_NS_QUALIFIER::DispatchReduce< + cub::DeviceReduce::Reduce, + (cub::DispatchReduce< InputIt, T*, Size, BinaryOp >::Dispatch), num_items, diff --git a/thrust/system/cuda/detail/reduce_by_key.h b/thrust/system/cuda/detail/reduce_by_key.h index 53e039e3e..28c733152 100644 --- a/thrust/system/cuda/detail/reduce_by_key.h +++ b/thrust/system/cuda/detail/reduce_by_key.h @@ -79,9 +79,9 @@ namespace __reduce_by_key { template + cub::BlockLoadAlgorithm _LOAD_ALGORITHM = cub::BLOCK_LOAD_DIRECT, + cub::CacheLoadModifier _LOAD_MODIFIER = cub::LOAD_DEFAULT, + cub::BlockScanAlgorithm _SCAN_ALGORITHM = cub::BLOCK_SCAN_WARP_SCANS> struct PtxPolicy { enum @@ -91,9 +91,9 @@ namespace __reduce_by_key { ITEMS_PER_TILE = BLOCK_THREADS * ITEMS_PER_THREAD }; - static const CUB_NS_QUALIFIER::BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; - static const CUB_NS_QUALIFIER::CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; - static const CUB_NS_QUALIFIER::BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; + static const cub::BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; + static const cub::CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; + static const cub::BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; }; // struct PtxPolicy template @@ -122,9 +122,9 @@ namespace __reduce_by_key { typedef PtxPolicy<128, ITEMS_PER_THREAD, - CUB_NS_QUALIFIER::BLOCK_LOAD_WARP_TRANSPOSE, - 
CUB_NS_QUALIFIER::LOAD_DEFAULT, - CUB_NS_QUALIFIER::BLOCK_SCAN_WARP_SCANS> + cub::BLOCK_LOAD_WARP_TRANSPOSE, + cub::LOAD_DEFAULT, + cub::BLOCK_SCAN_WARP_SCANS> type; }; // Tuning sm30 @@ -151,9 +151,9 @@ namespace __reduce_by_key { typedef PtxPolicy<128, ITEMS_PER_THREAD, - CUB_NS_QUALIFIER::BLOCK_LOAD_WARP_TRANSPOSE, - CUB_NS_QUALIFIER::LOAD_LDG, - CUB_NS_QUALIFIER::BLOCK_SCAN_WARP_SCANS> + cub::BLOCK_LOAD_WARP_TRANSPOSE, + cub::LOAD_LDG, + cub::BLOCK_SCAN_WARP_SCANS> type; }; // Tuning sm35 @@ -180,9 +180,9 @@ namespace __reduce_by_key { typedef PtxPolicy<256, ITEMS_PER_THREAD, - CUB_NS_QUALIFIER::BLOCK_LOAD_WARP_TRANSPOSE, - CUB_NS_QUALIFIER::LOAD_LDG, - CUB_NS_QUALIFIER::BLOCK_SCAN_WARP_SCANS> + cub::BLOCK_LOAD_WARP_TRANSPOSE, + cub::LOAD_LDG, + cub::BLOCK_SCAN_WARP_SCANS> type; }; // Tuning sm52 @@ -200,11 +200,11 @@ namespace __reduce_by_key { typedef typename iterator_traits::value_type value_type; typedef Size size_type; - typedef CUB_NS_QUALIFIER::KeyValuePair size_value_pair_t; - typedef CUB_NS_QUALIFIER::KeyValuePair key_value_pair_t; + typedef cub::KeyValuePair size_value_pair_t; + typedef cub::KeyValuePair key_value_pair_t; - typedef CUB_NS_QUALIFIER::ReduceByKeyScanTileState ScanTileState; - typedef CUB_NS_QUALIFIER::ReduceBySegmentOp ReduceBySegmentOp; + typedef cub::ReduceByKeyScanTileState ScanTileState; + typedef cub::ReduceBySegmentOp ReduceBySegmentOp; template struct PtxPlan : Tuning::type @@ -217,19 +217,19 @@ namespace __reduce_by_key { typedef typename core::BlockLoad::type BlockLoadKeys; typedef typename core::BlockLoad::type BlockLoadValues; - typedef CUB_NS_QUALIFIER::BlockDiscontinuity BlockDiscontinuityKeys; - typedef CUB_NS_QUALIFIER::TilePrefixCallbackOp TilePrefixCallback; - typedef CUB_NS_QUALIFIER::BlockScan inequality_op; + cub::InequalityWrapper inequality_op; ReduceBySegmentOp scan_op; //--------------------------------------------------------------------- @@ -911,7 +911,7 @@ namespace __reduce_by_key { // Number of input tiles int tile_size = reduce_by_key_plan.items_per_tile; - Size num_tiles = CUB_NS_QUALIFIER::DivideAndRoundUp(num_items, tile_size); + Size num_tiles = cub::DivideAndRoundUp(num_items, tile_size); size_t vshmem_size = core::vshmem_size(reduce_by_key_plan.shared_memory_size, num_tiles); @@ -921,7 +921,7 @@ namespace __reduce_by_key { CUDA_CUB_RET_IF_FAIL(status); void *allocations[2] = {NULL, NULL}; - status = CUB_NS_QUALIFIER::AliasTemporaries(d_temp_storage, + status = cub::AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes); diff --git a/thrust/system/cuda/detail/scan.h b/thrust/system/cuda/detail/scan.h index 28aa98699..4f9628319 100644 --- a/thrust/system/cuda/detail/scan.h +++ b/thrust/system/cuda/detail/scan.h @@ -59,16 +59,16 @@ OutputIt inclusive_scan_n_impl(thrust::cuda_cub::execution_policy &poli OutputIt result, ScanOp scan_op) { - using Dispatch32 = CUB_NS_QUALIFIER::DispatchScan; - using Dispatch64 = CUB_NS_QUALIFIER::DispatchScan; + using Dispatch32 = cub::DispatchScan; + using Dispatch64 = cub::DispatchScan; cudaStream_t stream = thrust::cuda_cub::stream(policy); cudaError_t status; @@ -85,7 +85,7 @@ OutputIt inclusive_scan_n_impl(thrust::cuda_cub::execution_policy &poli first, result, scan_op, - CUB_NS_QUALIFIER::NullType{}, + cub::NullType{}, num_items_fixed, stream, THRUST_DEBUG_SYNC_FLAG)); @@ -109,7 +109,7 @@ OutputIt inclusive_scan_n_impl(thrust::cuda_cub::execution_policy &poli first, result, scan_op, - CUB_NS_QUALIFIER::NullType{}, + cub::NullType{}, num_items_fixed, stream, 
THRUST_DEBUG_SYNC_FLAG)); @@ -137,16 +137,16 @@ OutputIt exclusive_scan_n_impl(thrust::cuda_cub::execution_policy &poli InitValueT init, ScanOp scan_op) { - using Dispatch32 = CUB_NS_QUALIFIER::DispatchScan; - using Dispatch64 = CUB_NS_QUALIFIER::DispatchScan; + using Dispatch32 = cub::DispatchScan; + using Dispatch64 = cub::DispatchScan; cudaStream_t stream = thrust::cuda_cub::stream(policy); cudaError_t status; diff --git a/thrust/system/cuda/detail/scan_by_key.h b/thrust/system/cuda/detail/scan_by_key.h index 2bbe8b189..ebe25c3ed 100644 --- a/thrust/system/cuda/detail/scan_by_key.h +++ b/thrust/system/cuda/detail/scan_by_key.h @@ -50,10 +50,10 @@ namespace __scan_by_key { template + cub::BlockLoadAlgorithm _LOAD_ALGORITHM = cub::BLOCK_LOAD_DIRECT, + cub::CacheLoadModifier _LOAD_MODIFIER = cub::LOAD_DEFAULT, + cub::BlockScanAlgorithm _SCAN_ALGORITHM = cub::BLOCK_SCAN_WARP_SCANS, + cub::BlockStoreAlgorithm _STORE_ALGORITHM = cub::BLOCK_STORE_DIRECT> struct PtxPolicy { enum @@ -63,10 +63,10 @@ namespace __scan_by_key { ITEMS_PER_TILE = BLOCK_THREADS * ITEMS_PER_THREAD, }; - static const CUB_NS_QUALIFIER::BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; - static const CUB_NS_QUALIFIER::CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; - static const CUB_NS_QUALIFIER::BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; - static const CUB_NS_QUALIFIER::BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM; + static const cub::BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; + static const cub::CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; + static const cub::BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; + static const cub::BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM; }; // struct PtxPolicy template @@ -95,10 +95,10 @@ namespace __scan_by_key { typedef PtxPolicy<128, ITEMS_PER_THREAD, - CUB_NS_QUALIFIER::BLOCK_LOAD_WARP_TRANSPOSE, - CUB_NS_QUALIFIER::LOAD_DEFAULT, - CUB_NS_QUALIFIER::BLOCK_SCAN_WARP_SCANS, - CUB_NS_QUALIFIER::BLOCK_STORE_WARP_TRANSPOSE> + cub::BLOCK_LOAD_WARP_TRANSPOSE, + cub::LOAD_DEFAULT, + cub::BLOCK_SCAN_WARP_SCANS, + cub::BLOCK_STORE_WARP_TRANSPOSE> type; }; // Tuning sm30 @@ -125,10 +125,10 @@ namespace __scan_by_key { typedef PtxPolicy<128, ITEMS_PER_THREAD, - CUB_NS_QUALIFIER::BLOCK_LOAD_WARP_TRANSPOSE, - CUB_NS_QUALIFIER::LOAD_LDG, - CUB_NS_QUALIFIER::BLOCK_SCAN_WARP_SCANS, - CUB_NS_QUALIFIER::BLOCK_STORE_WARP_TRANSPOSE> + cub::BLOCK_LOAD_WARP_TRANSPOSE, + cub::LOAD_LDG, + cub::BLOCK_SCAN_WARP_SCANS, + cub::BLOCK_STORE_WARP_TRANSPOSE> type; }; // Tuning sm35 @@ -155,10 +155,10 @@ namespace __scan_by_key { typedef PtxPolicy<256, ITEMS_PER_THREAD, - CUB_NS_QUALIFIER::BLOCK_LOAD_WARP_TRANSPOSE, - CUB_NS_QUALIFIER::LOAD_LDG, - CUB_NS_QUALIFIER::BLOCK_SCAN_WARP_SCANS, - CUB_NS_QUALIFIER::BLOCK_STORE_WARP_TRANSPOSE> + cub::BLOCK_LOAD_WARP_TRANSPOSE, + cub::LOAD_LDG, + cub::BLOCK_SCAN_WARP_SCANS, + cub::BLOCK_STORE_WARP_TRANSPOSE> type; }; // Tuning sm52 @@ -177,11 +177,11 @@ namespace __scan_by_key { typedef T value_type; typedef Size size_type; - typedef CUB_NS_QUALIFIER::KeyValuePair size_value_pair_t; - typedef CUB_NS_QUALIFIER::KeyValuePair key_value_pair_t; + typedef cub::KeyValuePair size_value_pair_t; + typedef cub::KeyValuePair key_value_pair_t; - typedef CUB_NS_QUALIFIER::ReduceByKeyScanTileState ScanTileState; - typedef CUB_NS_QUALIFIER::ReduceBySegmentOp ReduceBySegmentOp; + typedef cub::ReduceByKeyScanTileState ScanTileState; + typedef cub::ReduceBySegmentOp ReduceBySegmentOp; template struct PtxPlan : Tuning::type @@ -198,19 
+198,19 @@ namespace __scan_by_key { ValuesOutputIt, value_type>::type BlockStoreValues; - typedef CUB_NS_QUALIFIER::BlockDiscontinuity BlockDiscontinuityKeys; - typedef CUB_NS_QUALIFIER::TilePrefixCallbackOp TilePrefixCallback; - typedef CUB_NS_QUALIFIER::BlockScan inequality_op; + cub::InequalityWrapper inequality_op; ReduceBySegmentOp scan_op; @@ -673,7 +673,7 @@ namespace __scan_by_key { AgentPlan init_plan = init_agent::get_plan(); int tile_size = scan_by_key_plan.items_per_tile; - size_t num_tiles = CUB_NS_QUALIFIER::DivideAndRoundUp(num_items, tile_size); + size_t num_tiles = cub::DivideAndRoundUp(num_items, tile_size); size_t vshmem_size = core::vshmem_size(scan_by_key_plan.shared_memory_size, num_tiles); @@ -683,7 +683,7 @@ namespace __scan_by_key { CUDA_CUB_RET_IF_FAIL(status); void *allocations[2] = {NULL, NULL}; - status = CUB_NS_QUALIFIER::AliasTemporaries(d_temp_storage, + status = cub::AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes); diff --git a/thrust/system/cuda/detail/set_operations.h b/thrust/system/cuda/detail/set_operations.h index 34cc02a16..ade55c41b 100644 --- a/thrust/system/cuda/detail/set_operations.h +++ b/thrust/system/cuda/detail/set_operations.h @@ -203,9 +203,9 @@ namespace __set_operations { template + cub::BlockLoadAlgorithm _LOAD_ALGORITHM = cub::BLOCK_LOAD_DIRECT, + cub::CacheLoadModifier _LOAD_MODIFIER = cub::LOAD_LDG, + cub::BlockScanAlgorithm _SCAN_ALGORITHM = cub::BLOCK_SCAN_WARP_SCANS> struct PtxPolicy { enum @@ -215,9 +215,9 @@ namespace __set_operations { ITEMS_PER_TILE = _BLOCK_THREADS * _ITEMS_PER_THREAD - 1 }; - static const CUB_NS_QUALIFIER::BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; - static const CUB_NS_QUALIFIER::CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; - static const CUB_NS_QUALIFIER::BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; + static const cub::BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; + static const cub::CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; + static const cub::BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; }; // PtxPolicy template @@ -246,9 +246,9 @@ namespace __set_operations { typedef PtxPolicy<128, ITEMS_PER_THREAD, - CUB_NS_QUALIFIER::BLOCK_LOAD_WARP_TRANSPOSE, - CUB_NS_QUALIFIER::LOAD_DEFAULT, - CUB_NS_QUALIFIER::BLOCK_SCAN_WARP_SCANS> + cub::BLOCK_LOAD_WARP_TRANSPOSE, + cub::LOAD_DEFAULT, + cub::BLOCK_SCAN_WARP_SCANS> type; }; // tuning sm30 @@ -273,9 +273,9 @@ namespace __set_operations { typedef PtxPolicy<256, ITEMS_PER_THREAD, - CUB_NS_QUALIFIER::BLOCK_LOAD_WARP_TRANSPOSE, - CUB_NS_QUALIFIER::LOAD_DEFAULT, - CUB_NS_QUALIFIER::BLOCK_SCAN_WARP_SCANS> + cub::BLOCK_LOAD_WARP_TRANSPOSE, + cub::LOAD_DEFAULT, + cub::BLOCK_SCAN_WARP_SCANS> type; }; // tuning sm52 @@ -300,9 +300,9 @@ namespace __set_operations { typedef PtxPolicy<512, ITEMS_PER_THREAD, - CUB_NS_QUALIFIER::BLOCK_LOAD_WARP_TRANSPOSE, - CUB_NS_QUALIFIER::LOAD_DEFAULT, - CUB_NS_QUALIFIER::BLOCK_SCAN_WARP_SCANS> + cub::BLOCK_LOAD_WARP_TRANSPOSE, + cub::LOAD_DEFAULT, + cub::BLOCK_SCAN_WARP_SCANS> type; }; // tuning sm60 @@ -326,7 +326,7 @@ namespace __set_operations { typedef key1_type key_type; typedef value1_type value_type; - typedef CUB_NS_QUALIFIER::ScanTileState ScanTileState; + typedef cub::ScanTileState ScanTileState; template struct PtxPlan : Tuning::type @@ -343,13 +343,13 @@ namespace __set_operations { typedef typename core::BlockLoad::type BlockLoadValues1; typedef typename core::BlockLoad::type BlockLoadValues2; - typedef CUB_NS_QUALIFIER::TilePrefixCallbackOp 
TilePrefixCallback; - typedef CUB_NS_QUALIFIER::BlockScan + cub::BlockLoadAlgorithm _LOAD_ALGORITHM = cub::BLOCK_LOAD_DIRECT, + cub::CacheLoadModifier _LOAD_MODIFIER = cub::LOAD_LDG, + cub::BlockStoreAlgorithm _STORE_ALGORITHM = cub::BLOCK_STORE_DIRECT> struct PtxPolicy { enum @@ -144,9 +144,9 @@ namespace __merge_sort { ITEMS_PER_TILE = _BLOCK_THREADS * _ITEMS_PER_THREAD, }; - static const CUB_NS_QUALIFIER::BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; - static const CUB_NS_QUALIFIER::CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; - static const CUB_NS_QUALIFIER::BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM; + static const cub::BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; + static const cub::CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; + static const cub::BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM; }; // PtxPolicy @@ -166,9 +166,9 @@ namespace __merge_sort { typedef PtxPolicy<256, ITEMS_PER_THREAD, - CUB_NS_QUALIFIER::BLOCK_LOAD_WARP_TRANSPOSE, - CUB_NS_QUALIFIER::LOAD_LDG, - CUB_NS_QUALIFIER::BLOCK_STORE_WARP_TRANSPOSE> + cub::BLOCK_LOAD_WARP_TRANSPOSE, + cub::LOAD_LDG, + cub::BLOCK_STORE_WARP_TRANSPOSE> type; }; @@ -185,9 +185,9 @@ namespace __merge_sort { typedef PtxPolicy<512, ITEMS_PER_THREAD, - CUB_NS_QUALIFIER::BLOCK_LOAD_WARP_TRANSPOSE, - CUB_NS_QUALIFIER::LOAD_LDG, - CUB_NS_QUALIFIER::BLOCK_STORE_WARP_TRANSPOSE> + cub::BLOCK_LOAD_WARP_TRANSPOSE, + cub::LOAD_LDG, + cub::BLOCK_STORE_WARP_TRANSPOSE> type; }; @@ -204,9 +204,9 @@ namespace __merge_sort { typedef PtxPolicy<256, ITEMS_PER_THREAD, - CUB_NS_QUALIFIER::BLOCK_LOAD_WARP_TRANSPOSE, - CUB_NS_QUALIFIER::LOAD_DEFAULT, - CUB_NS_QUALIFIER::BLOCK_STORE_WARP_TRANSPOSE> + cub::BLOCK_LOAD_WARP_TRANSPOSE, + cub::LOAD_DEFAULT, + cub::BLOCK_STORE_WARP_TRANSPOSE> type; }; @@ -221,9 +221,9 @@ namespace __merge_sort { typedef PtxPolicy<128, ITEMS_PER_THREAD, - CUB_NS_QUALIFIER::BLOCK_LOAD_WARP_TRANSPOSE, - CUB_NS_QUALIFIER::LOAD_DEFAULT, - CUB_NS_QUALIFIER::BLOCK_STORE_WARP_TRANSPOSE> + cub::BLOCK_LOAD_WARP_TRANSPOSE, + cub::LOAD_DEFAULT, + cub::BLOCK_STORE_WARP_TRANSPOSE> type; }; @@ -1335,13 +1335,13 @@ namespace __radix_sort { THRUST_RUNTIME_FUNCTION static cudaError_t doit(void* d_temp_storage, size_t& temp_storage_bytes, - CUB_NS_QUALIFIER::DoubleBuffer& keys_buffer, - CUB_NS_QUALIFIER::DoubleBuffer& /*items_buffer*/, + cub::DoubleBuffer& keys_buffer, + cub::DoubleBuffer& /*items_buffer*/, Size count, cudaStream_t stream, bool debug_sync) { - return CUB_NS_QUALIFIER::DeviceRadixSort::SortKeys(d_temp_storage, + return cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, keys_buffer, static_cast(count), @@ -1360,13 +1360,13 @@ namespace __radix_sort { THRUST_RUNTIME_FUNCTION static cudaError_t doit(void* d_temp_storage, size_t& temp_storage_bytes, - CUB_NS_QUALIFIER::DoubleBuffer& keys_buffer, - CUB_NS_QUALIFIER::DoubleBuffer& /*items_buffer*/, + cub::DoubleBuffer& keys_buffer, + cub::DoubleBuffer& /*items_buffer*/, Size count, cudaStream_t stream, bool debug_sync) { - return CUB_NS_QUALIFIER::DeviceRadixSort::SortKeysDescending(d_temp_storage, + return cub::DeviceRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, keys_buffer, static_cast(count), @@ -1385,13 +1385,13 @@ namespace __radix_sort { THRUST_RUNTIME_FUNCTION static cudaError_t doit(void* d_temp_storage, size_t& temp_storage_bytes, - CUB_NS_QUALIFIER::DoubleBuffer& keys_buffer, - CUB_NS_QUALIFIER::DoubleBuffer& items_buffer, + cub::DoubleBuffer& keys_buffer, + cub::DoubleBuffer& items_buffer, Size count, 
cudaStream_t stream, bool debug_sync) { - return CUB_NS_QUALIFIER::DeviceRadixSort::SortPairs(d_temp_storage, + return cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, keys_buffer, items_buffer, @@ -1411,13 +1411,13 @@ namespace __radix_sort { THRUST_RUNTIME_FUNCTION static cudaError_t doit(void* d_temp_storage, size_t& temp_storage_bytes, - CUB_NS_QUALIFIER::DoubleBuffer& keys_buffer, - CUB_NS_QUALIFIER::DoubleBuffer& items_buffer, + cub::DoubleBuffer& keys_buffer, + cub::DoubleBuffer& items_buffer, Size count, cudaStream_t stream, bool debug_sync) { - return CUB_NS_QUALIFIER::DeviceRadixSort::SortPairsDescending(d_temp_storage, + return cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, keys_buffer, items_buffer, @@ -1446,8 +1446,8 @@ namespace __radix_sort { cudaStream_t stream = cuda_cub::stream(policy); bool debug_sync = THRUST_DEBUG_SYNC_FLAG; - CUB_NS_QUALIFIER::DoubleBuffer keys_buffer(keys, NULL); - CUB_NS_QUALIFIER::DoubleBuffer items_buffer(items, NULL); + cub::DoubleBuffer keys_buffer(keys, NULL); + cub::DoubleBuffer items_buffer(items, NULL); Size keys_count = count; Size items_count = SORT_ITEMS::value ? count : 0; diff --git a/thrust/system/cuda/detail/unique.h b/thrust/system/cuda/detail/unique.h index a0e7ca0aa..d0262ff57 100644 --- a/thrust/system/cuda/detail/unique.h +++ b/thrust/system/cuda/detail/unique.h @@ -78,9 +78,9 @@ namespace __unique { template + cub::BlockLoadAlgorithm _LOAD_ALGORITHM = cub::BLOCK_LOAD_DIRECT, + cub::CacheLoadModifier _LOAD_MODIFIER = cub::LOAD_LDG, + cub::BlockScanAlgorithm _SCAN_ALGORITHM = cub::BLOCK_SCAN_WARP_SCANS> struct PtxPolicy { enum @@ -89,9 +89,9 @@ namespace __unique { ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ITEMS_PER_TILE = _BLOCK_THREADS * _ITEMS_PER_THREAD, }; - static const CUB_NS_QUALIFIER::BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; - static const CUB_NS_QUALIFIER::CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; - static const CUB_NS_QUALIFIER::BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; + static const cub::BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; + static const cub::CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; + static const cub::BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; }; // struct PtxPolicy template @@ -128,9 +128,9 @@ namespace __unique { typedef PtxPolicy<64, ITEMS_PER_THREAD, - CUB_NS_QUALIFIER::BLOCK_LOAD_WARP_TRANSPOSE, - CUB_NS_QUALIFIER::LOAD_LDG, - CUB_NS_QUALIFIER::BLOCK_SCAN_WARP_SCANS> + cub::BLOCK_LOAD_WARP_TRANSPOSE, + cub::LOAD_LDG, + cub::BLOCK_SCAN_WARP_SCANS> type; }; // Tuning for sm52 @@ -149,9 +149,9 @@ namespace __unique { typedef PtxPolicy<128, ITEMS_PER_THREAD, - CUB_NS_QUALIFIER::BLOCK_LOAD_WARP_TRANSPOSE, - CUB_NS_QUALIFIER::LOAD_LDG, - CUB_NS_QUALIFIER::BLOCK_SCAN_WARP_SCANS> + cub::BLOCK_LOAD_WARP_TRANSPOSE, + cub::LOAD_LDG, + cub::BLOCK_SCAN_WARP_SCANS> type; }; // Tuning for sm35 @@ -169,9 +169,9 @@ namespace __unique { typedef PtxPolicy<128, ITEMS_PER_THREAD, - CUB_NS_QUALIFIER::BLOCK_LOAD_WARP_TRANSPOSE, - CUB_NS_QUALIFIER::LOAD_DEFAULT, - CUB_NS_QUALIFIER::BLOCK_SCAN_WARP_SCANS> + cub::BLOCK_LOAD_WARP_TRANSPOSE, + cub::LOAD_DEFAULT, + cub::BLOCK_SCAN_WARP_SCANS> type; }; // Tuning for sm30 @@ -184,7 +184,7 @@ namespace __unique { { typedef typename iterator_traits::value_type item_type; - typedef CUB_NS_QUALIFIER::ScanTileState ScanTileState; + typedef cub::ScanTileState ScanTileState; template struct PtxPlan : Tuning::type @@ -195,19 +195,19 @@ namespace __unique { typedef typename 
core::BlockLoad::type BlockLoadItems; - typedef CUB_NS_QUALIFIER::BlockDiscontinuity BlockDiscontinuityItems; - typedef CUB_NS_QUALIFIER::TilePrefixCallbackOp TilePrefixCallback; - typedef CUB_NS_QUALIFIER::BlockScan predicate; + cub::InequalityWrapper predicate; Size num_items; //--------------------------------------------------------------------- @@ -393,7 +393,7 @@ namespace __unique { { TilePrefixCallback prefix_cb(tile_state, temp_storage.scan_storage.prefix, - CUB_NS_QUALIFIER::Sum(), + cub::Sum(), tile_idx); BlockScan(temp_storage.scan_storage.scan) .ExclusiveSum(selection_flags, @@ -580,7 +580,7 @@ namespace __unique { int tile_size = unique_plan.items_per_tile; - size_t num_tiles = CUB_NS_QUALIFIER::DivideAndRoundUp(num_items, tile_size); + size_t num_tiles = cub::DivideAndRoundUp(num_items, tile_size); size_t vshmem_size = core::vshmem_size(unique_plan.shared_memory_size, num_tiles); @@ -592,7 +592,7 @@ namespace __unique { void *allocations[2] = {NULL, NULL}; // - status = CUB_NS_QUALIFIER::AliasTemporaries(d_temp_storage, + status = cub::AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes); diff --git a/thrust/system/cuda/detail/unique_by_key.h b/thrust/system/cuda/detail/unique_by_key.h index 7df41f3ca..e5a1c3ee7 100644 --- a/thrust/system/cuda/detail/unique_by_key.h +++ b/thrust/system/cuda/detail/unique_by_key.h @@ -82,9 +82,9 @@ namespace __unique_by_key { template + cub::BlockLoadAlgorithm _LOAD_ALGORITHM = cub::BLOCK_LOAD_DIRECT, + cub::CacheLoadModifier _LOAD_MODIFIER = cub::LOAD_LDG, + cub::BlockScanAlgorithm _SCAN_ALGORITHM = cub::BLOCK_SCAN_WARP_SCANS> struct PtxPolicy { enum @@ -93,9 +93,9 @@ namespace __unique_by_key { ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ITEMS_PER_TILE = _BLOCK_THREADS * _ITEMS_PER_THREAD, }; - static const CUB_NS_QUALIFIER::BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; - static const CUB_NS_QUALIFIER::CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; - static const CUB_NS_QUALIFIER::BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; + static const cub::BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; + static const cub::CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; + static const cub::BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; }; // struct PtxPolicy template @@ -133,9 +133,9 @@ namespace __unique_by_key { typedef PtxPolicy<64, ITEMS_PER_THREAD, - CUB_NS_QUALIFIER::BLOCK_LOAD_WARP_TRANSPOSE, - CUB_NS_QUALIFIER::LOAD_LDG, - CUB_NS_QUALIFIER::BLOCK_SCAN_WARP_SCANS> + cub::BLOCK_LOAD_WARP_TRANSPOSE, + cub::LOAD_LDG, + cub::BLOCK_SCAN_WARP_SCANS> type; }; // Tuning for sm52 @@ -153,9 +153,9 @@ namespace __unique_by_key { typedef PtxPolicy<128, ITEMS_PER_THREAD, - CUB_NS_QUALIFIER::BLOCK_LOAD_WARP_TRANSPOSE, - CUB_NS_QUALIFIER::LOAD_LDG, - CUB_NS_QUALIFIER::BLOCK_SCAN_WARP_SCANS> + cub::BLOCK_LOAD_WARP_TRANSPOSE, + cub::LOAD_LDG, + cub::BLOCK_SCAN_WARP_SCANS> type; }; // Tuning for sm35 @@ -173,9 +173,9 @@ namespace __unique_by_key { typedef PtxPolicy<128, ITEMS_PER_THREAD, - CUB_NS_QUALIFIER::BLOCK_LOAD_WARP_TRANSPOSE, - CUB_NS_QUALIFIER::LOAD_DEFAULT, - CUB_NS_QUALIFIER::BLOCK_SCAN_WARP_SCANS> + cub::BLOCK_LOAD_WARP_TRANSPOSE, + cub::LOAD_DEFAULT, + cub::BLOCK_SCAN_WARP_SCANS> type; }; // Tuning for sm30 @@ -191,7 +191,7 @@ namespace __unique_by_key { typedef typename iterator_traits::value_type key_type; typedef typename iterator_traits::value_type value_type; - typedef CUB_NS_QUALIFIER::ScanTileState ScanTileState; + typedef cub::ScanTileState ScanTileState; template struct PtxPlan : 
Tuning::type @@ -204,19 +204,19 @@ namespace __unique_by_key { typedef typename core::BlockLoad::type BlockLoadKeys; typedef typename core::BlockLoad::type BlockLoadValues; - typedef CUB_NS_QUALIFIER::BlockDiscontinuity BlockDiscontinuityKeys; - typedef CUB_NS_QUALIFIER::TilePrefixCallbackOp TilePrefixCallback; - typedef CUB_NS_QUALIFIER::BlockScan predicate; + cub::InequalityWrapper predicate; Size num_items; //--------------------------------------------------------------------- @@ -443,7 +443,7 @@ namespace __unique_by_key { { TilePrefixCallback prefix_cb(tile_state, temp_storage.scan_storage.prefix, - CUB_NS_QUALIFIER::Sum(), + cub::Sum(), tile_idx); BlockScan(temp_storage.scan_storage.scan) .ExclusiveSum(selection_flags, @@ -662,7 +662,7 @@ namespace __unique_by_key { int tile_size = unique_plan.items_per_tile; - size_t num_tiles = CUB_NS_QUALIFIER::DivideAndRoundUp(num_items, tile_size); + size_t num_tiles = cub::DivideAndRoundUp(num_items, tile_size); size_t vshmem_size = core::vshmem_size(unique_plan.shared_memory_size, num_tiles); @@ -674,7 +674,7 @@ namespace __unique_by_key { void *allocations[2] = {NULL, NULL}; // - status = CUB_NS_QUALIFIER::AliasTemporaries(d_temp_storage, + status = cub::AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes);
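
Note on the config.h change above: the nested `thrust::cub` alias namespace is what lets every backend header in this patch write plain `cub::` instead of `CUB_NS_QUALIFIER::`. The following standalone sketch (mock namespaces only, not Thrust code) illustrates the name-lookup mechanism the using-directive relies on:

```cpp
// Standalone illustration of the aliasing trick in config.h.
namespace cub { struct Sum {}; }              // stand-in for the real ::cub

namespace thrust
{
namespace cub { using namespace ::cub; }      // mirrors the using-directive added in config.h

// Inside thrust::, unqualified "cub" finds thrust::cub first; the
// using-directive re-exports ::cub, so cub::Sum resolves to ::cub::Sum.
inline cub::Sum make_sum() { return cub::Sum{}; }
} // namespace thrust

int main() { thrust::make_sum(); }
```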
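Several of the call sites touched above (the `cub::DeviceReduce::Reduce` and `cub::DeviceRadixSort::Sort*` invocations, for example) follow CUB's usual two-pass convention: a first call with a null temporary-storage pointer only reports the required size, and a second identical call does the work. A minimal sketch of that pattern, using `cub::DeviceReduce::Sum` with illustrative buffer and function names:

```cpp
#include <cub/cub.cuh>
#include <cuda_runtime.h>

// Two-pass CUB dispatch: query temp-storage size, allocate, then run.
// d_in and d_out are device pointers; sum_example is a hypothetical helper.
cudaError_t sum_example(const int* d_in, int* d_out, int num_items, cudaStream_t stream)
{
  void*  d_temp_storage     = nullptr;
  size_t temp_storage_bytes = 0;

  // Pass 1: d_temp_storage == nullptr, so CUB only writes temp_storage_bytes.
  cudaError_t status = cub::DeviceReduce::Sum(
      d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream);
  if (status != cudaSuccess) { return status; }

  status = cudaMalloc(&d_temp_storage, temp_storage_bytes);
  if (status != cudaSuccess) { return status; }

  // Pass 2: the same call with real storage performs the reduction.
  status = cub::DeviceReduce::Sum(
      d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream);

  cudaFree(d_temp_storage);
  return status;
}
```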