diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index c0caebcea..76984837e 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -81,6 +81,25 @@ variables:
   paths:
     - $CMAKE_LATEST_PATH
 
+.deps-cpu:cmake-latest:
+  stage: build:cmake_latest
+  before_script:
+    - $SUDO_CMD apt update -qq
+    - $SUDO_CMD apt install -y -qq apt-transport-https software-properties-common
+    - $SUDO_CMD add-apt-repository ppa:ubuntu-toolchain-r/test
+    # | Used in the script | Build tools | Fetch from https:// | rocminfo calls lsmod
+    - $SUDO_CMD apt install -y -qq wget tar xz-utils bzip2 libnuma-dev libunwind-dev git build-essential pkg-config ninja-build ca-certificates kmod g++-9
+    # Fetch CMake only if the cache has not been restored
+    - if [ ! -d $CMAKE_LATEST_PATH ]; then mkdir -p $CMAKE_LATEST_PATH; wget --no-check-certificate --quiet -O - $CMAKE_LATEST_URL | tar --strip-components=1 -xz -C $CMAKE_LATEST_PATH;
+    - fi;
+    - export PATH=$CMAKE_LATEST_PATH/bin:$PATH
+    # Debug printing of environment for context when errors occur
+    - hipconfig
+  cache:
+    key: $CMAKE_LATEST_VERSION
+    paths:
+      - $CMAKE_LATEST_PATH
+
 build:cmake-latest:
   extends:
     - .deps:cmake-latest
@@ -92,6 +111,7 @@ build:cmake-latest:
     - cmake
       -G Ninja
       -D CMAKE_CXX_COMPILER=hipcc
+      -D CMAKE_CXX_FLAGS="-Wall -Wextra -Werror"
       -D CMAKE_BUILD_TYPE=Release
       -D BUILD_TEST=ON
       -D BUILD_EXAMPLE=ON
@@ -112,6 +132,31 @@ build:cmake-latest:
       - $BUILD_LATEST_DIR/CTestTestfile.cmake
     expire_in: 2 weeks
 
+# TODO: Enable the hip-cpu CI step
+#build-cpu:cmake-latest:
+#  extends:
+#    - .deps-cpu:cmake-latest
+#  tags:
+#    - s9300
+#    - rocm
+#  script:
+#    - mkdir -p $BUILD_LATEST_DIR
+#    - cd $BUILD_LATEST_DIR
+#    - cmake
+#      -G Ninja
+#      -D CMAKE_CXX_COMPILER=g++-9
+#      -D CMAKE_CXX_FLAGS="-Wall -Wextra"
+#      -D CMAKE_BUILD_TYPE=Release
+#      -D BUILD_TEST=ON
+#      -D BUILD_EXAMPLE=OFF
+#      -D BUILD_BENCHMARK=OFF
+#      -D USE_HIP_CPU=ON
+#      -S $CI_PROJECT_DIR
+#      -B $BUILD_LATEST_DIR
+#    - cmake
+#      --build $BUILD_LATEST_DIR
+#      --parallel 3
+
 build:cmake-minimum:
   extends:
     - .deps:cmake-minimum
@@ -126,6 +171,7 @@ build:cmake-minimum:
     - cmake
       -G Ninja
       -D CMAKE_CXX_COMPILER=hipcc
+      -D CMAKE_CXX_FLAGS="-Wall -Wextra -Werror"
      -D CMAKE_BUILD_TYPE=Release
      -D BUILD_TEST=ON
      -D BUILD_EXAMPLE=ON
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2b7197f82..e83746b71 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,10 @@
 
 Full documentation for rocPRIM is available at [https://codedocs.xyz/ROCmSoftwarePlatform/rocPRIM/](https://codedocs.xyz/ROCmSoftwarePlatform/rocPRIM/)
 
+## [Unreleased rocPRIM-Next]
+### Added
+- Experimental [HIP-CPU](https://github.com/ROCm-Developer-Tools/HIP-CPU) support; builds with GCC/Clang/MSVC on Windows and Linux. It is a work in progress; many algorithms are still known to fail.
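[Editorial aside, not part of the patch] The CHANGELOG entry above only names the new HIP-CPU back-end, while the commented-out build-cpu:cmake-latest job shows the intended configure line (g++-9 with -D USE_HIP_CPU=ON). As a hedged sketch of what host-side execution could look like against such a build, the snippet below calls the device-level rocprim::reduce from plain C++ compiled by a host compiler; the two-phase temporary-storage query and the explicit void** casts mirror the benchmark changes later in this diff, but the exact reduce overload and header layout should be checked against the installed library.

// Hypothetical usage sketch for a HIP-CPU build of rocPRIM (USE_HIP_CPU=ON); not part of this patch.
#include <hip/hip_runtime.h>   // provided by the HIP-CPU runtime in this configuration
#include <rocprim/rocprim.hpp>

#include <cstddef>
#include <iostream>
#include <numeric>
#include <vector>

int main()
{
    const size_t size = 1024;
    std::vector<int> input(size);
    std::iota(input.begin(), input.end(), 1);

    int* d_input = nullptr;
    int* d_output = nullptr;
    // Explicit void** casts, as introduced throughout the benchmarks in this patch.
    hipMalloc(reinterpret_cast<void**>(&d_input), size * sizeof(int));
    hipMalloc(reinterpret_cast<void**>(&d_output), sizeof(int));
    hipMemcpy(d_input, input.data(), size * sizeof(int), hipMemcpyHostToDevice);

    // Two-phase call: query the temporary storage size first, then run the reduction.
    void* d_temp_storage = nullptr;
    size_t temp_storage_bytes = 0;
    rocprim::reduce(d_temp_storage, temp_storage_bytes, d_input, d_output, size, rocprim::plus<int>());
    hipMalloc(&d_temp_storage, temp_storage_bytes);
    rocprim::reduce(d_temp_storage, temp_storage_bytes, d_input, d_output, size, rocprim::plus<int>());

    int result = 0;
    hipMemcpy(&result, d_output, sizeof(int), hipMemcpyDeviceToHost);
    std::cout << "sum = " << result << '\n';   // expected: 1024 * 1025 / 2 = 524800

    hipFree(d_temp_storage);
    hipFree(d_output);
    hipFree(d_input);
    return 0;
}

Under a HIP-CPU build these calls dispatch to the host runtime, which is why the commented-out CI job configures with g++-9 rather than hipcc.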
+ ## [Unreleased rocPRIM-2.10.11 for ROCm 4.4.0] ### Added - Code coverage tools build option diff --git a/CMakeLists.txt b/CMakeLists.txt index f0c3dae44..ca6490c0c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -31,6 +31,14 @@ list( APPEND CMAKE_PREFIX_PATH /opt/rocm/llvm /opt/rocm ) # rocPRIM project project(rocprim LANGUAGES CXX) +# Build options +option(BUILD_TEST "Build tests (requires googletest)" OFF) +option(BUILD_BENCHMARK "Build benchmarks" OFF) +option(BUILD_EXAMPLE "Build examples" OFF) +option(USE_HIP_CPU "Prefer HIP-CPU runtime instead of HW acceleration" OFF) +# Disables building tests, benchmarks, examples +option(ONLY_INSTALL "Only install" OFF) + # CMake modules list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake @@ -46,36 +54,35 @@ endif() set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE CACHE BOOL "Add paths to linker search and installed rpath") -# Get dependencies -include(cmake/Dependencies.cmake) - -# Set the AMDGPU_TARGETS with backward compatiblity -# Use target ID syntax if supported for AMDGPU_TARGETS -if(COMMAND rocm_check_target_ids) - rocm_check_target_ids(DEFAULT_AMDGPU_TARGETS - TARGETS "gfx803;gfx900:xnack-;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack-;gfx90a:xnack+;gfx1030" - ) -else() - # Detect compiler support for target ID - # This section is deprecated. Please use rocm_check_target_ids for future use. - if( CMAKE_CXX_COMPILER MATCHES ".*/hipcc$" ) - execute_process(COMMAND ${CMAKE_CXX_COMPILER} "--help" - OUTPUT_VARIABLE CXX_OUTPUT - OUTPUT_STRIP_TRAILING_WHITESPACE - ERROR_STRIP_TRAILING_WHITESPACE) - string(REGEX MATCH ".mcode\-object\-version" TARGET_ID_SUPPORT ${CXX_OUTPUT}) - endif() - if(TARGET_ID_SUPPORT) - set(DEFAULT_AMDGPU_TARGETS "gfx803;gfx900:xnack-;gfx906:xnack-;gfx908:xnack-") +if(NOT USE_HIP_CPU) + # Set the AMDGPU_TARGETS with backward compatiblity + # Use target ID syntax if supported for AMDGPU_TARGETS + if(COMMAND rocm_check_target_ids) + rocm_check_target_ids(DEFAULT_AMDGPU_TARGETS + TARGETS "gfx803;gfx900:xnack-;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack-;gfx90a:xnack+;gfx1030" + ) else() - set(DEFAULT_AMDGPU_TARGETS "gfx803;gfx900;gfx906;gfx908") + # Detect compiler support for target ID + # This section is deprecated. Please use rocm_check_target_ids for future use. 
+ if( CMAKE_CXX_COMPILER MATCHES ".*/hipcc$" ) + execute_process(COMMAND ${CMAKE_CXX_COMPILER} "--help" + OUTPUT_VARIABLE CXX_OUTPUT + OUTPUT_STRIP_TRAILING_WHITESPACE + ERROR_STRIP_TRAILING_WHITESPACE) + string(REGEX MATCH ".mcode\-object\-version" TARGET_ID_SUPPORT ${CXX_OUTPUT}) + endif() + if(TARGET_ID_SUPPORT) + set(DEFAULT_AMDGPU_TARGETS "gfx803;gfx900:xnack-;gfx906:xnack-;gfx908:xnack-;gfx1030") + else() + set(DEFAULT_AMDGPU_TARGETS "gfx803;gfx900;gfx906;gfx908;gfx1030") + endif() endif() -endif() -set(AMDGPU_TARGETS "${DEFAULT_AMDGPU_TARGETS}" CACHE STRING "List of specific machine types for library to target") -set(AMDGPU_TEST_TARGETS "" CACHE STRING "List of specific device types to test for") # Leave empty for default system device + set(AMDGPU_TARGETS "${DEFAULT_AMDGPU_TARGETS}" CACHE STRING "List of specific machine types for library to target") + set(AMDGPU_TEST_TARGETS "" CACHE STRING "List of specific device types to test for") # Leave empty for default system device -# Verify that hcc compiler is used on ROCM platform -include(cmake/VerifyCompiler.cmake) + # Verify that hcc compiler is used on ROCM platform + include(cmake/VerifyCompiler.cmake) +endif() # Build options # Disable -Werror @@ -98,11 +105,8 @@ set(CMAKE_CXX_STANDARD 14) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS OFF) -if(DISABLE_WERROR) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra") -else() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Werror") -endif() +# Get dependencies +include(cmake/Dependencies.cmake) # Setup VERSION set(VERSION_STRING "2.10.9") diff --git a/README.md b/README.md index 8409a0894..3e098f0c3 100644 --- a/README.md +++ b/README.md @@ -49,6 +49,11 @@ cd rocPRIM; mkdir build; cd build # before 'cmake' or setting cmake option 'CMAKE_CXX_COMPILER' to path to the compiler. # Using HIP-clang: [CXX=hipcc] cmake -DBUILD_BENCHMARK=ON ../. +# +# ! EXPERIMENTAL ! +# Alternatively one may build using the experimental (and highly incomplete) HIP-CPU back-end for host-side +# execution using any C++17 conforming compiler (supported by HIP-CPU). AMDGPU_* options are unavailable in this case. 
+# USE_HIP_CPU - OFF by default # Build make -j4 diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index 95b60c539..9f289745c 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -24,18 +24,40 @@ option(BENCHMARK_CONFIG_TUNING "Benchmark device-level functions using various c function(add_rocprim_benchmark BENCHMARK_SOURCE) get_filename_component(BENCHMARK_TARGET ${BENCHMARK_SOURCE} NAME_WE) + add_executable(${BENCHMARK_TARGET} ${BENCHMARK_SOURCE}) + target_link_libraries(${BENCHMARK_TARGET} PRIVATE - rocprim_hip + rocprim benchmark::benchmark ) - foreach(amdgpu_target ${AMDGPU_TARGETS}) + if(NOT USE_HIP_CPU) + target_link_libraries(${BENCHMARK_TARGET} + PRIVATE + rocprim_hip + ) + else() target_link_libraries(${BENCHMARK_TARGET} PRIVATE - --amdgpu-target=${amdgpu_target} + Threads::Threads + hip_cpu_rt::hip_cpu_rt ) - endforeach() + if(STL_DEPENDS_ON_TBB) + target_link_libraries(${BENCHMARK_TARGET} + PRIVATE + TBB::tbb + ) + endif() + endif() + + target_compile_options(${BENCHMARK_TARGET} + PRIVATE + $<$: + /bigobj # number of sections exceeded object file format limit: compile with /bigobj + > + ) + set_target_properties(${BENCHMARK_TARGET} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/benchmark" diff --git a/benchmark/benchmark_block_discontinuity.cpp b/benchmark/benchmark_block_discontinuity.cpp index 627123b21..fc7b4ea22 100644 --- a/benchmark/benchmark_block_discontinuity.cpp +++ b/benchmark/benchmark_block_discontinuity.cpp @@ -82,13 +82,13 @@ struct flag_heads __device__ static void run(const T * d_input, T * d_output) { - const unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; + const unsigned int lid = threadIdx.x; + const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; rp::block_load_direct_striped(lid, d_input + block_offset, input); - #pragma nounroll + ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { rp::block_discontinuity bdiscontinuity; @@ -125,13 +125,13 @@ struct flag_tails __device__ static void run(const T * d_input, T * d_output) { - const unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; + const unsigned int lid = threadIdx.x; + const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; rp::block_load_direct_striped(lid, d_input + block_offset, input); - #pragma nounroll + ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { rp::block_discontinuity bdiscontinuity; @@ -168,13 +168,13 @@ struct flag_heads_and_tails __device__ static void run(const T * d_input, T * d_output) { - const unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; + const unsigned int lid = threadIdx.x; + const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; rp::block_load_direct_striped(lid, d_input + block_offset, input); - #pragma nounroll + ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { rp::block_discontinuity bdiscontinuity; @@ -217,8 +217,8 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) std::vector input = get_random_data(size, T(0), T(10)); T * d_input; T * d_output; - HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); - HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * 
sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), size * sizeof(T))); HIP_CHECK( hipMemcpy( d_input, input.data(), diff --git a/benchmark/benchmark_block_exchange.cpp b/benchmark/benchmark_block_exchange.cpp index b8c30684c..2cdc964f1 100644 --- a/benchmark/benchmark_block_exchange.cpp +++ b/benchmark/benchmark_block_exchange.cpp @@ -80,13 +80,13 @@ struct blocked_to_striped __device__ static void run(const T * d_input, const unsigned int *, T * d_output) { - const unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; + const unsigned int lid = threadIdx.x; + const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; rp::block_load_direct_striped(lid, d_input + block_offset, input); - #pragma nounroll + ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { rp::block_exchange exchange; @@ -109,13 +109,13 @@ struct striped_to_blocked __device__ static void run(const T * d_input, const unsigned int *, T * d_output) { - const unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; + const unsigned int lid = threadIdx.x; + const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; rp::block_load_direct_striped(lid, d_input + block_offset, input); - #pragma nounroll + ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { rp::block_exchange exchange; @@ -138,13 +138,13 @@ struct blocked_to_warp_striped __device__ static void run(const T * d_input, const unsigned int *, T * d_output) { - const unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; + const unsigned int lid = threadIdx.x; + const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; rp::block_load_direct_striped(lid, d_input + block_offset, input); - #pragma nounroll + ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { rp::block_exchange exchange; @@ -167,13 +167,13 @@ struct warp_striped_to_blocked __device__ static void run(const T * d_input, const unsigned int *, T * d_output) { - const unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; + const unsigned int lid = threadIdx.x; + const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; rp::block_load_direct_striped(lid, d_input + block_offset, input); - #pragma nounroll + ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { rp::block_exchange exchange; @@ -196,15 +196,15 @@ struct scatter_to_blocked __device__ static void run(const T * d_input, const unsigned int * d_ranks, T * d_output) { - const unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; + const unsigned int lid = threadIdx.x; + const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; unsigned int ranks[ItemsPerThread]; rp::block_load_direct_striped(lid, d_input + block_offset, input); rp::block_load_direct_striped(lid, d_ranks + block_offset, ranks); - #pragma nounroll + ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { rp::block_exchange exchange; @@ -227,15 +227,15 @@ struct scatter_to_striped __device__ static void run(const T * d_input, const unsigned int * d_ranks, T * d_output) { - const 
unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; + const unsigned int lid = threadIdx.x; + const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; unsigned int ranks[ItemsPerThread]; rp::block_load_direct_striped(lid, d_input + block_offset, input); rp::block_load_direct_striped(lid, d_ranks + block_offset, ranks); - #pragma nounroll + ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { rp::block_exchange exchange; @@ -277,9 +277,9 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) T * d_input; unsigned int * d_ranks; T * d_output; - HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); - HIP_CHECK(hipMalloc(&d_ranks, size * sizeof(unsigned int))); - HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_ranks), size * sizeof(unsigned int))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), size * sizeof(T))); HIP_CHECK( hipMemcpy( d_input, input.data(), diff --git a/benchmark/benchmark_block_histogram.cpp b/benchmark/benchmark_block_histogram.cpp index 46bcfe5d5..8bc49aaf6 100644 --- a/benchmark/benchmark_block_histogram.cpp +++ b/benchmark/benchmark_block_histogram.cpp @@ -83,8 +83,9 @@ struct histogram __device__ static void run(const T* input, T* output) { - const unsigned int index = ((hipBlockIdx_x * BlockSize) + hipThreadIdx_x) * ItemsPerThread; - unsigned int global_offset = hipBlockIdx_x * BinSize; + // TODO: Move global_offset into final loop + const unsigned int index = ((blockIdx.x * BlockSize) + threadIdx.x) * ItemsPerThread; + unsigned int global_offset = blockIdx.x * BinSize; T values[ItemsPerThread]; for(unsigned int k = 0; k < ItemsPerThread; k++) @@ -96,18 +97,18 @@ struct histogram __shared__ T histogram[BinSize]; __shared__ typename bhistogram_t::storage_type storage; - #pragma nounroll + ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { bhistogram_t().histogram(values, histogram, storage); } - #pragma unroll + ROCPRIM_UNROLL for (unsigned int offset = 0; offset < BinSize; offset += BlockSize) { - if(offset + hipThreadIdx_x < BinSize) + if(offset + threadIdx.x < BinSize) { - output[global_offset + hipThreadIdx_x] = histogram[offset + hipThreadIdx_x]; + output[global_offset + threadIdx.x] = histogram[offset + threadIdx.x]; global_offset += BlockSize; } } @@ -132,8 +133,8 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) std::vector input(size, 0.0f); T * d_input; T * d_output; - HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); - HIP_CHECK(hipMalloc(&d_output, bin_size * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), bin_size * sizeof(T))); HIP_CHECK( hipMemcpy( d_input, input.data(), diff --git a/benchmark/benchmark_block_radix_sort.cpp b/benchmark/benchmark_block_radix_sort.cpp index 09707df88..992a3f2b5 100644 --- a/benchmark/benchmark_block_radix_sort.cpp +++ b/benchmark/benchmark_block_radix_sort.cpp @@ -71,13 +71,13 @@ __global__ __launch_bounds__(BlockSize) void sort_keys_kernel(const T * input, T * output) { - const unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; + const unsigned int lid = threadIdx.x; + const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T keys[ItemsPerThread]; 
rp::block_load_direct_striped(lid, input + block_offset, keys); - #pragma nounroll + ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { rp::block_radix_sort sort; @@ -97,8 +97,8 @@ __global__ __launch_bounds__(BlockSize) void sort_pairs_kernel(const T * input, T * output) { - const unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; + const unsigned int lid = threadIdx.x; + const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T keys[ItemsPerThread]; T values[ItemsPerThread]; @@ -108,7 +108,7 @@ void sort_pairs_kernel(const T * input, T * output) values[i] = keys[i] + T(1); } - #pragma nounroll + ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { rp::block_radix_sort sort; @@ -148,8 +148,8 @@ void run_benchmark(benchmark::State& state, benchmark_kinds benchmark_kind, hipS } T * d_input; T * d_output; - HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); - HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), size * sizeof(T))); HIP_CHECK( hipMemcpy( d_input, input.data(), diff --git a/benchmark/benchmark_block_reduce.cpp b/benchmark/benchmark_block_reduce.cpp index 3ea932a62..1bc7d414f 100644 --- a/benchmark/benchmark_block_reduce.cpp +++ b/benchmark/benchmark_block_reduce.cpp @@ -81,7 +81,7 @@ struct reduce __device__ static void run(const T* input, T* output) { - const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + const unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; T values[ItemsPerThread]; T reduced_value; @@ -93,16 +93,16 @@ struct reduce using breduce_t = rp::block_reduce; __shared__ typename breduce_t::storage_type storage; - #pragma nounroll + ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { breduce_t().reduce(values, reduced_value, storage); values[0] = reduced_value; } - if(hipThreadIdx_x == 0) + if(threadIdx.x == 0) { - output[hipBlockIdx_x] = reduced_value; + output[blockIdx.x] = reduced_value; } } }; @@ -123,8 +123,8 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) std::vector input(size, T(1)); T * d_input; T * d_output; - HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); - HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), size * sizeof(T))); HIP_CHECK( hipMemcpy( d_input, input.data(), diff --git a/benchmark/benchmark_block_scan.cpp b/benchmark/benchmark_block_scan.cpp index 4e150055a..ef95bc5ed 100644 --- a/benchmark/benchmark_block_scan.cpp +++ b/benchmark/benchmark_block_scan.cpp @@ -81,7 +81,7 @@ struct inclusive_scan __device__ static void run(const T* input, T* output) { - const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + const unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; T values[ItemsPerThread]; for(unsigned int k = 0; k < ItemsPerThread; k++) @@ -92,7 +92,7 @@ struct inclusive_scan using bscan_t = rp::block_scan; __shared__ typename bscan_t::storage_type storage; - #pragma nounroll + ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { bscan_t().inclusive_scan(values, values, storage); @@ -118,7 +118,7 @@ struct exclusive_scan __device__ static void run(const T* input, T* output) { - const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + const unsigned int 
i = blockIdx.x * blockDim.x + threadIdx.x; using U = typename std::remove_reference::type; T values[ItemsPerThread]; @@ -132,7 +132,7 @@ struct exclusive_scan using bscan_t = rp::block_scan; __shared__ typename bscan_t::storage_type storage; - #pragma nounroll + ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { bscan_t().exclusive_scan(values, values, init, storage); @@ -162,8 +162,8 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) std::vector input(size, T(1)); T * d_input; T * d_output; - HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); - HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), size * sizeof(T))); HIP_CHECK( hipMemcpy( d_input, input.data(), diff --git a/benchmark/benchmark_block_sort.cpp b/benchmark/benchmark_block_sort.cpp index 8b7108e1f..e56faaa25 100644 --- a/benchmark/benchmark_block_sort.cpp +++ b/benchmark/benchmark_block_sort.cpp @@ -70,11 +70,11 @@ __global__ __launch_bounds__(BlockSize) void sort_keys_kernel(const T * input, T * output) { - const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; + const unsigned int index = (blockIdx.x * BlockSize) + threadIdx.x; T key = input[index]; - #pragma nounroll + ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { rp::block_sort bsort; @@ -93,12 +93,12 @@ __global__ __launch_bounds__(BlockSize) void sort_pairs_kernel(const T * input, T * output) { - const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; + const unsigned int index = (blockIdx.x * BlockSize) + threadIdx.x; T key = input[index]; T value = key + T(1); - #pragma nounroll + ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { rp::block_sort bsort; @@ -115,7 +115,7 @@ template< > void run_benchmark(benchmark::State& state, benchmark_kinds benchmark_kind, hipStream_t stream, size_t N) { - constexpr auto block_size = BlockSize; + static constexpr auto block_size = BlockSize; const auto size = block_size * ((N + block_size - 1)/block_size); std::vector input; @@ -133,8 +133,8 @@ void run_benchmark(benchmark::State& state, benchmark_kinds benchmark_kind, hipS } T * d_input; T * d_output; - HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); - HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), size * sizeof(T))); HIP_CHECK( hipMemcpy( d_input, input.data(), @@ -151,16 +151,16 @@ void run_benchmark(benchmark::State& state, benchmark_kinds benchmark_kind, hipS if(benchmark_kind == benchmark_kinds::sort_keys) { hipLaunchKernelGGL( - HIP_KERNEL_NAME(sort_keys_kernel), - dim3(size/block_size), dim3(BlockSize), 0, stream, + HIP_KERNEL_NAME(sort_keys_kernel), + dim3(size/block_size), dim3(block_size), 0, stream, d_input, d_output ); } else if(benchmark_kind == benchmark_kinds::sort_pairs) { hipLaunchKernelGGL( - HIP_KERNEL_NAME(sort_pairs_kernel), - dim3(size/block_size), dim3(BlockSize), 0, stream, + HIP_KERNEL_NAME(sort_pairs_kernel), + dim3(size/block_size), dim3(block_size), 0, stream, d_input, d_output ); } diff --git a/benchmark/benchmark_device_binary_search.cpp b/benchmark/benchmark_device_binary_search.cpp index 0cce76b18..16291f317 100644 --- a/benchmark/benchmark_device_binary_search.cpp +++ b/benchmark/benchmark_device_binary_search.cpp @@ -82,9 +82,9 @@ void run_lower_bound_benchmark(benchmark::State& 
state, hipStream_t stream, haystack_type * d_haystack; needle_type * d_needles; output_type * d_output; - HIP_CHECK(hipMalloc(&d_haystack, haystack_size * sizeof(haystack_type))); - HIP_CHECK(hipMalloc(&d_needles, needles_size * sizeof(needle_type))); - HIP_CHECK(hipMalloc(&d_output, needles_size * sizeof(output_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_haystack), haystack_size * sizeof(haystack_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_needles), needles_size * sizeof(needle_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), needles_size * sizeof(output_type))); HIP_CHECK( hipMemcpy( d_haystack, haystack.data(), diff --git a/benchmark/benchmark_device_histogram.cpp b/benchmark/benchmark_device_histogram.cpp index be2bba4d8..fa50c53a2 100644 --- a/benchmark/benchmark_device_histogram.cpp +++ b/benchmark/benchmark_device_histogram.cpp @@ -62,7 +62,7 @@ std::vector generate(size_t size, int entropy_reduction, int lower_level, int { if(entropy_reduction >= 5) { - return std::vector(size, (lower_level + upper_level) / 2); + return std::vector(size, (T)((lower_level + upper_level) / 2)); } const size_t max_random_size = 1024 * 1024; @@ -125,8 +125,8 @@ void run_even_benchmark(benchmark::State& state, T * d_input; counter_type * d_histogram; - HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); - HIP_CHECK(hipMalloc(&d_histogram, size * sizeof(counter_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_histogram), size * sizeof(counter_type))); HIP_CHECK( hipMemcpy( d_input, input.data(), @@ -221,10 +221,10 @@ void run_multi_even_benchmark(benchmark::State& state, T * d_input; counter_type * d_histogram[ActiveChannels]; - HIP_CHECK(hipMalloc(&d_input, size * Channels * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * Channels * sizeof(T))); for(unsigned int channel = 0; channel < ActiveChannels; channel++) { - HIP_CHECK(hipMalloc(&d_histogram[channel], bins * sizeof(counter_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_histogram[channel]), bins * sizeof(counter_type))); } HIP_CHECK( hipMemcpy( @@ -312,9 +312,9 @@ void run_range_benchmark(benchmark::State& state, size_t bins, hipStream_t strea T * d_input; T * d_levels; counter_type * d_histogram; - HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); - HIP_CHECK(hipMalloc(&d_levels, (bins + 1) * sizeof(T))); - HIP_CHECK(hipMalloc(&d_histogram, size * sizeof(counter_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_levels), (bins + 1) * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_histogram), size * sizeof(counter_type))); HIP_CHECK( hipMemcpy( d_input, input.data(), diff --git a/benchmark/benchmark_device_memory.cpp b/benchmark/benchmark_device_memory.cpp index 74edd49fe..2e0edae6d 100644 --- a/benchmark/benchmark_device_memory.cpp +++ b/benchmark/benchmark_device_memory.cpp @@ -46,7 +46,6 @@ enum memory_operation_method { - memcpy, block_primitives_transpose, striped, vectorized, @@ -96,11 +95,11 @@ struct operation (void) shared_storage; (void) shared_storage_size; (void) global_mem_output; - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = 0; i < ItemsPerThread; i++) { input[i] = input[i] + 666; - #pragma unroll + ROCPRIM_UNROLL for(unsigned int j = 0; j < repeats; j++) { input[i] = input[i] * (input[j % ItemsPerThread]); @@ -153,9 +152,9 @@ struct operation (void) shared_storage; (void) shared_storage_size; (void) input; - unsigned int index = 
hipThreadIdx_x * ItemsPerThread + - hipBlockIdx_x * hipBlockDim_x * ItemsPerThread; - #pragma unroll + unsigned int index = threadIdx.x * ItemsPerThread + + blockIdx.x * blockDim.x * ItemsPerThread; + ROCPRIM_UNROLL for(unsigned int i = 0; i < ItemsPerThread; i++) { atomicAdd(&global_mem_output[index + i], T(666)); @@ -175,9 +174,9 @@ struct operation (void) shared_storage; (void) shared_storage_size; (void) input; - unsigned int index = (hipThreadIdx_x % warpSize) * ItemsPerThread + - hipBlockIdx_x * hipBlockDim_x * ItemsPerThread; - #pragma unroll + unsigned int index = (threadIdx.x % warpSize) * ItemsPerThread + + blockIdx.x * blockDim.x * ItemsPerThread; + ROCPRIM_UNROLL for(unsigned int i = 0; i < ItemsPerThread; i++) { atomicAdd(&global_mem_output[index + i], T(666)); @@ -197,8 +196,8 @@ struct operation (void) shared_storage; (void) shared_storage_size; (void) input; - unsigned int index = hipThreadIdx_x * ItemsPerThread; - #pragma unroll + unsigned int index = threadIdx.x * ItemsPerThread; + ROCPRIM_UNROLL for(unsigned int i = 0; i < ItemsPerThread; i++) { atomicAdd(&global_mem_output[index + i], T(666)); @@ -236,7 +235,7 @@ void operation_kernel(T* input, T* output, CustomOp op) typename block_store_type::storage_type store; } storage; - int offset = hipBlockIdx_x * items_per_block; + int offset = blockIdx.x * items_per_block; T items[ItemsPerThread]; load.load(input + offset, items, storage.load); @@ -260,17 +259,17 @@ __launch_bounds__(BlockSize) void operation_kernel(T* input, T* output, CustomOp op) { constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; - int offset = hipBlockIdx_x * items_per_block; + int offset = blockIdx.x * items_per_block; T items[ItemsPerThread]; rocprim::block_load_direct_blocked_vectorized - (hipThreadIdx_x, input + offset, items); + (threadIdx.x, input + offset, items); __syncthreads(); op(items, nullptr, 0, output); rocprim::block_store_direct_blocked_vectorized - (hipThreadIdx_x, output + offset, items); + (threadIdx.x, output + offset, items); } // striped method base kernel @@ -287,8 +286,8 @@ __global__ __launch_bounds__(BlockSize) void operation_kernel(T* input, T* output, CustomOp op) { - const unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; + const unsigned int lid = threadIdx.x; + const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T items[ItemsPerThread]; rocprim::block_load_direct_striped(lid, input + block_offset, items); op(items, nullptr, 0, output); @@ -325,7 +324,7 @@ void operation_kernel(T* input, T* output, CustomOp op) typename block_store_type::storage_type store; } storage; - int offset = hipBlockIdx_x * items_per_block; + int offset = blockIdx.x * items_per_block; T items[ItemsPerThread]; load.load(input + offset, items, storage.load); @@ -361,8 +360,8 @@ void run_benchmark(benchmark::State& state, } T * d_input; T * d_output; - HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); - HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), size * sizeof(T))); HIP_CHECK( hipMemcpy( d_input, input.data(), @@ -432,8 +431,8 @@ void run_benchmark_memcpy(benchmark::State& state, } T * d_input; T * d_output; - HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); - HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * sizeof(T))); + 
HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), size * sizeof(T))); // Warm-up for(size_t i = 0; i < 10; i++) { diff --git a/benchmark/benchmark_device_merge.cpp b/benchmark/benchmark_device_merge.cpp index 464c6d104..4c7109935 100644 --- a/benchmark/benchmark_device_merge.cpp +++ b/benchmark/benchmark_device_merge.cpp @@ -78,9 +78,9 @@ void run_merge_keys_benchmark(benchmark::State& state, hipStream_t stream, size_ key_type * d_keys_input1; key_type * d_keys_input2; key_type * d_keys_output; - HIP_CHECK(hipMalloc(&d_keys_input1, size1 * sizeof(key_type))); - HIP_CHECK(hipMalloc(&d_keys_input2, size2 * sizeof(key_type))); - HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_input1), size1 * sizeof(key_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_input2), size2 * sizeof(key_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_output), size * sizeof(key_type))); HIP_CHECK( hipMemcpy( d_keys_input1, keys_input1.data(), @@ -179,12 +179,12 @@ void run_merge_pairs_benchmark(benchmark::State& state, hipStream_t stream, size value_type * d_values_input1; value_type * d_values_input2; value_type * d_values_output; - HIP_CHECK(hipMalloc(&d_keys_input1, size1 * sizeof(key_type))); - HIP_CHECK(hipMalloc(&d_keys_input2, size2 * sizeof(key_type))); - HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); - HIP_CHECK(hipMalloc(&d_values_input1, size1 * sizeof(value_type))); - HIP_CHECK(hipMalloc(&d_values_input2, size2 * sizeof(value_type))); - HIP_CHECK(hipMalloc(&d_values_output, size * sizeof(value_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_input1), size1 * sizeof(key_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_input2), size2 * sizeof(key_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_output), size * sizeof(key_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_values_input1), size1 * sizeof(value_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_values_input2), size2 * sizeof(value_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_values_output), size * sizeof(value_type))); HIP_CHECK( hipMemcpy( d_keys_input1, keys_input1.data(), diff --git a/benchmark/benchmark_device_merge_sort.cpp b/benchmark/benchmark_device_merge_sort.cpp index c3ca8de70..9d4cd83b4 100644 --- a/benchmark/benchmark_device_merge_sort.cpp +++ b/benchmark/benchmark_device_merge_sort.cpp @@ -80,8 +80,8 @@ void run_sort_keys_benchmark(benchmark::State& state, hipStream_t stream, size_t key_type * d_keys_input; key_type * d_keys_output; - HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); - HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_input), size * sizeof(key_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_output), size * sizeof(key_type))); HIP_CHECK( hipMemcpy( d_keys_input, keys_input.data(), @@ -173,8 +173,8 @@ void run_sort_pairs_benchmark(benchmark::State& state, hipStream_t stream, size_ key_type * d_keys_input; key_type * d_keys_output; - HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); - HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_input), size * sizeof(key_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_output), size * sizeof(key_type))); HIP_CHECK( hipMemcpy( d_keys_input, keys_input.data(), @@ -185,8 +185,8 @@ void run_sort_pairs_benchmark(benchmark::State& state, hipStream_t stream, size_ value_type * d_values_input; value_type * 
d_values_output; - HIP_CHECK(hipMalloc(&d_values_input, size * sizeof(value_type))); - HIP_CHECK(hipMalloc(&d_values_output, size * sizeof(value_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_values_input), size * sizeof(value_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_values_output), size * sizeof(value_type))); HIP_CHECK( hipMemcpy( d_values_input, values_input.data(), diff --git a/benchmark/benchmark_device_partition.cpp b/benchmark/benchmark_device_partition.cpp index f9a55ed67..c12710533 100644 --- a/benchmark/benchmark_device_partition.cpp +++ b/benchmark/benchmark_device_partition.cpp @@ -80,10 +80,10 @@ void run_flagged_benchmark(benchmark::State& state, FlagType * d_flags; T * d_output; unsigned int * d_selected_count_output; - HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T))); - HIP_CHECK(hipMalloc(&d_flags, flags.size() * sizeof(FlagType))); - HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(T))); - HIP_CHECK(hipMalloc(&d_selected_count_output, sizeof(unsigned int))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), input.size() * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_flags), flags.size() * sizeof(FlagType))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), input.size() * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_selected_count_output), sizeof(unsigned int))); HIP_CHECK( hipMemcpy( d_input, input.data(), @@ -186,9 +186,9 @@ void run_if_benchmark(benchmark::State& state, T * d_input; T * d_output; unsigned int * d_selected_count_output; - HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T))); - HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(T))); - HIP_CHECK(hipMalloc(&d_selected_count_output, sizeof(unsigned int))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), input.size() * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), input.size() * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_selected_count_output), sizeof(unsigned int))); HIP_CHECK( hipMemcpy( d_input, input.data(), diff --git a/benchmark/benchmark_device_radix_sort.cpp b/benchmark/benchmark_device_radix_sort.cpp index 268c4d137..9d678fe59 100644 --- a/benchmark/benchmark_device_radix_sort.cpp +++ b/benchmark/benchmark_device_radix_sort.cpp @@ -80,10 +80,10 @@ void run_sort_keys_benchmark(benchmark::State& state, key_type * d_keys_input; key_type * d_keys_output; - - HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); - HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); - + + HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_input), size * sizeof(key_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_output), size * sizeof(key_type))); + HIP_CHECK( hipMemcpy( d_keys_input, keys_input->data(), @@ -167,10 +167,10 @@ void run_sort_pairs_benchmark(benchmark::State& state, key_type * d_keys_input; key_type * d_keys_output; - - HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); - HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); - + + HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_input), size * sizeof(key_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_output), size * sizeof(key_type))); + HIP_CHECK( hipMemcpy( d_keys_input, keys_input->data(), @@ -181,10 +181,10 @@ void run_sort_pairs_benchmark(benchmark::State& state, value_type * d_values_input; value_type * d_values_output; - - HIP_CHECK(hipMalloc(&d_values_input, size * sizeof(value_type))); - HIP_CHECK(hipMalloc(&d_values_output, size * sizeof(value_type))); - + + HIP_CHECK(hipMalloc(reinterpret_cast(&d_values_input), size * 
sizeof(value_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_values_output), size * sizeof(value_type))); + HIP_CHECK( hipMemcpy( d_values_input, values_input.data(), @@ -259,95 +259,251 @@ void run_sort_pairs_benchmark(benchmark::State& state, #ifdef BENCHMARK_CONFIG_TUNING -#define CREATE_SORT_KEYS_BENCHMARK1(Key, LRB, SRB, BS1, IPT1, BS2, IPT2) \ -benchmarks.push_back( \ - benchmark::RegisterBenchmark( \ - (std::string("sort_keys") + "<" #Key ", radix_sort_config<" #LRB ", " #SRB ", kernel_config<" #BS1 ", " #IPT1 ">, kernel_config<" #BS2 ", " #IPT2 "> >").c_str(), \ - [=](benchmark::State& state) { run_sort_keys_benchmark, rocprim::kernel_config > >(state, stream, size, keys_input); } \ - ) \ -); - -#define CREATE_SORT_KEYS_BENCHMARK2(Key, LRB, SRB, BS1, IPT1, BS2) \ - CREATE_SORT_KEYS_BENCHMARK1(Key, LRB, SRB, BS1, IPT1, BS2, 2) \ - CREATE_SORT_KEYS_BENCHMARK1(Key, LRB, SRB, BS1, IPT1, BS2, 3) \ - CREATE_SORT_KEYS_BENCHMARK1(Key, LRB, SRB, BS1, IPT1, BS2, 4) \ - CREATE_SORT_KEYS_BENCHMARK1(Key, LRB, SRB, BS1, IPT1, BS2, 5) \ - CREATE_SORT_KEYS_BENCHMARK1(Key, LRB, SRB, BS1, IPT1, BS2, 6) \ - CREATE_SORT_KEYS_BENCHMARK1(Key, LRB, SRB, BS1, IPT1, BS2, 7) \ - CREATE_SORT_KEYS_BENCHMARK1(Key, LRB, SRB, BS1, IPT1, BS2, 8) \ - CREATE_SORT_KEYS_BENCHMARK1(Key, LRB, SRB, BS1, IPT1, BS2, 9) \ - CREATE_SORT_KEYS_BENCHMARK1(Key, LRB, SRB, BS1, IPT1, BS2, 10) \ - CREATE_SORT_KEYS_BENCHMARK1(Key, LRB, SRB, BS1, IPT1, BS2, 11) \ - CREATE_SORT_KEYS_BENCHMARK1(Key, LRB, SRB, BS1, IPT1, BS2, 12) \ - CREATE_SORT_KEYS_BENCHMARK1(Key, LRB, SRB, BS1, IPT1, BS2, 13) \ - CREATE_SORT_KEYS_BENCHMARK1(Key, LRB, SRB, BS1, IPT1, BS2, 14) \ - CREATE_SORT_KEYS_BENCHMARK1(Key, LRB, SRB, BS1, IPT1, BS2, 15) \ - CREATE_SORT_KEYS_BENCHMARK1(Key, LRB, SRB, BS1, IPT1, BS2, 16) \ - CREATE_SORT_KEYS_BENCHMARK1(Key, LRB, SRB, BS1, IPT1, BS2, 17) \ - CREATE_SORT_KEYS_BENCHMARK1(Key, LRB, SRB, BS1, IPT1, BS2, 18) \ - CREATE_SORT_KEYS_BENCHMARK1(Key, LRB, SRB, BS1, IPT1, BS2, 19) \ - CREATE_SORT_KEYS_BENCHMARK1(Key, LRB, SRB, BS1, IPT1, BS2, 20) - -#define CREATE_SORT_KEYS_BENCHMARK3(Key, BS1, IPT1) \ - CREATE_SORT_KEYS_BENCHMARK2(Key, 4, 3, BS1, IPT1, 256) \ - CREATE_SORT_KEYS_BENCHMARK2(Key, 5, 4, BS1, IPT1, 256) \ - CREATE_SORT_KEYS_BENCHMARK2(Key, 6, 4, BS1, IPT1, 256) \ - CREATE_SORT_KEYS_BENCHMARK2(Key, 7, 6, BS1, IPT1, 256) \ - CREATE_SORT_KEYS_BENCHMARK2(Key, 8, 7, BS1, IPT1, 256) +template< + typename Key, typename Value, + unsigned int LRB, unsigned int SRB, + unsigned int BlockSize1, unsigned int ItemsPerThread1, + unsigned int BlockSize2, unsigned int ItemsPerThread2 +> +auto sort_keys_add_benchmark( + std::vector& benchmarks, + hipStream_t stream, + size_t size) + -> typename std::enable_if< + std::is_same::value, void + >::type +{ + auto keys_input = std::make_shared>(generate_keys(size)); + benchmarks.push_back( + benchmark::RegisterBenchmark( + (std::string("sort_keys") + "<" + typeid(Key).name() + "radix_sort_config<" + + std::to_string(LRB) + ", " + std::to_string(SRB) + ", kernel_config<" + + std::to_string(BlockSize1) + ", " + std::to_string(ItemsPerThread1) + ">, kernel_config<" + + std::to_string(BlockSize2) + ", " + std::to_string(ItemsPerThread2) + "> >").c_str(), + [=](benchmark::State& state) { + run_sort_keys_benchmark< + Key, + rocprim::radix_sort_config< + LRB, + SRB, + rocprim::kernel_config, + rocprim::kernel_config + > + >(state, stream, size, keys_input); + } + ) + ); +} -#define CREATE_SORT_KEYS_BENCHMARK(Key) \ - { \ - auto keys_input = std::make_shared>(generate_keys(size)); \ - 
CREATE_SORT_KEYS_BENCHMARK3(Key, 256, 1) \ - CREATE_SORT_KEYS_BENCHMARK3(Key, 256, 2) \ - CREATE_SORT_KEYS_BENCHMARK3(Key, 256, 4) \ - CREATE_SORT_KEYS_BENCHMARK3(Key, 256, 8) \ - } +template< + typename Key, typename Value, + unsigned int LRB, unsigned int SRB, + unsigned int BlockSize1, unsigned int ItemsPerThread1, + unsigned int BlockSize2, unsigned int ItemsPerThread2 +> +auto sort_keys_add_benchmark( + std::vector& benchmarks, + hipStream_t stream, + size_t size) + -> typename std::enable_if< + !std::is_same::value, void + >::type +{ + auto keys_input = std::make_shared>(generate_keys(size)); + benchmarks.push_back( + benchmark::RegisterBenchmark( + (std::string("sort_pairs") + "<" + typeid(Key).name() + "," + typeid(Value).name() + + "radix_sort_config<" + std::to_string(LRB) + ", " + std::to_string(SRB) + ", kernel_config<" + + std::to_string(BlockSize1) + ", " + std::to_string(ItemsPerThread1) + ">, kernel_config<" + + std::to_string(BlockSize2) + ", " + std::to_string(ItemsPerThread2) + "> >").c_str(), + [=](benchmark::State& state) { + run_sort_pairs_benchmark< + Key, Value, + rocprim::radix_sort_config< + LRB, + SRB, + rocprim::kernel_config, + rocprim::kernel_config + > + >(state, stream, size, keys_input); + } + ) + ); +} -#define CREATE_SORT_PAIRS_BENCHMARK1(Key, Value, LRB, SRB, BS1, IPT1, BS2, IPT2) \ -benchmarks.push_back( \ - benchmark::RegisterBenchmark( \ - (std::string("sort_pairs") + "<" #Key ", " #Value ", radix_sort_config<" #LRB ", " #SRB ", kernel_config<" #BS1 ", " #IPT1 ">, kernel_config<" #BS2 ", " #IPT2 "> >").c_str(), \ - [=](benchmark::State& state) { run_sort_pairs_benchmark, rocprim::kernel_config > >(state, stream, size, keys_input); } \ - ) \ -); - -#define CREATE_SORT_PAIRS_BENCHMARK2(Key, Value, LRB, SRB, BS1, IPT1, BS2) \ - CREATE_SORT_PAIRS_BENCHMARK1(Key, Value, LRB, SRB, BS1, IPT1, BS2, 2) \ - CREATE_SORT_PAIRS_BENCHMARK1(Key, Value, LRB, SRB, BS1, IPT1, BS2, 3) \ - CREATE_SORT_PAIRS_BENCHMARK1(Key, Value, LRB, SRB, BS1, IPT1, BS2, 4) \ - CREATE_SORT_PAIRS_BENCHMARK1(Key, Value, LRB, SRB, BS1, IPT1, BS2, 5) \ - CREATE_SORT_PAIRS_BENCHMARK1(Key, Value, LRB, SRB, BS1, IPT1, BS2, 6) \ - CREATE_SORT_PAIRS_BENCHMARK1(Key, Value, LRB, SRB, BS1, IPT1, BS2, 7) \ - CREATE_SORT_PAIRS_BENCHMARK1(Key, Value, LRB, SRB, BS1, IPT1, BS2, 8) \ - CREATE_SORT_PAIRS_BENCHMARK1(Key, Value, LRB, SRB, BS1, IPT1, BS2, 9) \ - CREATE_SORT_PAIRS_BENCHMARK1(Key, Value, LRB, SRB, BS1, IPT1, BS2, 10) \ - CREATE_SORT_PAIRS_BENCHMARK1(Key, Value, LRB, SRB, BS1, IPT1, BS2, 11) \ - CREATE_SORT_PAIRS_BENCHMARK1(Key, Value, LRB, SRB, BS1, IPT1, BS2, 12) \ - CREATE_SORT_PAIRS_BENCHMARK1(Key, Value, LRB, SRB, BS1, IPT1, BS2, 13) \ - CREATE_SORT_PAIRS_BENCHMARK1(Key, Value, LRB, SRB, BS1, IPT1, BS2, 14) \ - CREATE_SORT_PAIRS_BENCHMARK1(Key, Value, LRB, SRB, BS1, IPT1, BS2, 15) \ - CREATE_SORT_PAIRS_BENCHMARK1(Key, Value, LRB, SRB, BS1, IPT1, BS2, 16) \ - CREATE_SORT_PAIRS_BENCHMARK1(Key, Value, LRB, SRB, BS1, IPT1, BS2, 17) \ - CREATE_SORT_PAIRS_BENCHMARK1(Key, Value, LRB, SRB, BS1, IPT1, BS2, 18) \ - CREATE_SORT_PAIRS_BENCHMARK1(Key, Value, LRB, SRB, BS1, IPT1, BS2, 19) \ - CREATE_SORT_PAIRS_BENCHMARK1(Key, Value, LRB, SRB, BS1, IPT1, BS2, 20) - -#define CREATE_SORT_PAIRS_BENCHMARK3(Key, Value, BS1, IPT1) \ - CREATE_SORT_PAIRS_BENCHMARK2(Key, Value, 4, 3, BS1, IPT1, 256) \ - CREATE_SORT_PAIRS_BENCHMARK2(Key, Value, 5, 4, BS1, IPT1, 256) \ - CREATE_SORT_PAIRS_BENCHMARK2(Key, Value, 6, 4, BS1, IPT1, 256) \ - CREATE_SORT_PAIRS_BENCHMARK2(Key, Value, 7, 6, BS1, IPT1, 256) \ - 
CREATE_SORT_PAIRS_BENCHMARK2(Key, Value, 8, 7, BS1, IPT1, 256) +template< + typename Key, typename Value, + unsigned int LRB, unsigned int SRB, + unsigned int BlockSize1, unsigned int ItemsPerThread1, + unsigned int BlockSize2, unsigned int ItemsPerThread2, + unsigned int MaxItemsPerThread1 +> +auto sort_keys_benchmark_generate_ipt1_grid( + std::vector& benchmarks, + hipStream_t stream, + size_t size) + -> typename std::enable_if< ItemsPerThread1 == MaxItemsPerThread1, void>::type +{ + sort_keys_add_benchmark< + Key, Value, LRB, SRB, + BlockSize1, ItemsPerThread1, + BlockSize2, ItemsPerThread2 + >(benchmarks, stream, size); +} -#define CREATE_SORT_PAIRS_BENCHMARK(Key, Value) \ - { \ - auto keys_input = std::make_shared>(generate_keys(size)); \ - CREATE_SORT_PAIRS_BENCHMARK3(Key, Value, 256, 1) \ - CREATE_SORT_PAIRS_BENCHMARK3(Key, Value, 256, 2) \ - CREATE_SORT_PAIRS_BENCHMARK3(Key, Value, 256, 4) \ - CREATE_SORT_PAIRS_BENCHMARK3(Key, Value, 256, 8) \ - } +template< + typename Key, typename Value, + unsigned int LRB, unsigned int SRB, + unsigned int BlockSize1, unsigned int ItemsPerThread1, + unsigned int BlockSize2, unsigned int ItemsPerThread2, + unsigned int MaxItemsPerThread1 +> +auto sort_keys_benchmark_generate_ipt1_grid( + std::vector& benchmarks, + hipStream_t stream, + size_t size) + -> typename std::enable_if< ItemsPerThread1 < MaxItemsPerThread1, void>::type +{ + sort_keys_add_benchmark< + Key, Value, LRB, SRB, + BlockSize1, ItemsPerThread1, + BlockSize2, ItemsPerThread2 + >(benchmarks, stream, size); + + sort_keys_benchmark_generate_ipt1_grid< + Key, Value, LRB, SRB, + BlockSize1, ItemsPerThread1 + 1, + BlockSize2, ItemsPerThread2, + MaxItemsPerThread1 + >(benchmarks, stream, size); +} + +template< + typename Key, typename Value, + unsigned int BlockSize1, + unsigned int BlockSize2, unsigned int ItemsPerThread2, + unsigned int MaxItemsPerThread1 +> +void sort_keys_benchmark_generate_radix_grid( + std::vector& benchmarks, + hipStream_t stream, + size_t size) +{ + sort_keys_benchmark_generate_ipt1_grid< + Key, Value, 4, 3, + BlockSize1, 1, + BlockSize2, ItemsPerThread2, + MaxItemsPerThread1 + >(benchmarks, stream, size); + + sort_keys_benchmark_generate_ipt1_grid< + Key, Value, 5, 4, + BlockSize1, 1, + BlockSize2, ItemsPerThread2, + MaxItemsPerThread1 + >(benchmarks, stream, size); + + sort_keys_benchmark_generate_ipt1_grid< + Key, Value, 6, 4, + BlockSize1, 1, + BlockSize2, ItemsPerThread2, + MaxItemsPerThread1 + >(benchmarks, stream, size); + + sort_keys_benchmark_generate_ipt1_grid< + Key, Value, 7, 6, + BlockSize1, 1, + BlockSize2, ItemsPerThread2, + MaxItemsPerThread1 + >(benchmarks, stream, size); + + sort_keys_benchmark_generate_ipt1_grid< + Key, Value, 8, 7, + BlockSize1, 1, + BlockSize2, ItemsPerThread2, + MaxItemsPerThread1 + >(benchmarks, stream, size); +} + +template< + typename Key, typename Value = ::rocprim::empty_type, + unsigned int BlockSize1 = 256U, + unsigned int BlockSize2 = 256U, + unsigned int MaxItemsPerThread1 = 20U +> +void sort_keys_benchmark_generate( + std::vector& benchmarks, + hipStream_t stream, + size_t size) +{ + sort_keys_benchmark_generate_radix_grid< + Key, Value, + BlockSize1, + BlockSize2, 1, + MaxItemsPerThread1 + >(benchmarks, stream, size); + + sort_keys_benchmark_generate_radix_grid< + Key, Value, + BlockSize1, + BlockSize2, 2, + MaxItemsPerThread1 + >(benchmarks, stream, size); + + sort_keys_benchmark_generate_radix_grid< + Key, Value, + BlockSize1, + BlockSize2, 4, + MaxItemsPerThread1 + >(benchmarks, stream, size); + + 
sort_keys_benchmark_generate_radix_grid< + Key, Value, + BlockSize1, + BlockSize2, 8, + MaxItemsPerThread1 + >(benchmarks, stream, size); +} + +// Compilation may never finish, if the compiler needs to compile too many kernels, +// it is recommended to compile benchmarks only for 1-2 types when BENCHMARK_CONFIG_TUNING is used +// (all other sort_keys_benchmark_generate should be commented/removed). +void add_sort_keys_benchmarks(std::vector& benchmarks, + hipStream_t stream, + size_t size) +{ + sort_keys_benchmark_generate(benchmarks, stream, size); + sort_keys_benchmark_generate(benchmarks, stream, size); + sort_keys_benchmark_generate(benchmarks, stream, size); + //sort_keys_benchmark_generate(benchmarks, stream, size); + sort_keys_benchmark_generate(benchmarks, stream, size); + sort_keys_benchmark_generate(benchmarks, stream, size); +} + +void add_sort_pairs_benchmarks(std::vector& benchmarks, + hipStream_t stream, + size_t size) +{ + //using custom_float2 = custom_type; + using custom_double2 = custom_type; + + sort_keys_benchmark_generate(benchmarks, stream, size); + sort_keys_benchmark_generate(benchmarks, stream, size); + //sort_keys_benchmark_generate(benchmarks, stream, size); + //sort_keys_benchmark_generate(benchmarks, stream, size); + //sort_keys_benchmark_generate(benchmarks, stream, size); + sort_keys_benchmark_generate(benchmarks, stream, size); + + sort_keys_benchmark_generate(benchmarks, stream, size); + sort_keys_benchmark_generate(benchmarks, stream, size); + //sort_keys_benchmark_generate(benchmarks, stream, size); + //sort_keys_benchmark_generate(benchmarks, stream, size); + //sort_keys_benchmark_generate(benchmarks, stream, size); + sort_keys_benchmark_generate(benchmarks, stream, size); + sort_keys_benchmark_generate(benchmarks, stream, size); + sort_keys_benchmark_generate(benchmarks, stream, size); + sort_keys_benchmark_generate(benchmarks, stream, size); +} #else // BENCHMARK_CONFIG_TUNING @@ -373,48 +529,47 @@ benchmarks.push_back( \ ); \ } -#endif // BENCHMARK_CONFIG_TUNING - -// Compilation may never finish, if the compiler needs to compile too many kernels, -// it is recommended to compile benchmarks only for 1-2 types when BENCHMARK_CONFIG_TUNING is used -// (all other CREATE_*_BENCHMARK should be commented/removed). - -void add_sort_keys_benchmarks(std::vector& benchmarks, - hipStream_t stream, - size_t size) -{ - CREATE_SORT_KEYS_BENCHMARK(int) - CREATE_SORT_KEYS_BENCHMARK(long long) - CREATE_SORT_KEYS_BENCHMARK(int8_t) - CREATE_SORT_KEYS_BENCHMARK(uint8_t) - CREATE_SORT_KEYS_BENCHMARK(rocprim::half) - CREATE_SORT_KEYS_BENCHMARK(short) -} + // Compilation may never finish, if the compiler needs to compile too many kernels, + // it is recommended to compile benchmarks only for 1-2 types when BENCHMARK_CONFIG_TUNING is used + // (all other CREATE_*_BENCHMARK should be commented/removed). 
+ void add_sort_keys_benchmarks(std::vector& benchmarks, + hipStream_t stream, + size_t size) + { + CREATE_SORT_KEYS_BENCHMARK(int) + CREATE_SORT_KEYS_BENCHMARK(long long) + CREATE_SORT_KEYS_BENCHMARK(int8_t) + CREATE_SORT_KEYS_BENCHMARK(uint8_t) + CREATE_SORT_KEYS_BENCHMARK(rocprim::half) + CREATE_SORT_KEYS_BENCHMARK(short) + } -void add_sort_pairs_benchmarks(std::vector& benchmarks, - hipStream_t stream, - size_t size) -{ - using custom_float2 = custom_type; - using custom_double2 = custom_type; + void add_sort_pairs_benchmarks(std::vector& benchmarks, + hipStream_t stream, + size_t size) + { + using custom_float2 = custom_type; + using custom_double2 = custom_type; + + CREATE_SORT_PAIRS_BENCHMARK(int, float) + CREATE_SORT_PAIRS_BENCHMARK(int, double) + CREATE_SORT_PAIRS_BENCHMARK(int, float2) + CREATE_SORT_PAIRS_BENCHMARK(int, custom_float2) + CREATE_SORT_PAIRS_BENCHMARK(int, double2) + CREATE_SORT_PAIRS_BENCHMARK(int, custom_double2) + + CREATE_SORT_PAIRS_BENCHMARK(long long, float) + CREATE_SORT_PAIRS_BENCHMARK(long long, double) + CREATE_SORT_PAIRS_BENCHMARK(long long, float2) + CREATE_SORT_PAIRS_BENCHMARK(long long, custom_float2) + CREATE_SORT_PAIRS_BENCHMARK(long long, double2) + CREATE_SORT_PAIRS_BENCHMARK(long long, custom_double2) + CREATE_SORT_PAIRS_BENCHMARK(int8_t, int8_t) + CREATE_SORT_PAIRS_BENCHMARK(uint8_t, uint8_t) + CREATE_SORT_PAIRS_BENCHMARK(rocprim::half, rocprim::half) + } - CREATE_SORT_PAIRS_BENCHMARK(int, float) - CREATE_SORT_PAIRS_BENCHMARK(int, double) - CREATE_SORT_PAIRS_BENCHMARK(int, float2) - CREATE_SORT_PAIRS_BENCHMARK(int, custom_float2) - CREATE_SORT_PAIRS_BENCHMARK(int, double2) - CREATE_SORT_PAIRS_BENCHMARK(int, custom_double2) - - CREATE_SORT_PAIRS_BENCHMARK(long long, float) - CREATE_SORT_PAIRS_BENCHMARK(long long, double) - CREATE_SORT_PAIRS_BENCHMARK(long long, float2) - CREATE_SORT_PAIRS_BENCHMARK(long long, custom_float2) - CREATE_SORT_PAIRS_BENCHMARK(long long, double2) - CREATE_SORT_PAIRS_BENCHMARK(long long, custom_double2) - CREATE_SORT_PAIRS_BENCHMARK(int8_t, int8_t) - CREATE_SORT_PAIRS_BENCHMARK(uint8_t, uint8_t) - CREATE_SORT_PAIRS_BENCHMARK(rocprim::half, rocprim::half) -} +#endif // BENCHMARK_CONFIG_TUNING int main(int argc, char *argv[]) { diff --git a/benchmark/benchmark_device_reduce.cpp b/benchmark/benchmark_device_reduce.cpp index bbf9f74be..ad7c27076 100644 --- a/benchmark/benchmark_device_reduce.cpp +++ b/benchmark/benchmark_device_reduce.cpp @@ -70,8 +70,8 @@ void run_benchmark(benchmark::State& state, T * d_input; T * d_output; - HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); - HIP_CHECK(hipMalloc(&d_output, sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), sizeof(T))); HIP_CHECK( hipMemcpy( d_input, input.data(), diff --git a/benchmark/benchmark_device_reduce_by_key.cpp b/benchmark/benchmark_device_reduce_by_key.cpp index aee6be050..61a7ef062 100644 --- a/benchmark/benchmark_device_reduce_by_key.cpp +++ b/benchmark/benchmark_device_reduce_by_key.cpp @@ -86,7 +86,7 @@ void run_benchmark(benchmark::State& state, size_t max_length, hipStream_t strea std::iota(values_input.begin(), values_input.end(), 0); key_type * d_keys_input; - HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_input), size * sizeof(key_type))); HIP_CHECK( hipMemcpy( d_keys_input, keys_input.data(), @@ -96,7 +96,7 @@ void run_benchmark(benchmark::State& state, size_t max_length, hipStream_t strea ); value_type * 
d_values_input; - HIP_CHECK(hipMalloc(&d_values_input, size * sizeof(value_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_values_input), size * sizeof(value_type))); HIP_CHECK( hipMemcpy( d_values_input, values_input.data(), @@ -108,9 +108,9 @@ void run_benchmark(benchmark::State& state, size_t max_length, hipStream_t strea key_type * d_unique_output; value_type * d_aggregates_output; unsigned int * d_unique_count_output; - HIP_CHECK(hipMalloc(&d_unique_output, unique_count * sizeof(key_type))); - HIP_CHECK(hipMalloc(&d_aggregates_output, unique_count * sizeof(value_type))); - HIP_CHECK(hipMalloc(&d_unique_count_output, sizeof(unsigned int))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_unique_output), unique_count * sizeof(key_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_aggregates_output), unique_count * sizeof(value_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_unique_count_output), sizeof(unsigned int))); void * d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; diff --git a/benchmark/benchmark_device_run_length_encode.cpp b/benchmark/benchmark_device_run_length_encode.cpp index a96366c62..282e4a79b 100644 --- a/benchmark/benchmark_device_run_length_encode.cpp +++ b/benchmark/benchmark_device_run_length_encode.cpp @@ -80,7 +80,7 @@ void run_encode_benchmark(benchmark::State& state, size_t max_length, hipStream_ } key_type * d_input; - HIP_CHECK(hipMalloc(&d_input, size * sizeof(key_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * sizeof(key_type))); HIP_CHECK( hipMemcpy( d_input, input.data(), @@ -92,9 +92,9 @@ void run_encode_benchmark(benchmark::State& state, size_t max_length, hipStream_ key_type * d_unique_output; count_type * d_counts_output; count_type * d_runs_count_output; - HIP_CHECK(hipMalloc(&d_unique_output, runs_count * sizeof(key_type))); - HIP_CHECK(hipMalloc(&d_counts_output, runs_count * sizeof(count_type))); - HIP_CHECK(hipMalloc(&d_runs_count_output, sizeof(count_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_unique_output), runs_count * sizeof(key_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_counts_output), runs_count * sizeof(count_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_runs_count_output), sizeof(count_type))); void * d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; @@ -183,7 +183,7 @@ void run_non_trivial_runs_benchmark(benchmark::State& state, size_t max_length, } key_type * d_input; - HIP_CHECK(hipMalloc(&d_input, size * sizeof(key_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * sizeof(key_type))); HIP_CHECK( hipMemcpy( d_input, input.data(), @@ -195,9 +195,9 @@ void run_non_trivial_runs_benchmark(benchmark::State& state, size_t max_length, offset_type * d_offsets_output; count_type * d_counts_output; count_type * d_runs_count_output; - HIP_CHECK(hipMalloc(&d_offsets_output, runs_count * sizeof(offset_type))); - HIP_CHECK(hipMalloc(&d_counts_output, runs_count * sizeof(count_type))); - HIP_CHECK(hipMalloc(&d_runs_count_output, sizeof(count_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_offsets_output), runs_count * sizeof(offset_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_counts_output), runs_count * sizeof(count_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_runs_count_output), sizeof(count_type))); void * d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; diff --git a/benchmark/benchmark_device_scan.cpp b/benchmark/benchmark_device_scan.cpp index b474cd50e..a5086a2e0 100644 --- a/benchmark/benchmark_device_scan.cpp +++ 
b/benchmark/benchmark_device_scan.cpp @@ -117,8 +117,8 @@ void run_benchmark(benchmark::State& state, T initial_value = T(123); T * d_input; T * d_output; - HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); - HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), size * sizeof(T))); HIP_CHECK( hipMemcpy( d_input, input.data(), @@ -186,43 +186,87 @@ void run_benchmark(benchmark::State& state, #ifdef BENCHMARK_CONFIG_TUNING -#define CREATE_BENCHMARK2(EXCL, T, SCAN_OP, BSA, BS, IPT) \ -benchmark::RegisterBenchmark( \ - (std::string(EXCL ? "exclusive_scan" : "inclusive_scan") + \ - ("<" #T ", " #SCAN_OP ", scan_config<" #BS ", " #IPT ", " #BSA "> >")).c_str(), \ - run_benchmark >, size, stream, SCAN_OP() \ -), +template< + bool EXCL, + typename T, typename SCAN_OP, + rocprim::block_scan_algorithm BSA, + unsigned int BS, unsigned int IPT +> +void scan_add_benchmark( + std::vector& benchmarks, + hipStream_t stream, + size_t size) +{ + benchmarks.push_back( + benchmark::RegisterBenchmark( + (std::string(EXCL ? "exclusive_scan" : "inclusive_scan") + "<" + typeid(T).name() + + ", " + typeid(SCAN_OP).name() + ", scan_config<" + std::to_string(BS) + ", " + + std::to_string(IPT) + ", " + std::string(BSA == rocprim::block_scan_algorithm::using_warp_scan + ? "using_warp_scan" : "using_reduce_scan") + "> >").c_str(), + run_benchmark< + EXCL, T, SCAN_OP, + typename rocprim::scan_config< + BS, IPT, true, + rocprim::block_load_method::block_load_transpose, + rocprim::block_store_method::block_store_transpose, BSA + > + >, size, stream, SCAN_OP() + ) + ); +} -#define CREATE_BENCHMARK1(EXCL, T, SCAN_OP, BSA, BS) \ - CREATE_BENCHMARK2(EXCL, T, SCAN_OP, BSA, BS, 1) \ - CREATE_BENCHMARK2(EXCL, T, SCAN_OP, BSA, BS, 2) \ - CREATE_BENCHMARK2(EXCL, T, SCAN_OP, BSA, BS, 3) \ - CREATE_BENCHMARK2(EXCL, T, SCAN_OP, BSA, BS, 4) \ - CREATE_BENCHMARK2(EXCL, T, SCAN_OP, BSA, BS, 5) \ - CREATE_BENCHMARK2(EXCL, T, SCAN_OP, BSA, BS, 6) \ - CREATE_BENCHMARK2(EXCL, T, SCAN_OP, BSA, BS, 7) \ - CREATE_BENCHMARK2(EXCL, T, SCAN_OP, BSA, BS, 8) \ - CREATE_BENCHMARK2(EXCL, T, SCAN_OP, BSA, BS, 9) \ - CREATE_BENCHMARK2(EXCL, T, SCAN_OP, BSA, BS, 10) \ - CREATE_BENCHMARK2(EXCL, T, SCAN_OP, BSA, BS, 11) \ - CREATE_BENCHMARK2(EXCL, T, SCAN_OP, BSA, BS, 12) \ - CREATE_BENCHMARK2(EXCL, T, SCAN_OP, BSA, BS, 13) \ - CREATE_BENCHMARK2(EXCL, T, SCAN_OP, BSA, BS, 14) \ - CREATE_BENCHMARK2(EXCL, T, SCAN_OP, BSA, BS, 15) \ - CREATE_BENCHMARK2(EXCL, T, SCAN_OP, BSA, BS, 16) \ - CREATE_BENCHMARK2(EXCL, T, SCAN_OP, BSA, BS, 17) \ - CREATE_BENCHMARK2(EXCL, T, SCAN_OP, BSA, BS, 18) \ - CREATE_BENCHMARK2(EXCL, T, SCAN_OP, BSA, BS, 19) \ - CREATE_BENCHMARK2(EXCL, T, SCAN_OP, BSA, BS, 20) +template< + bool EXCL, + typename T, typename SCAN_OP, + rocprim::block_scan_algorithm BSA, + unsigned int BS, unsigned int IPT, + unsigned int MaxItemsPerThread +> +auto scan_benchmark_generate_ipt_grid( + std::vector& benchmarks, + hipStream_t stream, + size_t size) + -> typename std::enable_if< IPT == MaxItemsPerThread, void>::type +{ + scan_add_benchmark< + EXCL, T, SCAN_OP, + BSA, BS, IPT + >(benchmarks, stream, size); +} + +template< + bool EXCL, + typename T, typename SCAN_OP, + rocprim::block_scan_algorithm BSA, + unsigned int BS, unsigned int IPT, + unsigned int MaxItemsPerThread +> +auto scan_benchmark_generate_ipt_grid( + std::vector& benchmarks, + hipStream_t stream, + size_t size) + -> typename std::enable_if< IPT < MaxItemsPerThread, 
void>::type +{ + scan_add_benchmark< + EXCL, T, SCAN_OP, + BSA, BS, IPT + >(benchmarks, stream, size); + + scan_benchmark_generate_ipt_grid< + EXCL, T, SCAN_OP, + BSA, BS, IPT + 1, + MaxItemsPerThread + >(benchmarks, stream, size); +} constexpr rocprim::block_scan_algorithm using_warp_scan = rocprim::block_scan_algorithm::using_warp_scan; constexpr rocprim::block_scan_algorithm reduce_then_scan = rocprim::block_scan_algorithm::reduce_then_scan; -#define CREATE_BENCHMARK(EXCL, T, SCAN_OP) \ - CREATE_BENCHMARK1(EXCL, T, SCAN_OP, using_warp_scan, 64) \ - CREATE_BENCHMARK1(EXCL, T, SCAN_OP, using_warp_scan, 128) \ - CREATE_BENCHMARK1(EXCL, T, SCAN_OP, using_warp_scan, 256) \ - CREATE_BENCHMARK1(EXCL, T, SCAN_OP, reduce_then_scan, 256) +#define CREATE_BENCHMARK(EXCL, T, SCAN_OP, MIPT) \ + scan_benchmark_generate_ipt_grid(benchmarks, stream, size); \ + scan_benchmark_generate_ipt_grid(benchmarks, stream, size); \ + scan_benchmark_generate_ipt_grid(benchmarks, stream, size); \ + scan_benchmark_generate_ipt_grid(benchmarks, stream, size); #else // BENCHMARK_CONFIG_TUNING @@ -256,48 +300,77 @@ int main(int argc, char *argv[]) std::cout << "[HIP] Device name: " << devProp.name << std::endl; using custom_double2 = custom_type; - using custom_float2 = custom_type; - // Compilation may never finish, if the compiler needs to compile too many kernels, - // it is recommended to compile benchmarks only for 1-2 types when BENCHMARK_CONFIG_TUNING is used - // (all other CREATE_*_BENCHMARK should be commented/removed). +#ifndef BENCHMARK_CONFIG_TUNING + using custom_float2 = custom_type; +#endif // Add benchmarks - std::vector benchmarks = - { - CREATE_BENCHMARK(false, int, rocprim::plus) - CREATE_BENCHMARK(true, int, rocprim::plus) + #ifdef BENCHMARK_CONFIG_TUNING + // Compilation may never finish, if the compiler needs to compile too many kernels, + // it is recommended to compile benchmarks only for 1-2 types when BENCHMARK_CONFIG_TUNING is used + // (all other CREATE_BENCHMARK should be commented/removed). 
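For reference, the items-per-thread grid above is generated with two mutually exclusive std::enable_if overloads: the recursive overload registers the benchmark for the current IPT and then instantiates itself for IPT + 1, while the terminating overload stops the recursion at MaxItemsPerThread. A minimal, self-contained sketch of the same pattern follows; the names in it are illustrative only and are not part of the patch.

    #include <iostream>
    #include <type_traits>

    // Terminating case: IPT has reached MaxIpt, register the last configuration.
    template<unsigned int IPT, unsigned int MaxIpt>
    auto register_ipt_grid() -> typename std::enable_if<IPT == MaxIpt, void>::type
    {
        std::cout << "register config with items_per_thread = " << IPT << '\n';
    }

    // Recursive case: register the current IPT, then instantiate IPT + 1.
    template<unsigned int IPT, unsigned int MaxIpt>
    auto register_ipt_grid() -> typename std::enable_if<(IPT < MaxIpt), void>::type
    {
        std::cout << "register config with items_per_thread = " << IPT << '\n';
        register_ipt_grid<IPT + 1, MaxIpt>();
    }

    int main()
    {
        register_ipt_grid<1, 4>(); // expands to items_per_thread = 1, 2, 3, 4
    }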
+ + std::vector benchmarks; + + CREATE_BENCHMARK(false, int, rocprim::plus, 20) + CREATE_BENCHMARK(true, int, rocprim::plus, 20) + + CREATE_BENCHMARK(false, float, rocprim::plus, 20) + CREATE_BENCHMARK(true, float, rocprim::plus, 20) + + CREATE_BENCHMARK(false, double, rocprim::plus, 15) + CREATE_BENCHMARK(true, double, rocprim::plus, 15) + + CREATE_BENCHMARK(false, long long, rocprim::plus, 15) + CREATE_BENCHMARK(true, long long, rocprim::plus, 15) + + CREATE_BENCHMARK(false, custom_double2, rocprim::plus, 15) + CREATE_BENCHMARK(true, custom_double2, rocprim::plus, 15) + + CREATE_BENCHMARK(false, int8_t, rocprim::plus, 20) + CREATE_BENCHMARK(true, int8_t, rocprim::plus, 20) + + CREATE_BENCHMARK(false, rocprim::half, rocprim::plus, 30) + CREATE_BENCHMARK(true, rocprim::half, rocprim::plus, 30) + #else + std::vector benchmarks = + { + CREATE_BENCHMARK(false, int, rocprim::plus) + CREATE_BENCHMARK(true, int, rocprim::plus) + + CREATE_BENCHMARK(false, float, rocprim::plus) + CREATE_BENCHMARK(true, float, rocprim::plus) - CREATE_BENCHMARK(false, float, rocprim::plus) - CREATE_BENCHMARK(true, float, rocprim::plus) + CREATE_BENCHMARK(false, double, rocprim::plus) + CREATE_BENCHMARK(true, double, rocprim::plus) - CREATE_BENCHMARK(false, double, rocprim::plus) - CREATE_BENCHMARK(true, double, rocprim::plus) + CREATE_BENCHMARK(false, long long, rocprim::plus) + CREATE_BENCHMARK(true, long long, rocprim::plus) - CREATE_BENCHMARK(false, long long, rocprim::plus) - CREATE_BENCHMARK(true, long long, rocprim::plus) + CREATE_BENCHMARK(false, float2, rocprim::plus) + CREATE_BENCHMARK(true, float2, rocprim::plus) - CREATE_BENCHMARK(false, float2, rocprim::plus) - CREATE_BENCHMARK(true, float2, rocprim::plus) + CREATE_BENCHMARK(false, custom_float2, rocprim::plus) + CREATE_BENCHMARK(true, custom_float2, rocprim::plus) - CREATE_BENCHMARK(false, custom_float2, rocprim::plus) - CREATE_BENCHMARK(true, custom_float2, rocprim::plus) + CREATE_BENCHMARK(false, double2, rocprim::plus) + CREATE_BENCHMARK(true, double2, rocprim::plus) - CREATE_BENCHMARK(false, double2, rocprim::plus) - CREATE_BENCHMARK(true, double2, rocprim::plus) + CREATE_BENCHMARK(false, custom_double2, rocprim::plus) + CREATE_BENCHMARK(true, custom_double2, rocprim::plus) - CREATE_BENCHMARK(false, custom_double2, rocprim::plus) - CREATE_BENCHMARK(true, custom_double2, rocprim::plus) + CREATE_BENCHMARK(false, int8_t, rocprim::plus) + CREATE_BENCHMARK(true, int8_t, rocprim::plus) - CREATE_BENCHMARK(false, int8_t, rocprim::plus) - CREATE_BENCHMARK(true, int8_t, rocprim::plus) + CREATE_BENCHMARK(false, uint8_t, rocprim::plus) + CREATE_BENCHMARK(true, uint8_t, rocprim::plus) - CREATE_BENCHMARK(false, uint8_t, rocprim::plus) - CREATE_BENCHMARK(true, uint8_t, rocprim::plus) + CREATE_BENCHMARK(false, rocprim::half, rocprim::plus) + CREATE_BENCHMARK(true, rocprim::half, rocprim::plus) + }; + #endif - CREATE_BENCHMARK(false, rocprim::half, rocprim::plus) - CREATE_BENCHMARK(true, rocprim::half, rocprim::plus) - }; // Use manual timing for(auto& b : benchmarks) diff --git a/benchmark/benchmark_device_segmented_radix_sort.cpp b/benchmark/benchmark_device_segmented_radix_sort.cpp index 5bcd31775..9d1801563 100644 --- a/benchmark/benchmark_device_segmented_radix_sort.cpp +++ b/benchmark/benchmark_device_segmented_radix_sort.cpp @@ -101,7 +101,7 @@ void run_sort_keys_benchmark(benchmark::State& state, } offset_type * d_offsets; - HIP_CHECK(hipMalloc(&d_offsets, (segments_count + 1) * sizeof(offset_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_offsets), 
(segments_count + 1) * sizeof(offset_type))); HIP_CHECK( hipMemcpy( d_offsets, offsets.data(), @@ -112,8 +112,8 @@ void run_sort_keys_benchmark(benchmark::State& state, key_type * d_keys_input; key_type * d_keys_output; - HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); - HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_input), size * sizeof(key_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_output), size * sizeof(key_type))); HIP_CHECK( hipMemcpy( d_keys_input, keys_input.data(), @@ -232,7 +232,7 @@ void run_sort_pairs_benchmark(benchmark::State& state, std::iota(values_input.begin(), values_input.end(), 0); offset_type * d_offsets; - HIP_CHECK(hipMalloc(&d_offsets, (segments_count + 1) * sizeof(offset_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_offsets), (segments_count + 1) * sizeof(offset_type))); HIP_CHECK( hipMemcpy( d_offsets, offsets.data(), @@ -243,8 +243,8 @@ void run_sort_pairs_benchmark(benchmark::State& state, key_type * d_keys_input; key_type * d_keys_output; - HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); - HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_input), size * sizeof(key_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_output), size * sizeof(key_type))); HIP_CHECK( hipMemcpy( d_keys_input, keys_input.data(), @@ -255,8 +255,8 @@ void run_sort_pairs_benchmark(benchmark::State& state, value_type * d_values_input; value_type * d_values_output; - HIP_CHECK(hipMalloc(&d_values_input, size * sizeof(value_type))); - HIP_CHECK(hipMalloc(&d_values_output, size * sizeof(value_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_values_input), size * sizeof(value_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_values_output), size * sizeof(value_type))); HIP_CHECK( hipMemcpy( d_values_input, values_input.data(), diff --git a/benchmark/benchmark_device_segmented_reduce.cpp b/benchmark/benchmark_device_segmented_reduce.cpp index 557e18619..b3038ed7a 100644 --- a/benchmark/benchmark_device_segmented_reduce.cpp +++ b/benchmark/benchmark_device_segmented_reduce.cpp @@ -86,7 +86,7 @@ void run_benchmark(benchmark::State& state, size_t desired_segments, hipStream_t std::iota(values_input.begin(), values_input.end(), 0); offset_type * d_offsets; - HIP_CHECK(hipMalloc(&d_offsets, (segments_count + 1) * sizeof(offset_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_offsets), (segments_count + 1) * sizeof(offset_type))); HIP_CHECK( hipMemcpy( d_offsets, offsets.data(), @@ -96,7 +96,7 @@ void run_benchmark(benchmark::State& state, size_t desired_segments, hipStream_t ); value_type * d_values_input; - HIP_CHECK(hipMalloc(&d_values_input, size * sizeof(value_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_values_input), size * sizeof(value_type))); HIP_CHECK( hipMemcpy( d_values_input, values_input.data(), @@ -106,7 +106,7 @@ void run_benchmark(benchmark::State& state, size_t desired_segments, hipStream_t ); value_type * d_aggregates_output; - HIP_CHECK(hipMalloc(&d_aggregates_output, segments_count * sizeof(value_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_aggregates_output), segments_count * sizeof(value_type))); rocprim::plus reduce_op; value_type init(0); diff --git a/benchmark/benchmark_device_select.cpp b/benchmark/benchmark_device_select.cpp index 7a375909d..6a8552862 100644 --- a/benchmark/benchmark_device_select.cpp +++ b/benchmark/benchmark_device_select.cpp @@ -79,10 +79,10 @@ void 
run_flagged_benchmark(benchmark::State& state, FlagType * d_flags; T * d_output; unsigned int * d_selected_count_output; - HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T))); - HIP_CHECK(hipMalloc(&d_flags, flags.size() * sizeof(FlagType))); - HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(T))); - HIP_CHECK(hipMalloc(&d_selected_count_output, sizeof(unsigned int))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), input.size() * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_flags), flags.size() * sizeof(FlagType))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), input.size() * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_selected_count_output), sizeof(unsigned int))); HIP_CHECK( hipMemcpy( d_input, input.data(), @@ -188,9 +188,9 @@ void run_selectop_benchmark(benchmark::State& state, T * d_input; T * d_output; unsigned int * d_selected_count_output; - HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T))); - HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(T))); - HIP_CHECK(hipMalloc(&d_selected_count_output, sizeof(unsigned int))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), input.size() * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), input.size() * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_selected_count_output), sizeof(unsigned int))); HIP_CHECK( hipMemcpy( d_input, input.data(), @@ -296,9 +296,9 @@ void run_unique_benchmark(benchmark::State& state, T * d_input; T * d_output; unsigned int * d_selected_count_output; - HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T))); - HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(T))); - HIP_CHECK(hipMalloc(&d_selected_count_output, sizeof(unsigned int))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), input.size() * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), input.size() * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_selected_count_output), sizeof(unsigned int))); HIP_CHECK( hipMemcpy( d_input, input.data(), diff --git a/benchmark/benchmark_device_transform.cpp b/benchmark/benchmark_device_transform.cpp index f26384df3..88c0d5499 100644 --- a/benchmark/benchmark_device_transform.cpp +++ b/benchmark/benchmark_device_transform.cpp @@ -79,8 +79,8 @@ void run_benchmark(benchmark::State& state, T * d_input; T * d_output; - HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); - HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), size * sizeof(T))); HIP_CHECK( hipMemcpy( d_input, input.data(), diff --git a/benchmark/benchmark_utils.hpp b/benchmark/benchmark_utils.hpp index 9868859ca..57b28eb91 100644 --- a/benchmark/benchmark_utils.hpp +++ b/benchmark/benchmark_utils.hpp @@ -40,13 +40,13 @@ // Support half operators on host side ROCPRIM_HOST inline -_Float16 half_to_native(const rocprim::half& x) +rocprim::native_half half_to_native(const rocprim::half& x) { - return *reinterpret_cast(&x); + return *reinterpret_cast(&x); } ROCPRIM_HOST inline -rocprim::half native_to_half(const _Float16& x) +rocprim::half native_to_half(const rocprim::native_half& x) { return *reinterpret_cast(&x); } @@ -90,15 +90,38 @@ struct half_equal_to } }; +// std::uniform_int_distribution is undefined for anything other than listed +// https://en.cppreference.com/w/cpp/numeric/random/uniform_int_distribution +template +struct is_valid_for_int_distribution : + std::integral_constant::value || + std::is_same::value || + std::is_same::value || + 
std::is_same::value || + std::is_same::value || + std::is_same::value || + std::is_same::value || + std::is_same::value + > {}; + +using engine_type = std::default_random_engine; + // get_random_data() generates only part of sequence and replicates it, // because benchmarks usually do not need "true" random sequence. -template -inline auto get_random_data(size_t size, T min, T max, size_t max_random_size = 1024 * 1024) +template +inline auto get_random_data(size_t size, U min, V max, size_t max_random_size = 1024 * 1024) -> typename std::enable_if::value, std::vector>::type { - std::random_device rd; - std::default_random_engine gen(rd()); - std::uniform_int_distribution distribution(min, max); + engine_type gen{std::random_device{}()}; + using dis_type = typename std::conditional< + is_valid_for_int_distribution::value, + T, + typename std::conditional::value, + int, + unsigned int>::type + >::type; + std::uniform_int_distribution distribution((T)min, (T)max); std::vector data(size); std::generate( data.begin(), data.begin() + std::min(size, max_random_size), @@ -111,14 +134,14 @@ inline auto get_random_data(size_t size, T min, T max, size_t max_random_size = return data; } -template -inline auto get_random_data(size_t size, T min, T max, size_t max_random_size = 1024 * 1024) +template +inline auto get_random_data(size_t size, U min, V max, size_t max_random_size = 1024 * 1024) -> typename std::enable_if::value, std::vector>::type { - std::random_device rd; - std::default_random_engine gen(rd()); + engine_type gen{std::random_device{}()}; + // Generate floats when T is half using dis_type = typename std::conditional::value, float, T>::type; - std::uniform_real_distribution distribution(min, max); + std::uniform_real_distribution distribution((dis_type)min, (dis_type)max); std::vector data(size); std::generate( data.begin(), data.begin() + std::min(size, max_random_size), @@ -134,8 +157,7 @@ inline auto get_random_data(size_t size, T min, T max, size_t max_random_size = template inline std::vector get_random_data01(size_t size, float p, size_t max_random_size = 1024 * 1024) { - std::random_device rd; - std::default_random_engine gen(rd()); + engine_type gen{std::random_device{}()}; std::bernoulli_distribution distribution(p); std::vector data(size); std::generate( diff --git a/benchmark/benchmark_warp_reduce.cpp b/benchmark/benchmark_warp_reduce.cpp index 9753ea787..7e996c6bc 100644 --- a/benchmark/benchmark_warp_reduce.cpp +++ b/benchmark/benchmark_warp_reduce.cpp @@ -63,13 +63,13 @@ __global__ __launch_bounds__(ROCPRIM_DEFAULT_MAX_BLOCK_SIZE) void warp_reduce_kernel(const T * d_input, T * d_output) { - const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + const unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; auto value = d_input[i]; using wreduce_t = rocprim::warp_reduce; __shared__ typename wreduce_t::storage_type storage; - #pragma nounroll + ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { wreduce_t().reduce(value, value, storage); @@ -88,14 +88,14 @@ __global__ __launch_bounds__(ROCPRIM_DEFAULT_MAX_BLOCK_SIZE) void segmented_warp_reduce_kernel(const T* d_input, Flag* d_flags, T* d_output) { - const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + const unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; auto value = d_input[i]; auto flag = d_flags[i]; using wreduce_t = rocprim::warp_reduce; __shared__ typename wreduce_t::storage_type storage; - #pragma nounroll + ROCPRIM_NO_UNROLL for(unsigned int trial = 0; 
trial < Trials; trial++) { wreduce_t().head_segmented_reduce(value, value, flag, storage); @@ -167,9 +167,9 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) T * d_input; flag_type * d_flags; T * d_output; - HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); - HIP_CHECK(hipMalloc(&d_flags, size * sizeof(flag_type))); - HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_flags), size * sizeof(flag_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), size * sizeof(T))); HIP_CHECK( hipMemcpy( d_input, input.data(), diff --git a/benchmark/benchmark_warp_scan.cpp b/benchmark/benchmark_warp_scan.cpp index c36cdd1a9..97a53083e 100644 --- a/benchmark/benchmark_warp_scan.cpp +++ b/benchmark/benchmark_warp_scan.cpp @@ -59,12 +59,12 @@ __global__ __launch_bounds__(ROCPRIM_DEFAULT_MAX_BLOCK_SIZE) void warp_inclusive_scan_kernel(const T* input, T* output) { - const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + const unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; auto value = input[i]; using wscan_t = rp::warp_scan; __shared__ typename wscan_t::storage_type storage; - #pragma nounroll + ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { wscan_t().inclusive_scan(value, value, storage); @@ -78,12 +78,12 @@ __global__ __launch_bounds__(ROCPRIM_DEFAULT_MAX_BLOCK_SIZE) void warp_exclusive_scan_kernel(const T* input, T* output, const T init) { - const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + const unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; auto value = input[i]; using wscan_t = rp::warp_scan; __shared__ typename wscan_t::storage_type storage; - #pragma nounroll + ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { wscan_t().exclusive_scan(value, value, init, storage); @@ -104,11 +104,11 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t size) // Make sure size is a multiple of BlockSize size = BlockSize * ((size + BlockSize - 1)/BlockSize); // Allocate and fill memory - std::vector input(size, 1.0f); + std::vector input(size, (T)1); T * d_input; T * d_output; - HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); - HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), size * sizeof(T))); HIP_CHECK( hipMemcpy( d_input, input.data(), diff --git a/benchmark/benchmark_warp_sort.cpp b/benchmark/benchmark_warp_sort.cpp index 0c611dfeb..7c491bdf7 100644 --- a/benchmark/benchmark_warp_sort.cpp +++ b/benchmark/benchmark_warp_sort.cpp @@ -59,11 +59,11 @@ __global__ __launch_bounds__(ROCPRIM_DEFAULT_MAX_BLOCK_SIZE) void warp_sort_kernel(K* input_key) { - const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + const unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; auto key = input_key[i]; rp::warp_sort wsort; - #pragma nounroll + ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { wsort.sort(key); @@ -76,12 +76,12 @@ __global__ __launch_bounds__(ROCPRIM_DEFAULT_MAX_BLOCK_SIZE) void warp_sort_by_key_kernel(K* input_key, V* input_value) { - const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + const unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; auto key = input_key[i]; auto value = input_value[i]; rp::warp_sort wsort; - #pragma nounroll + ROCPRIM_NO_UNROLL 
for(unsigned int trial = 0; trial < Trials; trial++) { wsort.sort(key, value); @@ -121,13 +121,13 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t size) // Make sure size is a multiple of BlockSize size = BlockSize * ((size + BlockSize - 1)/BlockSize); // Allocate and fill memory - std::vector input_key = get_random_data(size, Key(0), get_max_value()); + std::vector input_key = get_random_data(size, 0, get_max_value()); std::vector input_value(size_t(1)); - if(SortByKey) input_value = get_random_data(size, Value(0), get_max_value()); + if(SortByKey) input_value = get_random_data(size, 0, get_max_value()); Key * d_input_key = nullptr; Value * d_input_value = nullptr; - HIP_CHECK(hipMalloc(&d_input_key, size * sizeof(Key))); - if(SortByKey) HIP_CHECK(hipMalloc(&d_input_value, size * sizeof(Value))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_input_key), size * sizeof(Key))); + if(SortByKey) HIP_CHECK(hipMalloc(reinterpret_cast(&d_input_value), size * sizeof(Value))); HIP_CHECK( hipMemcpy( d_input_key, input_key.data(), diff --git a/benchmark/cmdparser.hpp b/benchmark/cmdparser.hpp index a502c77be..0dfc73ca2 100644 --- a/benchmark/cmdparser.hpp +++ b/benchmark/cmdparser.hpp @@ -186,6 +186,13 @@ namespace cli { return std::stoul(elements[0]); } + static unsigned long long parse(const std::vector& elements, const unsigned long long&) { + if (elements.size() != 1) + throw std::bad_cast(); + + return std::stoull(elements[0]); + } + static long parse(const std::vector& elements, const long&) { if (elements.size() != 1) throw std::bad_cast(); diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index ee9e037fe..9a3dadfe6 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -43,50 +43,138 @@ if (NOT Git_FOUND) message(FATAL_ERROR "Please ensure Git is installed on the system") endif() +if(USE_HIP_CPU) + find_package(Threads REQUIRED) + + set(CMAKE_REQUIRED_FLAGS "-std=c++17") + include(CheckCXXSymbolExists) + check_cxx_symbol_exists(__GLIBCXX__ "cstddef" STL_IS_GLIBCXX) + set(STL_DEPENDS_ON_TBB ${STL_IS_GLIBCXX}) + if(STL_DEPENDS_ON_TBB) + if(NOT DEPENDENCIES_FORCE_DOWNLOAD) + # TBB (https://github.com/oneapi-src/oneTBB) + find_package(TBB QUIET) + endif() + + if(NOT TBB_FOUND) + message(STATUS "TBB not found or force download TBB on. 
Downloading and building TBB.") + if(CMAKE_CONFIGURATION_TYPES) + message(FATAL_ERROR "DownloadProject.cmake doesn't support multi-configuration generators.") + endif() + set(TBB_ROOT ${CMAKE_CURRENT_BINARY_DIR}/deps/tbb CACHE PATH "") + download_project( + PROJ tbb + GIT_REPOSITORY https://github.com/oneapi-src/oneTBB.git + GIT_TAG v2020.3 + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + UPDATE_DISCONNECTED TRUE # Never update automatically from the remote repository + ) + #ExternalProject_Get_Property(tbb SOURCE_DIR) + set(TBB_SOURCE_DIR "${CMAKE_BINARY_DIR}/tbb-src") + list(APPEND CMAKE_MODULE_PATH "${TBB_SOURCE_DIR}/cmake") + include(TBBBuild) + tbb_build(TBB_ROOT "${TBB_SOURCE_DIR}" CONFIG_DIR TBB_CONFIG_DIR MAKE_ARGS tbb_build_dir=${TBB_ROOT}) + endif() + find_package(TBB REQUIRED CONFIG PATHS ${TBB_CONFIG_DIR} NO_DEFAULT_PATH) + endif(STL_DEPENDS_ON_TBB) + + if(NOT DEPENDENCIES_FORCE_DOWNLOAD) + # HIP CPU Runtime (https://github.com/ROCm-Developer-Tools/HIP-CPU) + find_package(hip_cpu_rt QUIET) + endif() + + if(NOT hip_cpu_rt_FOUND) + message(STATUS "Downloading and building HIP CPU Runtime.") + set(HIP_CPU_ROOT "${CMAKE_CURRENT_BINARY_DIR}/deps/hip-cpu" CACHE PATH "") + download_project( + PROJ hip-cpu + GIT_REPOSITORY https://github.com/ROCm-Developer-Tools/HIP-CPU.git + GIT_TAG master + INSTALL_DIR "${HIP_CPU_ROOT}" + CMAKE_ARGS -Dhip_cpu_rt_BUILD_EXAMPLES=OFF -Dhip_cpu_rt_BUILD_TESTING=OFF -DCMAKE_PREFIX_PATH=${TBB_CONFIG_DIR} -DCMAKE_INSTALL_PREFIX= + LOG_DOWNLOAD TRUE + LOG_CONFIGURE TRUE + LOG_BUILD TRUE + LOG_INSTALL TRUE + BUILD_PROJECT TRUE + UPDATE_DISCONNECTED TRUE # Never update automatically from the remote repository + ) + endif() + find_package(hip_cpu_rt REQUIRED CONFIG PATHS ${HIP_CPU_ROOT}) +endif() + # Test dependencies if(BUILD_TEST) - # Google Test (https://github.com/google/googletest) - message(STATUS "Downloading and building GTest.") - set(GTEST_ROOT ${CMAKE_CURRENT_BINARY_DIR}/gtest CACHE PATH "") - download_project( - PROJ googletest - GIT_REPOSITORY https://github.com/google/googletest.git - GIT_TAG release-1.10.0 - INSTALL_DIR ${GTEST_ROOT} - CMAKE_ARGS -DBUILD_GTEST=ON -DINSTALL_GTEST=ON -Dgtest_force_shared_crt=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_INSTALL_PREFIX= - LOG_DOWNLOAD TRUE - LOG_CONFIGURE TRUE - LOG_BUILD TRUE - LOG_INSTALL TRUE - BUILD_PROJECT TRUE - ${UPDATE_DISCONNECTED_IF_AVAILABLE} - ) - find_package(GTest REQUIRED) + # NOTE: Google Test has created a mess with legacy FindGTest.cmake and newer GTestConfig.cmake + # + # FindGTest.cmake defines: GTest::GTest, GTest::Main, GTEST_FOUND + # + # GTestConfig.cmake defines: GTest::gtest, GTest::gtest_main, GTest::gmock, GTest::gmock_main + # + # NOTE2: Finding GTest in MODULE mode, one cannot invoke find_package in CONFIG mode, because targets + # will be duplicately defined. + if(NOT DEPENDENCIES_FORCE_DOWNLOAD) + # Google Test (https://github.com/google/googletest) + find_package(GTest QUIET) + endif() + + if(NOT TARGET GTest::GTest AND NOT TARGET GTest::gtest) + message(STATUS "GTest not found or force download GTest on. 
Downloading and building GTest.") + if(CMAKE_CONFIGURATION_TYPES) + message(FATAL_ERROR "DownloadProject.cmake doesn't support multi-configuration generators.") + endif() + set(GTEST_ROOT ${CMAKE_CURRENT_BINARY_DIR}/deps/gtest CACHE PATH "") + download_project( + PROJ googletest + GIT_REPOSITORY https://github.com/google/googletest.git + GIT_TAG release-1.10.0 + INSTALL_DIR ${GTEST_ROOT} + CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS} -DBUILD_GTEST=ON -DINSTALL_GTEST=ON -Dgtest_force_shared_crt=ON -DCMAKE_INSTALL_PREFIX= + LOG_DOWNLOAD TRUE + LOG_CONFIGURE TRUE + LOG_BUILD TRUE + LOG_INSTALL TRUE + BUILD_PROJECT TRUE + UPDATE_DISCONNECTED TRUE # Never update automatically from the remote repository + ) + find_package(GTest CONFIG REQUIRED PATHS ${GTEST_ROOT}) + endif() endif() # Benchmark dependencies if(BUILD_BENCHMARK) - # Google Benchmark (https://github.com/google/benchmark.git) - message(STATUS "Downloading and building Google Benchmark.") - if(CMAKE_CXX_COMPILER MATCHES ".*/hipcc$") - # hip-clang cannot compile googlebenchmark for some reason - set(COMPILER_OVERRIDE "-DCMAKE_CXX_COMPILER=g++") + if(NOT DEPENDENCIES_FORCE_DOWNLOAD) + # Google Benchmark (https://github.com/google/benchmark.git) + find_package(benchmark QUIET) + endif() + + if(NOT benchmark_FOUND) + message(STATUS "Google Benchmark not found or force download Google Benchmark on. Downloading and building Google Benchmark.") + if(CMAKE_CONFIGURATION_TYPES) + message(FATAL_ERROR "DownloadProject.cmake doesn't support multi-configuration generators.") + endif() + set(GOOGLEBENCHMARK_ROOT ${CMAKE_CURRENT_BINARY_DIR}/deps/googlebenchmark CACHE PATH "") + if(CMAKE_CXX_COMPILER MATCHES ".*/hipcc$") + # hip-clang cannot compile googlebenchmark for some reason + set(COMPILER_OVERRIDE "-DCMAKE_CXX_COMPILER=g++") + endif() + + download_project( + PROJ googlebenchmark + GIT_REPOSITORY https://github.com/google/benchmark.git + GIT_TAG v1.4.0 + INSTALL_DIR ${GOOGLEBENCHMARK_ROOT} + CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS} -DBENCHMARK_ENABLE_TESTING=OFF -DCMAKE_INSTALL_PREFIX= ${COMPILER_OVERRIDE} + LOG_DOWNLOAD TRUE + LOG_CONFIGURE TRUE + LOG_BUILD TRUE + LOG_INSTALL TRUE + BUILD_PROJECT TRUE + ${UPDATE_DISCONNECTED_IF_AVAILABLE} + ) endif() - # Download, build and install googlebenchmark library - set(GOOGLEBENCHMARK_ROOT ${CMAKE_CURRENT_BINARY_DIR}/googlebenchmark CACHE PATH "") - download_project( - PROJ googlebenchmark - GIT_REPOSITORY https://github.com/google/benchmark.git - GIT_TAG v1.4.0 - INSTALL_DIR ${GOOGLEBENCHMARK_ROOT} - CMAKE_ARGS -DCMAKE_BUILD_TYPE=RELEASE -DBENCHMARK_ENABLE_TESTING=OFF -DBUILD_SHARED_LIBS=ON -DCMAKE_INSTALL_PREFIX= ${COMPILER_OVERRIDE} - LOG_DOWNLOAD TRUE - LOG_CONFIGURE TRUE - LOG_BUILD TRUE - LOG_INSTALL TRUE - BUILD_PROJECT TRUE - ${UPDATE_DISCONNECTED_IF_AVAILABLE} - ) find_package(benchmark REQUIRED CONFIG PATHS ${GOOGLEBENCHMARK_ROOT}) endif() diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake index 19844cbb7..9d7e9accf 100644 --- a/cmake/Summary.cmake +++ b/cmake/Summary.cmake @@ -25,14 +25,18 @@ function(print_configuration_summary) message(STATUS "******** Summary ********") message(STATUS "General:") message(STATUS " System : ${CMAKE_SYSTEM_NAME}") - message(STATUS " HIP ROOT : ${HIP_ROOT_DIR}") + if(NOT USE_HIP_CPU) + message(STATUS " HIP ROOT : ${HIP_ROOT_DIR}") + endif() message(STATUS " C++ compiler : ${CMAKE_CXX_COMPILER}") message(STATUS " C++ compiler version : 
${CMAKE_CXX_COMPILER_VERSION}") string(STRIP "${CMAKE_CXX_FLAGS}" CMAKE_CXX_FLAGS_STRIP) message(STATUS " CXX flags : ${CMAKE_CXX_FLAGS_STRIP}") message(STATUS " Build type : ${CMAKE_BUILD_TYPE}") message(STATUS " Install prefix : ${CMAKE_INSTALL_PREFIX}") - message(STATUS " Device targets : ${AMDGPU_TARGETS}") + if(NOT USE_HIP_CPU) + message(STATUS " Device targets : ${AMDGPU_TARGETS}") + endif() message(STATUS "") message(STATUS " DISABLE_WERROR : ${DISABLE_WERROR}") message(STATUS " ONLY_INSTALL : ${ONLY_INSTALL}") @@ -40,4 +44,5 @@ function(print_configuration_summary) message(STATUS " BUILD_BENCHMARK : ${BUILD_BENCHMARK}") message(STATUS " BUILD_EXAMPLE : ${BUILD_EXAMPLE}") message(STATUS " BUILD_ADDRESS_SANITIZER : ${BUILD_ADDRESS_SANITIZER}") + message(STATUS " USE_HIP_CPU : ${USE_HIP_CPU}") endfunction() diff --git a/example/example_temporary_storage.cpp b/example/example_temporary_storage.cpp index 3cf61a70a..9c523dbec 100644 --- a/example/example_temporary_storage.cpp +++ b/example/example_temporary_storage.cpp @@ -37,7 +37,7 @@ __launch_bounds__(BlockSize) void example_shared_memory(const T *input, T *output) { // Indexing for this block - unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; + unsigned int index = (blockIdx.x * BlockSize) + threadIdx.x; // Allocating storage in shared memory for the block using block_scan_type = rocprim::block_scan; @@ -137,7 +137,7 @@ void example_union_storage_types(const T *input, T *output) } storage; constexpr int items_per_block = BlockSize * ItemsPerThread; - int block_offset = (hipBlockIdx_x * items_per_block); + int block_offset = (blockIdx.x * items_per_block); // Input/output array for block scan primitive T values[ItemsPerThread]; @@ -226,7 +226,7 @@ __launch_bounds__(BlockSize) void example_dynamic_shared_memory(const T *input, T *output) { // Indexing for this block - unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; + unsigned int index = (blockIdx.x * BlockSize) + threadIdx.x; // Initialize primitives using block_scan_type = rocprim::block_scan; @@ -310,7 +310,7 @@ void example_global_memory_storage( typename rocprim::block_scan::storage_type *global_storage) { // Indexing for this block - unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; + unsigned int index = (blockIdx.x * BlockSize) + threadIdx.x; // specialize block_scan for type T and block of 256 threads using block_scan_type = rocprim::block_scan; // Variables required for performing a scan @@ -322,7 +322,7 @@ void example_global_memory_storage( block_scan_type() .inclusive_scan( input_value, output_value, - global_storage[hipBlockIdx_x], + global_storage[blockIdx.x], rocprim::plus() ); diff --git a/rocprim/include/rocprim/block/block_histogram.hpp b/rocprim/include/rocprim/block/block_histogram.hpp index 3d7df08bb..4684967a4 100644 --- a/rocprim/include/rocprim/block/block_histogram.hpp +++ b/rocprim/include/rocprim/block/block_histogram.hpp @@ -161,7 +161,7 @@ class block_histogram { const auto flat_tid = ::rocprim::flat_block_thread_id(); - #pragma unroll + ROCPRIM_UNROLL for(unsigned int offset = 0; offset < Bins; offset += BlockSize) { const unsigned int offset_tid = offset + flat_tid; diff --git a/rocprim/include/rocprim/block/block_load.hpp b/rocprim/include/rocprim/block/block_load.hpp index 856b05892..d560a9ee8 100644 --- a/rocprim/include/rocprim/block/block_load.hpp +++ b/rocprim/include/rocprim/block/block_load.hpp @@ -110,7 +110,7 @@ enum class block_load_method /// \code{.cpp} /// __global__ void 
example_kernel(int * input, ...) /// { -/// const int offset = hipBlockIdx_x * 128 * 8; +/// const int offset = blockIdx.x * 128 * 8; /// int items[8]; /// rocprim::block_load blockload; /// blockload.load(input + offset, items); @@ -403,10 +403,10 @@ class block_load(); - block_load_direct_blocked_vectorized(flat_id, block_input, items); + block_load_direct_blocked_vectorized(flat_id, block_input, _items); } template diff --git a/rocprim/include/rocprim/block/block_load_func.hpp b/rocprim/include/rocprim/block/block_load_func.hpp index bf87c16b8..2a50d073f 100644 --- a/rocprim/include/rocprim/block/block_load_func.hpp +++ b/rocprim/include/rocprim/block/block_load_func.hpp @@ -61,7 +61,7 @@ void block_load_direct_blocked(unsigned int flat_id, { unsigned int offset = flat_id * ItemsPerThread; InputIterator thread_iter = block_input + offset; - #pragma unroll + ROCPRIM_UNROLL for (unsigned int item = 0; item < ItemsPerThread; item++) { items[item] = thread_iter[item]; @@ -98,7 +98,7 @@ void block_load_direct_blocked(unsigned int flat_id, { unsigned int offset = flat_id * ItemsPerThread; InputIterator thread_iter = block_input + offset; - #pragma unroll + ROCPRIM_UNROLL for (unsigned int item = 0; item < ItemsPerThread; item++) { if (item + offset < valid) @@ -141,11 +141,12 @@ void block_load_direct_blocked(unsigned int flat_id, unsigned int valid, Default out_of_bounds) { - #pragma unroll + ROCPRIM_UNROLL for (unsigned int item = 0; item < ItemsPerThread; item++) { - items[item] = out_of_bounds; + items[item] = static_cast(out_of_bounds); } + // TODO: Consider using std::fill for HIP-CPU, as uses memset() where appropriate block_load_direct_blocked(flat_id, block_input, items, valid); } @@ -181,10 +182,10 @@ template< unsigned int ItemsPerThread > ROCPRIM_DEVICE inline -typename std::enable_if()>::type +auto block_load_direct_blocked_vectorized(unsigned int flat_id, T* block_input, - U (&items)[ItemsPerThread]) + U (&items)[ItemsPerThread]) -> typename std::enable_if::value>::type { typedef typename detail::match_vector_type::type vector_type; constexpr unsigned int vectors_per_thread = (sizeof(T) * ItemsPerThread) / sizeof(vector_type); @@ -193,13 +194,13 @@ block_load_direct_blocked_vectorized(unsigned int flat_id, const vector_type* vector_ptr = reinterpret_cast(block_input) + (flat_id * vectors_per_thread); - #pragma unroll + ROCPRIM_UNROLL for (unsigned int item = 0; item < vectors_per_thread; item++) { vector_items[item] = *(vector_ptr + item); } - #pragma unroll + ROCPRIM_UNROLL for (unsigned int item = 0; item < ItemsPerThread; item++) { items[item] = *(reinterpret_cast(vector_items) + item); @@ -212,10 +213,10 @@ template< unsigned int ItemsPerThread > ROCPRIM_DEVICE inline -typename std::enable_if()>::type +auto block_load_direct_blocked_vectorized(unsigned int flat_id, T* block_input, - U (&items)[ItemsPerThread]) + U (&items)[ItemsPerThread]) -> typename std::enable_if::value>::type { block_load_direct_blocked(flat_id, block_input, items); } @@ -249,7 +250,7 @@ void block_load_direct_striped(unsigned int flat_id, T (&items)[ItemsPerThread]) { InputIterator thread_iter = block_input + flat_id; - #pragma unroll + ROCPRIM_UNROLL for (unsigned int item = 0; item < ItemsPerThread; item++) { items[item] = thread_iter[item * BlockSize]; @@ -287,7 +288,7 @@ void block_load_direct_striped(unsigned int flat_id, unsigned int valid) { InputIterator thread_iter = block_input + flat_id; - #pragma unroll + ROCPRIM_UNROLL for (unsigned int item = 0; item < ItemsPerThread; item++) { unsigned 
int offset = item * BlockSize; @@ -333,7 +334,7 @@ void block_load_direct_striped(unsigned int flat_id, unsigned int valid, Default out_of_bounds) { - #pragma unroll + ROCPRIM_UNROLL for (unsigned int item = 0; item < ItemsPerThread; item++) { items[item] = out_of_bounds; @@ -385,7 +386,7 @@ void block_load_direct_warp_striped(unsigned int flat_id, unsigned int warp_offset = warp_id * WarpSize * ItemsPerThread; InputIterator thread_iter = block_input + thread_id + warp_offset; - #pragma unroll + ROCPRIM_UNROLL for (unsigned int item = 0; item < ItemsPerThread; item++) { items[item] = thread_iter[item * WarpSize]; @@ -437,7 +438,7 @@ void block_load_direct_warp_striped(unsigned int flat_id, unsigned int warp_offset = warp_id * WarpSize * ItemsPerThread; InputIterator thread_iter = block_input + thread_id + warp_offset; - #pragma unroll + ROCPRIM_UNROLL for (unsigned int item = 0; item < ItemsPerThread; item++) { unsigned int offset = item * WarpSize; @@ -493,7 +494,7 @@ void block_load_direct_warp_striped(unsigned int flat_id, static_assert(detail::is_power_of_two(WarpSize) && WarpSize <= device_warp_size(), "WarpSize must be a power of two and equal or less" "than the size of hardware warp."); - #pragma unroll + ROCPRIM_UNROLL for (unsigned int item = 0; item < ItemsPerThread; item++) { items[item] = out_of_bounds; diff --git a/rocprim/include/rocprim/block/block_radix_sort.hpp b/rocprim/include/rocprim/block/block_radix_sort.hpp index 9b04860e3..40c2ffeba 100644 --- a/rocprim/include/rocprim/block/block_radix_sort.hpp +++ b/rocprim/include/rocprim/block/block_radix_sort.hpp @@ -109,6 +109,14 @@ class block_bit_plus_scan warp_scan_prefix_type().inclusive_scan(prefix, prefix, ::rocprim::plus()); storage_.warp_prefixes[flat_id] = prefix; } +#ifdef __HIP_CPU_RT__ + else + { + // HIP-CPU doesn't implement lockstep behavior, so the divergent branch needs to invoke the same number of sync ops. + empty_type empty; + ::rocprim::detail::warp_scan_crosslane().inclusive_scan(empty, empty, empty_binary_op{}); + } +#endif ::rocprim::syncthreads(); // Perform exclusive warp scan of bit values @@ -207,7 +215,7 @@ class block_radix_sort typename bit_keys_exchange_type::storage_type bit_keys_exchange; typename values_exchange_type::storage_type values_exchange; }; - typename bit_block_scan::storage_type bit_block_scan; + typename block_radix_sort::bit_block_scan::storage_type bit_block_scan; }; public: @@ -893,6 +901,11 @@ class block_radix_sort } unsigned int ranks[ItemsPerThread]; +#ifdef __HIP_CPU_RT__ + // TODO: Check if really necessary + // Initialize contents, as non-hipcc compilers don't unconditionally zero out allocated memory + std::memset(ranks, 0, ItemsPerThread * sizeof(decltype(ranks[0]))); +#endif unsigned int count; bit_block_scan().exclusive_scan(bits, ranks, count, storage_.bit_block_scan); diff --git a/rocprim/include/rocprim/block/block_reduce.hpp b/rocprim/include/rocprim/block/block_reduce.hpp index df9ef6fb9..dc6d3cc67 100644 --- a/rocprim/include/rocprim/block/block_reduce.hpp +++ b/rocprim/include/rocprim/block/block_reduce.hpp @@ -176,7 +176,7 @@ class block_reduce /// each provides one \p float value. /// /// \code{.cpp} - /// __global__ void example_kernel(...) // hipBlockDim_x = 256 + /// __global__ void example_kernel(...) // blockDim.x = 256 /// { /// // specialize block_reduce for float and block of 256 threads /// using block_reduce_f = rocprim::block_reduce<float, 256>; @@ -257,7 +257,7 @@ class block_reduce /// each provides two \p long value. 
/// /// \code{.cpp} - /// __global__ void example_kernel(...) // hipBlockDim_x = 128 + /// __global__ void example_kernel(...) // blockDim.x = 128 /// { /// // specialize block_reduce for long and block of 128 threads /// using block_reduce_f = rocprim::block_reduce; @@ -345,7 +345,7 @@ class block_reduce /// each provides one \p float value. /// /// \code{.cpp} - /// __global__ void example_kernel(...) // hipBlockDim_x = 256 + /// __global__ void example_kernel(...) // blockDim.x = 256 /// { /// // specialize block_reduce for float and block of 256 threads /// using block_reduce_f = rocprim::block_reduce; diff --git a/rocprim/include/rocprim/block/block_scan.hpp b/rocprim/include/rocprim/block/block_scan.hpp index b30a3bdc6..ac11c45f1 100644 --- a/rocprim/include/rocprim/block/block_scan.hpp +++ b/rocprim/include/rocprim/block/block_scan.hpp @@ -171,7 +171,7 @@ class block_scan /// each provides one \p float value. /// /// \code{.cpp} - /// __global__ void example_kernel(...) // hipBlockDim_x = 256 + /// __global__ void example_kernel(...) // blockDim.x = 256 /// { /// // specialize block_scan for float and block of 256 threads /// using block_scan_f = rocprim::block_scan; @@ -252,7 +252,7 @@ class block_scan /// each provides one \p float value. /// /// \code{.cpp} - /// __global__ void example_kernel(...) // hipBlockDim_x = 256 + /// __global__ void example_kernel(...) // blockDim.x = 256 /// { /// // specialize block_scan for float and block of 256 threads /// using block_scan_f = rocprim::block_scan; @@ -363,7 +363,7 @@ class block_scan /// } /// }; /// - /// __global__ void example_kernel(...) // hipBlockDim_x = 256 + /// __global__ void example_kernel(...) // blockDim.x = 256 /// { /// // specialize block_scan for int and block of 256 threads /// using block_scan_f = rocprim::block_scan; @@ -429,7 +429,7 @@ class block_scan /// each provides two \p long value. /// /// \code{.cpp} - /// __global__ void example_kernel(...) // hipBlockDim_x = 128 + /// __global__ void example_kernel(...) // blockDim.x = 128 /// { /// // specialize block_scan for long and block of 128 threads /// using block_scan_f = rocprim::block_scan; @@ -532,7 +532,7 @@ class block_scan /// each provides two \p long value. /// /// \code{.cpp} - /// __global__ void example_kernel(...) // hipBlockDim_x = 128 + /// __global__ void example_kernel(...) // blockDim.x = 128 /// { /// // specialize block_scan for long and block of 128 threads /// using block_scan_f = rocprim::block_scan; @@ -664,7 +664,7 @@ class block_scan /// } /// }; /// - /// __global__ void example_kernel(...) // hipBlockDim_x = 128 + /// __global__ void example_kernel(...) // blockDim.x = 128 /// { /// // specialize block_scan for int and block of 128 threads /// using block_scan_f = rocprim::block_scan; @@ -739,7 +739,7 @@ class block_scan /// each provides one \p float value. /// /// \code{.cpp} - /// __global__ void example_kernel(...) // hipBlockDim_x = 256 + /// __global__ void example_kernel(...) // blockDim.x = 256 /// { /// // specialize block_scan for float and block of 256 threads /// using block_scan_f = rocprim::block_scan; @@ -828,7 +828,7 @@ class block_scan /// each provides one \p float value. /// /// \code{.cpp} - /// __global__ void example_kernel(...) // hipBlockDim_x = 256 + /// __global__ void example_kernel(...) 
// blockDim.x = 256 /// { /// // specialize block_scan for float and block of 256 threads /// using block_scan_f = rocprim::block_scan; @@ -945,7 +945,7 @@ class block_scan /// } /// }; /// - /// __global__ void example_kernel(...) // hipBlockDim_x = 256 + /// __global__ void example_kernel(...) // blockDim.x = 256 /// { /// // specialize block_scan for int and block of 256 threads /// using block_scan_f = rocprim::block_scan; @@ -1013,7 +1013,7 @@ class block_scan /// each provides two \p long value. /// /// \code{.cpp} - /// __global__ void example_kernel(...) // hipBlockDim_x = 128 + /// __global__ void example_kernel(...) // blockDim.x = 128 /// { /// // specialize block_scan for long and block of 128 threads /// using block_scan_f = rocprim::block_scan; @@ -1124,7 +1124,7 @@ class block_scan /// each provides two \p long value. /// /// \code{.cpp} - /// __global__ void example_kernel(...) // hipBlockDim_x = 128 + /// __global__ void example_kernel(...) // blockDim.x = 128 /// { /// // specialize block_scan for long and block of 128 threads /// using block_scan_f = rocprim::block_scan; @@ -1263,7 +1263,7 @@ class block_scan /// } /// }; /// - /// __global__ void example_kernel(...) // hipBlockDim_x = 128 + /// __global__ void example_kernel(...) // blockDim.x = 128 /// { /// // specialize block_scan for int and block of 128 threads /// using block_scan_f = rocprim::block_scan; diff --git a/rocprim/include/rocprim/block/block_shuffle.hpp b/rocprim/include/rocprim/block/block_shuffle.hpp index 979a691f9..b9122c165 100644 --- a/rocprim/include/rocprim/block/block_shuffle.hpp +++ b/rocprim/include/rocprim/block/block_shuffle.hpp @@ -309,7 +309,7 @@ class block_shuffle ::rocprim::syncthreads(); - #pragma unroll + ROCPRIM_UNROLL for (unsigned int i = ItemsPerThread - 1; i > 0; --i) { prev[i] = input[i - 1]; @@ -424,7 +424,7 @@ class block_shuffle ::rocprim::syncthreads(); - #pragma unroll + ROCPRIM_UNROLL for (unsigned int i = 0; i < (ItemsPerThread - 1); ++i) { next[i] = input[i + 1]; diff --git a/rocprim/include/rocprim/block/block_store.hpp b/rocprim/include/rocprim/block/block_store.hpp index 980171dfd..a449a6392 100644 --- a/rocprim/include/rocprim/block/block_store.hpp +++ b/rocprim/include/rocprim/block/block_store.hpp @@ -110,7 +110,7 @@ enum class block_store_method /// \code{.cpp} /// __global__ void kernel(int * output) /// { -/// const int offset = hipBlockIdx_x * 128 * 8; +/// const int offset = blockIdx.x * 128 * 8; /// int items[8]; /// rocprim::block_store blockstore; /// blockstore.store(output + offset, items); @@ -300,10 +300,10 @@ class block_store(); - block_store_direct_blocked_vectorized(flat_id, block_output, items); + block_store_direct_blocked_vectorized(flat_id, block_output, _items); } template diff --git a/rocprim/include/rocprim/block/block_store_func.hpp b/rocprim/include/rocprim/block/block_store_func.hpp index 30eece97b..ff26aaabf 100644 --- a/rocprim/include/rocprim/block/block_store_func.hpp +++ b/rocprim/include/rocprim/block/block_store_func.hpp @@ -65,7 +65,7 @@ void block_store_direct_blocked(unsigned int flat_id, unsigned int offset = flat_id * ItemsPerThread; OutputIterator thread_iter = block_output + offset; - #pragma unroll + ROCPRIM_UNROLL for (unsigned int item = 0; item < ItemsPerThread; item++) { thread_iter[item] = items[item]; @@ -106,7 +106,7 @@ void block_store_direct_blocked(unsigned int flat_id, unsigned int offset = flat_id * ItemsPerThread; OutputIterator thread_iter = block_output + offset; - #pragma unroll + ROCPRIM_UNROLL for 
(unsigned int item = 0; item < ItemsPerThread; item++) { if (item + offset < valid) @@ -147,10 +147,10 @@ template< unsigned int ItemsPerThread > ROCPRIM_DEVICE inline -typename std::enable_if()>::type +auto block_store_direct_blocked_vectorized(unsigned int flat_id, T* block_output, - U (&items)[ItemsPerThread]) + U (&items)[ItemsPerThread]) -> typename std::enable_if::value>::type { static_assert(std::is_convertible::value, "The type U must be such that it can be implicitly converted to T."); @@ -162,7 +162,7 @@ block_store_direct_blocked_vectorized(unsigned int flat_id, vector_type raw_vector_items[vectors_per_thread]; T *raw_items = reinterpret_cast(raw_vector_items); - #pragma unroll + ROCPRIM_UNROLL for (unsigned int item = 0; item < ItemsPerThread; item++) { raw_items[item] = items[item]; @@ -177,10 +177,10 @@ template< unsigned int ItemsPerThread > ROCPRIM_DEVICE inline -typename std::enable_if()>::type +auto block_store_direct_blocked_vectorized(unsigned int flat_id, T* block_output, - U (&items)[ItemsPerThread]) + U (&items)[ItemsPerThread]) -> typename std::enable_if::value>::type { block_store_direct_blocked(flat_id, block_output, items); } @@ -218,7 +218,7 @@ void block_store_direct_striped(unsigned int flat_id, "can be dereferenced and assigned a value of type T."); OutputIterator thread_iter = block_output + flat_id; - #pragma unroll + ROCPRIM_UNROLL for (unsigned int item = 0; item < ItemsPerThread; item++) { thread_iter[item * BlockSize] = items[item]; @@ -260,7 +260,7 @@ void block_store_direct_striped(unsigned int flat_id, "can be dereferenced and assigned a value of type T."); OutputIterator thread_iter = block_output + flat_id; - #pragma unroll + ROCPRIM_UNROLL for (unsigned int item = 0; item < ItemsPerThread; item++) { unsigned int offset = item * BlockSize; @@ -318,7 +318,7 @@ void block_store_direct_warp_striped(unsigned int flat_id, unsigned int warp_offset = warp_id * WarpSize * ItemsPerThread; OutputIterator thread_iter = block_output + thread_id + warp_offset; - #pragma unroll + ROCPRIM_UNROLL for (unsigned int item = 0; item < ItemsPerThread; item++) { thread_iter[item * WarpSize] = items[item]; @@ -374,7 +374,7 @@ void block_store_direct_warp_striped(unsigned int flat_id, unsigned int warp_offset = warp_id * WarpSize * ItemsPerThread; OutputIterator thread_iter = block_output + thread_id + warp_offset; - #pragma unroll + ROCPRIM_UNROLL for (unsigned int item = 0; item < ItemsPerThread; item++) { unsigned int offset = item * WarpSize; diff --git a/rocprim/include/rocprim/block/detail/block_histogram_atomic.hpp b/rocprim/include/rocprim/block/detail/block_histogram_atomic.hpp index b0d8a4c76..3a76def6f 100644 --- a/rocprim/include/rocprim/block/detail/block_histogram_atomic.hpp +++ b/rocprim/include/rocprim/block/detail/block_histogram_atomic.hpp @@ -63,7 +63,7 @@ class block_histogram_atomic std::is_same::value || std::is_same::value, "Counter must be type that is supported by atomics (float, int, unsigned int, unsigned long long)" ); - #pragma unroll + ROCPRIM_UNROLL for (unsigned int i = 0; i < ItemsPerThread; ++i) { ::rocprim::detail::atomic_add(&hist[static_cast(input[i])], Counter(1)); diff --git a/rocprim/include/rocprim/block/detail/block_histogram_sort.hpp b/rocprim/include/rocprim/block/detail/block_histogram_sort.hpp index 56fae48d0..313b2c99a 100644 --- a/rocprim/include/rocprim/block/detail/block_histogram_sort.hpp +++ b/rocprim/include/rocprim/block/detail/block_histogram_sort.hpp @@ -86,10 +86,11 @@ class block_histogram_sort Counter hist[Bins], 
storage_type& storage) { - static_assert( - std::is_convertible<unsigned int, Counter>::value, - "unsigned int must be convertible to Counter" - ); + // TODO: Re-check; MSVC rejects this static assertion even though the code compiles fine for all tested types. The predicate is likely too strict. + //static_assert( + // std::is_convertible<unsigned int, Counter>::value, + // "unsigned int must be convertible to Counter" + //); constexpr auto tile_size = BlockSize * ItemsPerThread; const auto flat_tid = ::rocprim::flat_block_thread_id(); unsigned int head_flags[ItemsPerThread]; @@ -99,7 +100,7 @@ class block_histogram_sort radix_sort().sort(input, storage_.sort); ::rocprim::syncthreads(); // Fix race condition that appeared on Vega10 hardware; storage LDS is reused below. - #pragma unroll + ROCPRIM_UNROLL for(unsigned int offset = 0; offset < Bins; offset += BlockSize) { const unsigned int offset_tid = offset + flat_tid; @@ -121,7 +122,7 @@ class block_histogram_sort } ::rocprim::syncthreads(); - #pragma unroll + ROCPRIM_UNROLL for(unsigned int offset = 0; offset < Bins; offset += BlockSize) { const unsigned int offset_tid = offset + flat_tid; diff --git a/rocprim/include/rocprim/block/detail/block_reduce_raking_reduce.hpp b/rocprim/include/rocprim/block/detail/block_reduce_raking_reduce.hpp index 82176694e..505649c70 100644 --- a/rocprim/include/rocprim/block/detail/block_reduce_raking_reduce.hpp +++ b/rocprim/include/rocprim/block/detail/block_reduce_raking_reduce.hpp @@ -117,7 +117,7 @@ class block_reduce_raking_reduce { // Reduce thread items T thread_input = input[0]; - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = 1; i < ItemsPerThread; i++) { thread_input = reduce_op(thread_input, input[i]); diff --git a/rocprim/include/rocprim/block/detail/block_reduce_warp_reduce.hpp b/rocprim/include/rocprim/block/detail/block_reduce_warp_reduce.hpp index c020c1b3e..d8485a855 100644 --- a/rocprim/include/rocprim/block/detail/block_reduce_warp_reduce.hpp +++ b/rocprim/include/rocprim/block/detail/block_reduce_warp_reduce.hpp @@ -106,7 +106,7 @@ class block_reduce_warp_reduce { // Reduce thread items T thread_input = input[0]; - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = 1; i < ItemsPerThread; i++) { thread_input = reduce_op(thread_input, input[i]); diff --git a/rocprim/include/rocprim/block/detail/block_scan_reduce_then_scan.hpp b/rocprim/include/rocprim/block/detail/block_scan_reduce_then_scan.hpp index fbbe29c59..b8e2e17d5 100644 --- a/rocprim/include/rocprim/block/detail/block_scan_reduce_then_scan.hpp +++ b/rocprim/include/rocprim/block/detail/block_scan_reduce_then_scan.hpp @@ -145,7 +145,7 @@ class block_scan_reduce_then_scan { // Reduce thread items T thread_input = input[0]; - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = 1; i < ItemsPerThread; i++) { thread_input = scan_op(thread_input, input[i]); @@ -164,7 +164,7 @@ class block_scan_reduce_then_scan output[0] = input[0]; if(flat_tid != 0) output[0] = scan_op(thread_input, input[0]); // Final thread-local scan - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = 1; i < ItemsPerThread; i++) { output[i] = scan_op(output[i-1], input[i]); @@ -221,7 +221,7 @@ class block_scan_reduce_then_scan storage_type_& storage_ = storage.get(); // Reduce thread items T thread_input = input[0]; - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = 1; i < ItemsPerThread; i++) { thread_input = scan_op(thread_input, input[i]); @@ -249,7 +249,7 @@ class block_scan_reduce_then_scan // Include block prefix output[0] = scan_op(block_prefix, output[0]); // Final thread-local scan - #pragma unroll + 
ROCPRIM_UNROLL for(unsigned int i = 1; i < ItemsPerThread; i++) { output[i] = scan_op(output[i-1], input[i]); @@ -344,7 +344,7 @@ class block_scan_reduce_then_scan { // Reduce thread items T thread_input = input[0]; - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = 1; i < ItemsPerThread; i++) { thread_input = scan_op(thread_input, input[i]); @@ -368,7 +368,7 @@ class block_scan_reduce_then_scan exclusive = thread_input; } output[0] = exclusive; - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = 1; i < ItemsPerThread; i++) { exclusive = scan_op(exclusive, prev); @@ -430,7 +430,7 @@ class block_scan_reduce_then_scan storage_type_& storage_ = storage.get(); // Reduce thread items T thread_input = input[0]; - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = 1; i < ItemsPerThread; i++) { thread_input = scan_op(thread_input, input[i]); @@ -460,7 +460,7 @@ class block_scan_reduce_then_scan exclusive = scan_op(block_prefix, thread_input); } output[0] = exclusive; - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = 1; i < ItemsPerThread; i++) { exclusive = scan_op(exclusive, prev); @@ -507,7 +507,7 @@ class block_scan_reduce_then_scan const unsigned int idx_end = idx_start + thread_reduction_size_; T thread_reduction = storage_.threads[idx_start]; - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = idx_start + 1; i < idx_end; i++) { thread_reduction = scan_op( @@ -527,7 +527,7 @@ class block_scan_reduce_then_scan } storage_.threads[idx_start] = thread_reduction; - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = idx_start + 1; i < idx_end; i++) { thread_reduction = scan_op( diff --git a/rocprim/include/rocprim/block/detail/block_scan_warp_scan.hpp b/rocprim/include/rocprim/block/detail/block_scan_warp_scan.hpp index 84b8a6f2a..34762ee01 100644 --- a/rocprim/include/rocprim/block/detail/block_scan_warp_scan.hpp +++ b/rocprim/include/rocprim/block/detail/block_scan_warp_scan.hpp @@ -157,7 +157,7 @@ class block_scan_warp_scan { // Reduce thread items T thread_input = input[0]; - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = 1; i < ItemsPerThread; i++) { thread_input = scan_op(thread_input, input[i]); @@ -179,7 +179,7 @@ class block_scan_warp_scan output[0] = scan_op(thread_input, input[0]); } // Final thread-local scan - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = 1; i < ItemsPerThread; i++) { output[i] = scan_op(output[i-1], input[i]); @@ -236,7 +236,7 @@ class block_scan_warp_scan storage_type_& storage_ = storage.get(); // Reduce thread items T thread_input = input[0]; - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = 1; i < ItemsPerThread; i++) { thread_input = scan_op(thread_input, input[i]); @@ -267,7 +267,7 @@ class block_scan_warp_scan // Include block prefix output[0] = scan_op(block_prefix, output[0]); // Final thread-local scan - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = 1; i < ItemsPerThread; i++) { output[i] = scan_op(output[i-1], input[i]); @@ -366,7 +366,7 @@ class block_scan_warp_scan { // Reduce thread items T thread_input = input[0]; - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = 1; i < ItemsPerThread; i++) { thread_input = scan_op(thread_input, input[i]); @@ -391,7 +391,7 @@ class block_scan_warp_scan } output[0] = exclusive; - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = 1; i < ItemsPerThread; i++) { exclusive = scan_op(exclusive, prev); @@ -453,7 +453,7 @@ class block_scan_warp_scan storage_type_& storage_ = storage.get(); // Reduce thread items T thread_input = input[0]; - #pragma unroll + ROCPRIM_UNROLL 
for(unsigned int i = 1; i < ItemsPerThread; i++) { thread_input = scan_op(thread_input, input[i]); @@ -484,7 +484,7 @@ class block_scan_warp_scan } output[0] = exclusive; - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = 1; i < ItemsPerThread; i++) { exclusive = scan_op(exclusive, prev); diff --git a/rocprim/include/rocprim/block/detail/block_sort_bitonic.hpp b/rocprim/include/rocprim/block/detail/block_sort_bitonic.hpp index 16060a79d..84180e548 100644 --- a/rocprim/include/rocprim/block/detail/block_sort_bitonic.hpp +++ b/rocprim/include/rocprim/block/detail/block_sort_bitonic.hpp @@ -240,11 +240,11 @@ class block_sort_bitonic }; wsort.sort(kv..., compare_function2); - #pragma unroll + ROCPRIM_UNROLL for(unsigned int length = ::rocprim::device_warp_size(); length < Size; length *= 2) { bool dir = (flat_tid & (length * 2)) != 0; - #pragma unroll + ROCPRIM_UNROLL for(unsigned int k = length; k > 0; k /= 2) { copy_to_shared(kv..., flat_tid, storage); @@ -301,7 +301,7 @@ class block_sort_bitonic unsigned int odd_id = (is_even) ? ::rocprim::max(flat_tid, 1u) - 1 : ::rocprim::min(flat_tid + 1, Size - 1); unsigned int even_id = (is_even) ? ::rocprim::min(flat_tid + 1, Size - 1) : ::rocprim::max(flat_tid, 1u) - 1; - #pragma unroll + ROCPRIM_UNROLL for(unsigned int length = 0; length < Size; length++) { unsigned int next_id = (length % 2) == 0 ? even_id : odd_id; diff --git a/rocprim/include/rocprim/config.hpp b/rocprim/include/rocprim/config.hpp index 0a9f6197e..caa108a31 100644 --- a/rocprim/include/rocprim/config.hpp +++ b/rocprim/include/rocprim/config.hpp @@ -93,4 +93,12 @@ #define ROCPRIM_WARP_SIZE_64 64u #define ROCPRIM_MAX_WARP_SIZE ROCPRIM_WARP_SIZE_64 +#if (defined(_MSC_VER) && !defined(__clang__)) || defined(__GNUC__) +#define ROCPRIM_UNROLL +#define ROCPRIM_NO_UNROLL +#else +#define ROCPRIM_UNROLL _Pragma("unroll") +#define ROCPRIM_NO_UNROLL _Pragma("nounroll") +#endif + #endif // ROCPRIM_CONFIG_HPP_ diff --git a/rocprim/include/rocprim/detail/various.hpp b/rocprim/include/rocprim/detail/various.hpp index 613f57ea4..d98f653af 100644 --- a/rocprim/include/rocprim/detail/various.hpp +++ b/rocprim/include/rocprim/detail/various.hpp @@ -127,12 +127,7 @@ struct match_vector_type // Checks if Items is odd and ensures that size of T is smaller than vector_type. template -ROCPRIM_HOST_DEVICE -constexpr bool is_vectorizable() -{ - return (Items % 2 == 0) && - (sizeof(T) < sizeof(typename match_vector_type::type)); -} +struct is_vectorizable : std::integral_constant::type))> {}; // Returns the number of LDS (local data share) banks. 
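For context on the many "#pragma unroll" to ROCPRIM_UNROLL replacements throughout this patch: the macro added to rocprim/config.hpp above expands either to _Pragma("unroll") or to nothing, so the unroll hint is kept where the pragma is understood and silently dropped for MSVC and GCC builds (including the HIP-CPU path), instead of tripping unknown-pragma warnings under -Wall -Wextra -Werror. A minimal, self-contained sketch of how a caller sees it; the function below is illustrative only and not part of rocPRIM:

// Stand-in for the definitions in rocprim/config.hpp so this sketch compiles on its own.
#ifndef ROCPRIM_UNROLL
    #if (defined(_MSC_VER) && !defined(__clang__)) || defined(__GNUC__)
        #define ROCPRIM_UNROLL
    #else
        #define ROCPRIM_UNROLL _Pragma("unroll")
    #endif
#endif

// Where the _Pragma branch is active the loop carries the unroll hint;
// elsewhere the macro vanishes and the code still compiles unchanged.
float sum4(const float (&values)[4])
{
    float acc = values[0];
    ROCPRIM_UNROLL
    for(unsigned int i = 1; i < 4; ++i)
    {
        acc += values[i];
    }
    return acc;
}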
ROCPRIM_HOST_DEVICE @@ -168,7 +163,13 @@ ROCPRIM_DEVICE inline auto store_volatile(T * output, T value) -> typename std::enable_if::value>::type { + // TODO: check GCC + // error: binding reference of type ‘const half_float::half&’ to ‘volatile half_float::half’ discards qualifiers +#if !(defined(__HIP_CPU_RT__ ) && defined(__GNUC__)) *const_cast(output) = value; +#else + *output = value; +#endif } template @@ -182,7 +183,7 @@ auto store_volatile(T * output, T value) auto input_ptr = reinterpret_cast(&value); auto output_ptr = reinterpret_cast(output); - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = 0; i < n; i++) { output_ptr[i] = input_ptr[i]; @@ -194,8 +195,14 @@ ROCPRIM_DEVICE inline auto load_volatile(T * input) -> typename std::enable_if::value, T>::type { + // TODO: check GCC + // error: binding reference of type ‘const half_float::half&’ to ‘volatile half_float::half’ discards qualifiers +#if !(defined(__HIP_CPU_RT__ ) && defined(__GNUC__)) T retval = *const_cast(input); return retval; +#else + return *input; +#endif } template @@ -210,7 +217,7 @@ auto load_volatile(T * input) auto output_ptr = reinterpret_cast(&retval); auto input_ptr = reinterpret_cast(input); - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = 0; i < n; i++) { output_ptr[i] = input_ptr[i]; diff --git a/rocprim/include/rocprim/device/detail/device_merge.hpp b/rocprim/include/rocprim/device/detail/device_merge.hpp index 29673ce58..5d3276cf0 100644 --- a/rocprim/include/rocprim/device/detail/device_merge.hpp +++ b/rocprim/include/rocprim/device/detail/device_merge.hpp @@ -161,7 +161,7 @@ void load(unsigned int flat_id, const size_t input1_size, const size_t input2_size) { - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = 0; i < ItemsPerThread; ++i) { unsigned int index = BlockSize * i + flat_id; @@ -193,7 +193,7 @@ void serial_merge(KeyType * keys_shared, KeyType a = keys_shared[range.begin1]; KeyType b = keys_shared[range.begin2]; - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = 0; i < ItemsPerThread; ++i) { bool compare = (range.begin2 >= range.end2) || @@ -298,7 +298,7 @@ merge_values(unsigned int flat_id, if(count >= ItemsPerThread * BlockSize) { - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = 0; i < ItemsPerThread; ++i) { values[i] = (index[i] < input1_size) ? 
values_input1[index[i]] : @@ -307,7 +307,7 @@ merge_values(unsigned int flat_id, } else { - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = 0; i < ItemsPerThread; ++i) { if(flat_id * ItemsPerThread + i < count) diff --git a/rocprim/include/rocprim/device/detail/device_partition.hpp b/rocprim/include/rocprim/device/detail/device_partition.hpp index 582fe70a8..cd1039362 100644 --- a/rocprim/include/rocprim/device/detail/device_partition.hpp +++ b/rocprim/include/rocprim/device/detail/device_partition.hpp @@ -182,7 +182,7 @@ auto partition_block_load_flags(InputIterator /* block_predecessor */, if(is_last_block) // last block { const auto offset = block_thread_id * ItemsPerThread; - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = 0; i < ItemsPerThread; i++) { if((offset + i) < valid_in_last_block) @@ -197,7 +197,7 @@ auto partition_block_load_flags(InputIterator /* block_predecessor */, } else { - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = 0; i < ItemsPerThread; i++) { is_selected[i] = predicate(values[i]); @@ -314,7 +314,7 @@ auto partition_block_load_flags(InputIterator block_predecessor, if(is_last_block) { const auto offset = block_thread_id * ItemsPerThread; - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = 0; i < ItemsPerThread; i++) { if((offset + i) >= valid_in_last_block) @@ -354,7 +354,7 @@ auto partition_scatter(ValueType (&values)[ItemsPerThread], // Scatter selected/rejected values to shared memory auto scatter_storage = storage.get(); - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = 0; i < ItemsPerThread; i++) { unsigned int item_index = (flat_block_thread_id * ItemsPerThread) + i; @@ -366,7 +366,7 @@ auto partition_scatter(ValueType (&values)[ItemsPerThread], } ::rocprim::syncthreads(); // sync threads to reuse shared memory - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = 0; i < ItemsPerThread; i++) { unsigned int item_index = (i * BlockSize) + flat_block_thread_id; @@ -421,7 +421,7 @@ auto partition_scatter(ValueType (&values)[ItemsPerThread], { // Scatter selected values to shared memory auto scatter_storage = storage.get(); - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = 0; i < ItemsPerThread; i++) { unsigned int scatter_index = output_indices[i] - selected_prefix; @@ -440,7 +440,7 @@ auto partition_scatter(ValueType (&values)[ItemsPerThread], } else { - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = 0; i < ItemsPerThread; i++) { if(!is_last_block || output_indices[i] < (selected_prefix + selected_in_block)) @@ -578,7 +578,7 @@ void partition_kernel_impl(InputIterator input, ); // Convert true/false is_selected flags to 0s and 1s - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = 0; i < items_per_thread; i++) { output_indices[i] = is_selected[i] ? 
1 : 0; diff --git a/rocprim/include/rocprim/device/detail/device_reduce.hpp b/rocprim/include/rocprim/device/detail/device_reduce.hpp index a234aa943..294ed357c 100644 --- a/rocprim/include/rocprim/device/detail/device_reduce.hpp +++ b/rocprim/include/rocprim/device/detail/device_reduce.hpp @@ -116,7 +116,7 @@ void block_reduce_kernel_impl(InputIterator input, ); output_value = values[0]; - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = 1; i < items_per_thread; i++) { unsigned int offset = i * block_size; diff --git a/rocprim/include/rocprim/device/detail/device_segmented_radix_sort.hpp b/rocprim/include/rocprim/device/detail/device_segmented_radix_sort.hpp index 38d64cdde..dc75b9933 100644 --- a/rocprim/include/rocprim/device/detail/device_segmented_radix_sort.hpp +++ b/rocprim/include/rocprim/device/detail/device_segmented_radix_sort.hpp @@ -68,8 +68,8 @@ class segmented_radix_sort_helper union storage_type { - typename count_helper_type::storage_type count_helper; - typename sort_and_scatter_helper::storage_type sort_and_scatter_helper; + typename segmented_radix_sort_helper::count_helper_type::storage_type count_helper; + typename segmented_radix_sort_helper::sort_and_scatter_helper::storage_type sort_and_scatter_helper; }; template< @@ -517,9 +517,9 @@ void segmented_sort(KeysInputIterator keys_input, ROCPRIM_SHARED_MEMORY union { - typename single_block_helper::storage_type single_block_helper; - typename long_radix_helper_type::storage_type long_radix_helper; - typename short_radix_helper_type::storage_type short_radix_helper; + typename rocprim::detail::segmented_radix_sort_single_block_helper::storage_type single_block_helper; + typename rocprim::detail::segmented_radix_sort_helper::storage_type long_radix_helper; + typename rocprim::detail::segmented_radix_sort_helper::storage_type short_radix_helper; } storage; const unsigned int segment_id = ::rocprim::detail::block_id<0>(); diff --git a/rocprim/include/rocprim/device/detail/device_transform.hpp b/rocprim/include/rocprim/device/detail/device_transform.hpp index 593ff885e..74aefc16b 100644 --- a/rocprim/include/rocprim/device/detail/device_transform.hpp +++ b/rocprim/include/rocprim/device/detail/device_transform.hpp @@ -109,7 +109,7 @@ void transform_kernel_impl(InputIterator input, valid_in_last_block ); - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = 0; i < ItemsPerThread; i++) { if(BlockSize * i + flat_id < valid_in_last_block) @@ -133,7 +133,7 @@ void transform_kernel_impl(InputIterator input, input_values ); - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = 0; i < ItemsPerThread; i++) { output_values[i] = transform_op(input_values[i]); diff --git a/rocprim/include/rocprim/device/detail/lookback_scan_state.hpp b/rocprim/include/rocprim/device/detail/lookback_scan_state.hpp index 65709d045..947e5a3ec 100644 --- a/rocprim/include/rocprim/device/detail/lookback_scan_state.hpp +++ b/rocprim/include/rocprim/device/detail/lookback_scan_state.hpp @@ -122,7 +122,11 @@ struct lookback_scan_state prefix_type prefix; prefix.flag = PREFIX_EMPTY; prefix_underlying_type p; +#ifndef __HIP_CPU_RT__ __builtin_memcpy(&p, &prefix, sizeof(prefix_type)); +#else + std::memcpy(&p, &prefix, sizeof(prefix_type)); +#endif prefixes[padding + block_id] = p; } if(block_id < padding) @@ -130,7 +134,11 @@ struct lookback_scan_state prefix_type prefix; prefix.flag = PREFIX_INVALID; prefix_underlying_type p; +#ifndef __HIP_CPU_RT__ __builtin_memcpy(&p, &prefix, sizeof(prefix_type)); +#else + std::memcpy(&p, &prefix, sizeof(prefix_type)); 
+#endif prefixes[block_id] = p; } } @@ -155,23 +163,35 @@ struct lookback_scan_state prefix_type prefix; - const uint SLEEP_MAX = 32; - uint times_through = 1; + const unsigned int SLEEP_MAX = 32; + unsigned int times_through = 1; prefix_underlying_type p = ::rocprim::detail::atomic_add(&prefixes[padding + block_id], 0); +#ifndef __HIP_CPU_RT__ __builtin_memcpy(&prefix, &p, sizeof(prefix_type)); +#else + std::memcpy(&prefix, &p, sizeof(prefix_type)); +#endif while(prefix.flag == PREFIX_EMPTY) { if (UseSleep) { - for (uint j = 0; j < times_through; j++) + for (unsigned int j = 0; j < times_through; j++) +#ifndef __HIP_CPU_RT__ __builtin_amdgcn_s_sleep(1); +#else + std::this_thread::sleep_for(std::chrono::microseconds{1}); +#endif if (times_through < SLEEP_MAX) times_through++; } // atomic_add(..., 0) is used to load values atomically prefix_underlying_type p = ::rocprim::detail::atomic_add(&prefixes[padding + block_id], 0); +#ifndef __HIP_CPU_RT__ __builtin_memcpy(&prefix, &p, sizeof(prefix_type)); +#else + std::memcpy(&prefix, &p, sizeof(prefix_type)); +#endif } // return @@ -187,7 +207,11 @@ struct lookback_scan_state prefix_type prefix = { flag, value }; prefix_underlying_type p; +#ifndef __HIP_CPU_RT__ __builtin_memcpy(&p, &prefix, sizeof(prefix_type)); +#else + std::memcpy(&p, &prefix, sizeof(prefix_type)); +#endif ::rocprim::detail::atomic_exch(&prefixes[padding + block_id], p); } @@ -273,8 +297,8 @@ struct lookback_scan_state { constexpr unsigned int padding = ::rocprim::device_warp_size(); - const uint SLEEP_MAX = 32; - uint times_through = 1; + const unsigned int SLEEP_MAX = 32; + unsigned int times_through = 1; flag = load_volatile(&prefixes_flags[padding + block_id]); ::rocprim::detail::memory_fence_device(); @@ -282,8 +306,12 @@ struct lookback_scan_state { if (UseSleep) { - for (uint j = 0; j < times_through; j++) + for (unsigned int j = 0; j < times_through; j++) +#ifndef __HIP_CPU_RT__ __builtin_amdgcn_s_sleep(1); +#else + std::this_thread::sleep_for(std::chrono::microseconds{1}); +#endif if (times_through < SLEEP_MAX) times_through++; } diff --git a/rocprim/include/rocprim/device/device_histogram.hpp b/rocprim/include/rocprim/device/device_histogram.hpp index 8cf9a6281..26c94d531 100644 --- a/rocprim/include/rocprim/device/device_histogram.hpp +++ b/rocprim/include/rocprim/device/device_histogram.hpp @@ -110,16 +110,16 @@ void histogram_global_kernel(SampleIterator samples, #define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \ { \ - auto error = hipPeekAtLastError(); \ - if(error != hipSuccess) return error; \ + auto _error = hipPeekAtLastError(); \ + if(_error != hipSuccess) return _error; \ if(debug_synchronous) \ { \ std::cout << name << "(" << size << ")"; \ - auto error = hipStreamSynchronize(stream); \ - if(error != hipSuccess) return error; \ - auto end = std::chrono::high_resolution_clock::now(); \ - auto d = std::chrono::duration_cast>(end - start); \ - std::cout << " " << d.count() * 1000 << " ms" << '\n'; \ + auto __error = hipStreamSynchronize(stream); \ + if(__error != hipSuccess) return __error; \ + auto _end = std::chrono::high_resolution_clock::now(); \ + auto _d = std::chrono::duration_cast>(_end - start); \ + std::cout << " " << _d.count() * 1000 << " ms" << '\n'; \ } \ } @@ -151,9 +151,9 @@ hipError_t histogram_impl(void * temporary_storage, default_histogram_config >; - constexpr unsigned int block_size = config::histogram::block_size; - constexpr unsigned int items_per_thread = config::histogram::items_per_thread; - constexpr unsigned 
int items_per_block = block_size * items_per_thread; + static constexpr unsigned int block_size = config::histogram::block_size; + static constexpr unsigned int items_per_thread = config::histogram::items_per_thread; + static constexpr unsigned int items_per_block = block_size * items_per_thread; if(row_stride_bytes % sizeof(sample_type) != 0) { diff --git a/rocprim/include/rocprim/device/device_merge.hpp b/rocprim/include/rocprim/device/device_merge.hpp index 46ecfbc35..16a118950 100644 --- a/rocprim/include/rocprim/device/device_merge.hpp +++ b/rocprim/include/rocprim/device/device_merge.hpp @@ -94,16 +94,16 @@ void merge_kernel(IndexIterator index, #define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \ { \ - auto error = hipPeekAtLastError(); \ - if(error != hipSuccess) return error; \ + auto _error = hipPeekAtLastError(); \ + if(_error != hipSuccess) return _error; \ if(debug_synchronous) \ { \ std::cout << name << "(" << size << ")"; \ - auto error = hipStreamSynchronize(stream); \ - if(error != hipSuccess) return error; \ - auto end = std::chrono::high_resolution_clock::now(); \ - auto d = std::chrono::duration_cast>(end - start); \ - std::cout << " " << d.count() * 1000 << " ms" << '\n'; \ + auto __error = hipStreamSynchronize(stream); \ + if(__error != hipSuccess) return __error; \ + auto _end = std::chrono::high_resolution_clock::now(); \ + auto _d = std::chrono::duration_cast>(_end - start); \ + std::cout << " " << _d.count() * 1000 << " ms" << '\n'; \ } \ } @@ -142,10 +142,10 @@ hipError_t merge_impl(void * temporary_storage, detail::default_merge_config >; - constexpr unsigned int block_size = config::block_size; - constexpr unsigned int half_block = block_size / 2; - constexpr unsigned int items_per_thread = config::items_per_thread; - constexpr auto items_per_block = block_size * items_per_thread; + static constexpr unsigned int block_size = config::block_size; + static constexpr unsigned int half_block = block_size / 2; + static constexpr unsigned int items_per_thread = config::items_per_thread; + static constexpr auto items_per_block = block_size * items_per_thread; const unsigned int partitions = ((input1_size + input2_size) + items_per_block - 1) / items_per_block; const size_t partition_bytes = (partitions + 1) * sizeof(unsigned int); diff --git a/rocprim/include/rocprim/device/device_merge_sort.hpp b/rocprim/include/rocprim/device/device_merge_sort.hpp index 843c98d59..076b43aea 100644 --- a/rocprim/include/rocprim/device/device_merge_sort.hpp +++ b/rocprim/include/rocprim/device/device_merge_sort.hpp @@ -98,16 +98,16 @@ void block_merge_kernel(KeysInputIterator keys_input, #define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \ { \ - auto error = hipPeekAtLastError(); \ - if(error != hipSuccess) return error; \ + auto _error = hipPeekAtLastError(); \ + if(_error != hipSuccess) return _error; \ if(debug_synchronous) \ { \ std::cout << name << "(" << size << ")"; \ - auto error = hipStreamSynchronize(stream); \ - if(error != hipSuccess) return error; \ - auto end = std::chrono::high_resolution_clock::now(); \ - auto d = std::chrono::duration_cast>(end - start); \ - std::cout << " " << d.count() * 1000 << " ms" << '\n'; \ + auto __error = hipStreamSynchronize(stream); \ + if(__error != hipSuccess) return __error; \ + auto _end = std::chrono::high_resolution_clock::now(); \ + auto _d = std::chrono::duration_cast>(_end - start); \ + std::cout << " " << _d.count() * 1000 << " ms" << '\n'; \ } \ } @@ -142,7 +142,7 @@ hipError_t 
merge_sort_impl(void * temporary_storage, >; // Block size - constexpr unsigned int block_size = config::block_size; + static constexpr unsigned int block_size = config::block_size; const size_t keys_bytes = ::rocprim::detail::align_size(size * sizeof(key_type)); const size_t values_bytes = diff --git a/rocprim/include/rocprim/device/device_partition.hpp b/rocprim/include/rocprim/device/device_partition.hpp index 986021723..96839c0b2 100644 --- a/rocprim/include/rocprim/device/device_partition.hpp +++ b/rocprim/include/rocprim/device/device_partition.hpp @@ -96,16 +96,16 @@ void init_offset_scan_state_kernel(OffsetLookBackScanState offset_scan_state, #define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \ { \ - auto error = hipPeekAtLastError(); \ - if(error != hipSuccess) return error; \ + auto _error = hipPeekAtLastError(); \ + if(_error != hipSuccess) return _error; \ if(debug_synchronous) \ { \ std::cout << name << "(" << size << ")"; \ - auto error = hipStreamSynchronize(stream); \ - if(error != hipSuccess) return error; \ - auto end = std::chrono::high_resolution_clock::now(); \ - auto d = std::chrono::duration_cast>(end - start); \ - std::cout << " " << d.count() * 1000 << " ms" << '\n'; \ + auto __error = hipStreamSynchronize(stream); \ + if(__error != hipSuccess) return __error; \ + auto _end = std::chrono::high_resolution_clock::now(); \ + auto _d = std::chrono::duration_cast>(_end - start); \ + std::cout << " " << _d.count() * 1000 << " ms" << '\n'; \ } \ } @@ -149,9 +149,9 @@ hipError_t partition_impl(void * temporary_storage, using ordered_block_id_type = detail::ordered_block_id; - constexpr unsigned int block_size = config::block_size; - constexpr unsigned int items_per_thread = config::items_per_thread; - constexpr auto items_per_block = block_size * items_per_thread; + static constexpr unsigned int block_size = config::block_size; + static constexpr unsigned int items_per_thread = config::items_per_thread; + static constexpr auto items_per_block = block_size * items_per_thread; const unsigned int number_of_blocks = std::max(1u, static_cast((size + items_per_block - 1)/items_per_block)); @@ -232,7 +232,7 @@ hipError_t partition_impl(void * temporary_storage, HIP_KERNEL_NAME(partition_kernel< SelectMethod, OnlySelected, config, InputIterator, FlagIterator, OutputIterator, SelectedCountOutputIterator, - UnaryPredicate, decltype(inequality_op), offset_scan_state_with_sleep_type + UnaryPredicate, InequalityOp, offset_scan_state_with_sleep_type >), dim3(grid_size), dim3(block_size), 0, stream, input, flags, output, selected_count_output, size, predicate, @@ -244,7 +244,7 @@ hipError_t partition_impl(void * temporary_storage, HIP_KERNEL_NAME(partition_kernel< SelectMethod, OnlySelected, config, InputIterator, FlagIterator, OutputIterator, SelectedCountOutputIterator, - UnaryPredicate, decltype(inequality_op), offset_scan_state_type + UnaryPredicate, InequalityOp, offset_scan_state_type >), dim3(grid_size), dim3(block_size), 0, stream, input, flags, output, selected_count_output, size, predicate, diff --git a/rocprim/include/rocprim/device/device_radix_sort.hpp b/rocprim/include/rocprim/device/device_radix_sort.hpp index 5419ac0b1..d3f19b274 100644 --- a/rocprim/include/rocprim/device/device_radix_sort.hpp +++ b/rocprim/include/rocprim/device/device_radix_sort.hpp @@ -127,16 +127,16 @@ void sort_and_scatter_kernel(KeysInputIterator keys_input, #define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \ { \ - auto error = hipPeekAtLastError(); \ - 
if(error != hipSuccess) return error; \ + auto _error = hipPeekAtLastError(); \ + if(_error != hipSuccess) return _error; \ if(debug_synchronous) \ { \ std::cout << name << "(" << size << ")"; \ - auto error = hipStreamSynchronize(stream); \ - if(error != hipSuccess) return error; \ - auto end = std::chrono::high_resolution_clock::now(); \ - auto d = std::chrono::duration_cast>(end - start); \ - std::cout << " " << d.count() * 1000 << " ms" << '\n'; \ + auto __error = hipStreamSynchronize(stream); \ + if(__error != hipSuccess) return __error; \ + auto _end = std::chrono::high_resolution_clock::now(); \ + auto _d = std::chrono::duration_cast>(_end - start); \ + std::cout << " " << _d.count() * 1000 << " ms" << '\n'; \ } \ } diff --git a/rocprim/include/rocprim/device/device_radix_sort_config.hpp b/rocprim/include/rocprim/device/device_radix_sort_config.hpp index 1fe6940cb..84eb68651 100644 --- a/rocprim/include/rocprim/device/device_radix_sort_config.hpp +++ b/rocprim/include/rocprim/device/device_radix_sort_config.hpp @@ -155,6 +155,51 @@ struct radix_sort_config_900 select_type_case, kernel_config<256, 15> > > > { }; + +template +struct radix_sort_config_908 +{ + static constexpr unsigned int item_scale = + ::rocprim::detail::ceiling_div(::rocprim::max(sizeof(Key), sizeof(Value)), sizeof(int)); + + using scan = kernel_config<256, 2>; + + using type = select_type< + select_type_case< + (sizeof(Key) == 1 && sizeof(Value) <= 8), + radix_sort_config<4, 4, scan, kernel_config<256, 10> > + >, + select_type_case< + (sizeof(Key) == 2 && sizeof(Value) <= 8), + radix_sort_config<6, 5, scan, kernel_config<256, 10> > + >, + select_type_case< + (sizeof(Key) == 4 && sizeof(Value) <= 8), + radix_sort_config<7, 6, kernel_config<256, 4>, kernel_config<256, 15> > + >, + select_type_case< + (sizeof(Key) == 8 && sizeof(Value) <= 8), + radix_sort_config<7, 6, kernel_config<256, 4>, kernel_config<256, 14> > + >, + radix_sort_config< + 6, 4, scan, + kernel_config< + limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_64>::value, + ::rocprim::max(1u, 15u / item_scale) + > + > + >; +}; + +template +struct radix_sort_config_908 + : select_type< + select_type_case, kernel_config<256, 10> > >, + select_type_case, kernel_config<256, 10> > >, + select_type_case, kernel_config<256, 17> > >, + select_type_case, kernel_config<256, 15> > > + > { }; + // TODO: We need to update these parameters template struct radix_sort_config_90a @@ -251,6 +296,7 @@ struct default_radix_sort_config TargetArch, select_arch_case<803, radix_sort_config_803 >, select_arch_case<900, radix_sort_config_900 >, + select_arch_case<908, radix_sort_config_908 >, select_arch_case >, select_arch_case<1030, radix_sort_config_1030 >, radix_sort_config_900 diff --git a/rocprim/include/rocprim/device/device_reduce.hpp b/rocprim/include/rocprim/device/device_reduce.hpp index 44fc9a6bf..9503d5585 100644 --- a/rocprim/include/rocprim/device/device_reduce.hpp +++ b/rocprim/include/rocprim/device/device_reduce.hpp @@ -65,25 +65,25 @@ void block_reduce_kernel(InputIterator input, if(debug_synchronous) \ { \ std::cout << name << "(" << size << ")"; \ - auto error = hipStreamSynchronize(stream); \ - if(error != hipSuccess) return error; \ - auto end = std::chrono::high_resolution_clock::now(); \ - auto d = std::chrono::duration_cast>(end - start); \ - std::cout << " " << d.count() * 1000 << " ms" << '\n'; \ + auto _error = hipStreamSynchronize(stream); \ + if(_error != hipSuccess) return _error; \ + auto _end = std::chrono::high_resolution_clock::now(); \ + 
auto _d = std::chrono::duration_cast>(_end - start); \ + std::cout << " " << _d.count() * 1000 << " ms" << '\n'; \ } #define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \ { \ - auto error = hipPeekAtLastError(); \ - if(error != hipSuccess) return error; \ + auto _error = hipPeekAtLastError(); \ + if(_error != hipSuccess) return _error; \ if(debug_synchronous) \ { \ std::cout << name << "(" << size << ")"; \ - auto error = hipStreamSynchronize(stream); \ - if(error != hipSuccess) return error; \ - auto end = std::chrono::high_resolution_clock::now(); \ - auto d = std::chrono::duration_cast>(end - start); \ - std::cout << " " << d.count() * 1000 << " ms" << '\n'; \ + auto __error = hipStreamSynchronize(stream); \ + if(__error != hipSuccess) return __error; \ + auto _end = std::chrono::high_resolution_clock::now(); \ + auto _d = std::chrono::duration_cast>(_end - start); \ + std::cout << " " << _d.count() * 1000 << " ms" << '\n'; \ } \ } diff --git a/rocprim/include/rocprim/device/device_reduce_by_key.hpp b/rocprim/include/rocprim/device/device_reduce_by_key.hpp index 1c0f732c0..9b0fb2e80 100644 --- a/rocprim/include/rocprim/device/device_reduce_by_key.hpp +++ b/rocprim/include/rocprim/device/device_reduce_by_key.hpp @@ -137,16 +137,16 @@ void scan_and_scatter_carry_outs_kernel(const carry_out * carry_outs, #define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \ { \ - auto error = hipPeekAtLastError(); \ - if(error != hipSuccess) return error; \ + auto _error = hipPeekAtLastError(); \ + if(_error != hipSuccess) return _error; \ if(debug_synchronous) \ { \ std::cout << name << "(" << size << ")"; \ - auto error = hipStreamSynchronize(stream); \ - if(error != hipSuccess) return error; \ - auto end = std::chrono::high_resolution_clock::now(); \ - auto d = std::chrono::duration_cast>(end - start); \ - std::cout << " " << d.count() * 1000 << " ms" << '\n'; \ + auto __error = hipStreamSynchronize(stream); \ + if(__error != hipSuccess) return __error; \ + auto _end = std::chrono::high_resolution_clock::now(); \ + auto _d = std::chrono::duration_cast>(_end - start); \ + std::cout << " " << _d.count() * 1000 << " ms" << '\n'; \ } \ } diff --git a/rocprim/include/rocprim/device/device_scan.hpp b/rocprim/include/rocprim/device/device_scan.hpp index d2abcc852..363ec436e 100644 --- a/rocprim/include/rocprim/device/device_scan.hpp +++ b/rocprim/include/rocprim/device/device_scan.hpp @@ -160,16 +160,16 @@ void init_lookback_scan_state_kernel(LookBackScanState lookback_scan_state, #define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \ { \ - auto error = hipPeekAtLastError(); \ - if(error != hipSuccess) return error; \ + auto _error = hipPeekAtLastError(); \ + if(_error != hipSuccess) return _error; \ if(debug_synchronous) \ { \ std::cout << name << "(" << size << ")"; \ - auto error = hipStreamSynchronize(stream); \ - if(error != hipSuccess) return error; \ - auto end = std::chrono::high_resolution_clock::now(); \ - auto d = std::chrono::duration_cast>(end - start); \ - std::cout << " " << d.count() * 1000 << " ms" << '\n'; \ + auto __error = hipStreamSynchronize(stream); \ + if(__error != hipSuccess) return __error; \ + auto _end = std::chrono::high_resolution_clock::now(); \ + auto _d = std::chrono::duration_cast>(_end - start); \ + std::cout << " " << _d.count() * 1000 << " ms" << '\n'; \ } \ } diff --git a/rocprim/include/rocprim/device/device_scan_config.hpp b/rocprim/include/rocprim/device/device_scan_config.hpp index 737399dd1..4d7603935 100644 --- 
a/rocprim/include/rocprim/device/device_scan_config.hpp +++ b/rocprim/include/rocprim/device/device_scan_config.hpp @@ -62,11 +62,11 @@ struct scan_config /// \brief Whether to use lookback scan or reduce-then-scan algorithm. static constexpr bool use_lookback = UseLookback; /// \brief Method for loading input values. - static constexpr block_load_method block_load_method = BlockLoadMethod; + static constexpr ::rocprim::block_load_method block_load_method = BlockLoadMethod; /// \brief Method for storing values. - static constexpr block_store_method block_store_method = BlockStoreMethod; + static constexpr ::rocprim::block_store_method block_store_method = BlockStoreMethod; /// \brief Algorithm for block scan. - static constexpr block_scan_algorithm block_scan_method = BlockScanMethod; + static constexpr ::rocprim::block_scan_algorithm block_scan_method = BlockScanMethod; }; namespace detail @@ -130,7 +130,7 @@ struct scan_config_1030 using type = scan_config< limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_32>::value, - ::rocprim::max(1u, 16u / item_scale), + ::rocprim::max(1u, 15u / item_scale), ROCPRIM_DETAIL_USE_LOOKBACK_SCAN, ::rocprim::block_load_method::block_load_transpose, ::rocprim::block_store_method::block_store_transpose, diff --git a/rocprim/include/rocprim/device/device_segmented_radix_sort.hpp b/rocprim/include/rocprim/device/device_segmented_radix_sort.hpp index e5855b538..2a93832d0 100644 --- a/rocprim/include/rocprim/device/device_segmented_radix_sort.hpp +++ b/rocprim/include/rocprim/device/device_segmented_radix_sort.hpp @@ -83,16 +83,16 @@ void segmented_sort_kernel(KeysInputIterator keys_input, #define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \ { \ - auto error = hipPeekAtLastError(); \ - if(error != hipSuccess) return error; \ + auto _error = hipPeekAtLastError(); \ + if(_error != hipSuccess) return _error; \ if(debug_synchronous) \ { \ std::cout << name << "(" << size << ")"; \ - auto error = hipStreamSynchronize(stream); \ - if(error != hipSuccess) return error; \ - auto end = std::chrono::high_resolution_clock::now(); \ - auto d = std::chrono::duration_cast>(end - start); \ - std::cout << " " << d.count() * 1000 << " ms" << '\n'; \ + auto __error = hipStreamSynchronize(stream); \ + if(__error != hipSuccess) return __error; \ + auto _end = std::chrono::high_resolution_clock::now(); \ + auto _d = std::chrono::duration_cast>(_end - start); \ + std::cout << " " << _d.count() * 1000 << " ms" << '\n'; \ } \ } diff --git a/rocprim/include/rocprim/device/device_segmented_reduce.hpp b/rocprim/include/rocprim/device/device_segmented_reduce.hpp index 29261ea2f..55955381d 100644 --- a/rocprim/include/rocprim/device/device_segmented_reduce.hpp +++ b/rocprim/include/rocprim/device/device_segmented_reduce.hpp @@ -67,16 +67,16 @@ void segmented_reduce_kernel(InputIterator input, #define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \ { \ - auto error = hipPeekAtLastError(); \ - if(error != hipSuccess) return error; \ + auto _error = hipPeekAtLastError(); \ + if(_error != hipSuccess) return _error; \ if(debug_synchronous) \ { \ std::cout << name << "(" << size << ")"; \ - auto error = hipStreamSynchronize(stream); \ - if(error != hipSuccess) return error; \ - auto end = std::chrono::high_resolution_clock::now(); \ - auto d = std::chrono::duration_cast>(end - start); \ - std::cout << " " << d.count() * 1000 << " ms" << '\n'; \ + auto __error = hipStreamSynchronize(stream); \ + if(__error != hipSuccess) return __error; \ + auto _end = 
std::chrono::high_resolution_clock::now(); \ + auto _d = std::chrono::duration_cast>(_end - start); \ + std::cout << " " << _d.count() * 1000 << " ms" << '\n'; \ } \ } diff --git a/rocprim/include/rocprim/device/device_segmented_scan.hpp b/rocprim/include/rocprim/device/device_segmented_scan.hpp index d1f1765ae..bcc175a25 100644 --- a/rocprim/include/rocprim/device/device_segmented_scan.hpp +++ b/rocprim/include/rocprim/device/device_segmented_scan.hpp @@ -73,16 +73,16 @@ void segmented_scan_kernel(InputIterator input, #define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \ { \ - auto error = hipPeekAtLastError(); \ - if(error != hipSuccess) return error; \ + auto _error = hipPeekAtLastError(); \ + if(_error != hipSuccess) return _error; \ if(debug_synchronous) \ { \ std::cout << name << "(" << size << ")"; \ - auto error = hipStreamSynchronize(stream); \ - if(error != hipSuccess) return error; \ - auto end = std::chrono::high_resolution_clock::now(); \ - auto d = std::chrono::duration_cast>(end - start); \ - std::cout << " " << d.count() * 1000 << " ms" << '\n'; \ + auto __error = hipStreamSynchronize(stream); \ + if(__error != hipSuccess) return __error; \ + auto _end = std::chrono::high_resolution_clock::now(); \ + auto _d = std::chrono::duration_cast>(_end - start); \ + std::cout << " " << _d.count() * 1000 << " ms" << '\n'; \ } \ } diff --git a/rocprim/include/rocprim/device/device_transform.hpp b/rocprim/include/rocprim/device/device_transform.hpp index 2c381032f..994ae5c06 100644 --- a/rocprim/include/rocprim/device/device_transform.hpp +++ b/rocprim/include/rocprim/device/device_transform.hpp @@ -63,16 +63,16 @@ void transform_kernel(InputIterator input, #define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \ { \ - auto error = hipPeekAtLastError(); \ - if(error != hipSuccess) return error; \ + auto _error = hipPeekAtLastError(); \ + if(_error != hipSuccess) return _error; \ if(debug_synchronous) \ { \ std::cout << name << "(" << size << ")"; \ - auto error = hipStreamSynchronize(stream); \ - if(error != hipSuccess) return error; \ - auto end = std::chrono::high_resolution_clock::now(); \ - auto d = std::chrono::duration_cast>(end - start); \ - std::cout << " " << d.count() * 1000 << " ms" << '\n'; \ + _error = hipStreamSynchronize(stream); \ + if(_error != hipSuccess) return _error; \ + auto _end = std::chrono::high_resolution_clock::now(); \ + auto _d = std::chrono::duration_cast>(_end - start); \ + std::cout << " " << _d.count() * 1000 << " ms" << '\n'; \ } \ } @@ -158,9 +158,9 @@ hipError_t transform(InputIterator input, detail::default_transform_config >; - constexpr unsigned int block_size = config::block_size; - constexpr unsigned int items_per_thread = config::items_per_thread; - constexpr auto items_per_block = block_size * items_per_thread; + static constexpr unsigned int block_size = config::block_size; + static constexpr unsigned int items_per_thread = config::items_per_thread; + static constexpr auto items_per_block = block_size * items_per_thread; // Start point for time measurements std::chrono::high_resolution_clock::time_point start; diff --git a/rocprim/include/rocprim/functional.hpp b/rocprim/include/rocprim/functional.hpp index fcd41a998..5d7fde783 100644 --- a/rocprim/include/rocprim/functional.hpp +++ b/rocprim/include/rocprim/functional.hpp @@ -33,9 +33,9 @@ BEGIN_ROCPRIM_NAMESPACE #define ROCPRIM_PRINT_ERROR_ONCE(message) \ { \ - unsigned int idx = hipThreadIdx_x + (hipBlockIdx_x * hipBlockDim_x); \ - idx += hipThreadIdx_y + 
(hipBlockIdx_y * hipBlockDim_y); \ - idx += hipThreadIdx_z + (hipBlockIdx_z * hipBlockDim_z); \ + unsigned int idx = threadIdx.x + (blockIdx.x * blockDim.x); \ + idx += threadIdx.y + (blockIdx.y * blockDim.y); \ + idx += threadIdx.z + (blockIdx.z * blockDim.z); \ if (idx == 0) \ printf("%s\n", #message); \ } diff --git a/rocprim/include/rocprim/intrinsics/thread.hpp b/rocprim/include/rocprim/intrinsics/thread.hpp index 2ff0e9ef7..155e6eb98 100644 --- a/rocprim/include/rocprim/intrinsics/thread.hpp +++ b/rocprim/include/rocprim/intrinsics/thread.hpp @@ -76,7 +76,7 @@ constexpr unsigned int device_warp_size() ROCPRIM_DEVICE inline unsigned int flat_block_size() { - return hipBlockDim_z * hipBlockDim_y * hipBlockDim_x; + return blockDim.z * blockDim.y * blockDim.x; } /// \brief Returns flat size of a multidimensional tile (block). @@ -92,16 +92,21 @@ unsigned int flat_tile_size() ROCPRIM_DEVICE inline unsigned int lane_id() { +#ifndef __HIP_CPU_RT__ return ::__lane_id(); +#else + using namespace hip::detail; + return id(Fiber::this_fiber()) % warpSize; +#endif } /// \brief Returns flat (linear, 1D) thread identifier in a multidimensional block (tile). ROCPRIM_DEVICE inline unsigned int flat_block_thread_id() { - return (hipThreadIdx_z * hipBlockDim_y * hipBlockDim_x) - + (hipThreadIdx_y * hipBlockDim_x) - + hipThreadIdx_x; + return (threadIdx.z * blockDim.y * blockDim.x) + + (threadIdx.y * blockDim.x) + + threadIdx.x; } /// \brief Returns flat (linear, 1D) thread identifier in a multidimensional block (tile). Use template parameters to optimize 1D or 2D kernels. @@ -110,7 +115,7 @@ ROCPRIM_DEVICE inline auto flat_block_thread_id() -> typename std::enable_if<(BlockSizeY == 1 && BlockSizeZ == 1), unsigned int>::type { - return hipThreadIdx_x; + return threadIdx.x; } template @@ -118,7 +123,7 @@ ROCPRIM_DEVICE inline auto flat_block_thread_id() -> typename std::enable_if<(BlockSizeY > 1 && BlockSizeZ == 1), unsigned int>::type { - return hipThreadIdx_x + (hipThreadIdx_y * hipBlockDim_x); + return threadIdx.x + (threadIdx.y * blockDim.x); } template @@ -126,8 +131,8 @@ ROCPRIM_DEVICE inline auto flat_block_thread_id() -> typename std::enable_if<(BlockSizeY > 1 && BlockSizeZ > 1), unsigned int>::type { - return hipThreadIdx_x + (hipThreadIdx_y * hipBlockDim_x) + - (hipThreadIdx_z * hipBlockDim_y * hipBlockDim_x); + return threadIdx.x + (threadIdx.y * blockDim.x) + + (threadIdx.z * blockDim.y * blockDim.x); } /// \brief Returns flat (linear, 1D) thread identifier in a multidimensional tile (block). 
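The hipThreadIdx_*/hipBlockDim_*/hipGridDim_* spellings replaced in this file are hipcc-specific, whereas the standard threadIdx/blockIdx/blockDim/gridDim objects are also provided by the HIP-CPU runtime, which is what these replacements rely on; the computed values are identical. A minimal sketch of the same flat-index arithmetic with the new spellings; the kernel name and output buffer are illustrative only:

#include <hip/hip_runtime.h>

// Computes the same flat block/thread indices as rocprim::flat_block_id() and
// rocprim::flat_block_thread_id(), using the portable threadIdx/blockIdx spellings.
__global__ void flat_id_example(unsigned int* out)
{
    const unsigned int block  = blockIdx.x + (blockIdx.y * gridDim.x)
                              + (blockIdx.z * gridDim.y * gridDim.x);
    const unsigned int thread = threadIdx.x + (threadIdx.y * blockDim.x)
                              + (threadIdx.z * blockDim.y * blockDim.x);
    const unsigned int block_size = blockDim.x * blockDim.y * blockDim.z;
    out[block * block_size + thread] = thread;
}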
@@ -162,9 +167,9 @@ unsigned int warp_id() ROCPRIM_DEVICE inline unsigned int flat_block_id() { - return (hipBlockIdx_z * hipGridDim_y * hipGridDim_x) - + (hipBlockIdx_y * hipGridDim_x) - + hipBlockIdx_x; + return (blockIdx.z * gridDim.y * gridDim.x) + + (blockIdx.y * gridDim.x) + + blockIdx.x; } template @@ -172,7 +177,7 @@ ROCPRIM_DEVICE inline auto flat_block_id() -> typename std::enable_if<(BlockSizeY == 1 && BlockSizeZ == 1), unsigned int>::type { - return hipBlockIdx_x; + return blockIdx.x; } template @@ -180,7 +185,7 @@ ROCPRIM_DEVICE inline auto flat_block_id() -> typename std::enable_if<(BlockSizeY > 1 && BlockSizeZ == 1), unsigned int>::type { - return hipBlockIdx_x + (hipBlockIdx_y * hipGridDim_x); + return blockIdx.x + (blockIdx.y * gridDim.x); } template @@ -188,8 +193,8 @@ ROCPRIM_DEVICE inline auto flat_block_id() -> typename std::enable_if<(BlockSizeY > 1 && BlockSizeZ > 1), unsigned int>::type { - return hipBlockIdx_x + (hipBlockIdx_y * hipGridDim_x) + - (hipBlockIdx_z * hipGridDim_y * hipGridDim_x); + return blockIdx.x + (blockIdx.y * gridDim.x) + + (blockIdx.z * gridDim.y * gridDim.x); } // Sync @@ -243,7 +248,7 @@ namespace detail return 0; } - #define ROCPRIM_DETAIL_CONCAT(A, B) A ## B + #define ROCPRIM_DETAIL_CONCAT(A, B) A B #define ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNC(name, prefix, dim, suffix) \ template<> \ ROCPRIM_DEVICE inline \ @@ -256,10 +261,10 @@ namespace detail ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNC(name, prefix, 1, y) \ ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNC(name, prefix, 2, z) - ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNCS(block_thread_id, hipThreadIdx_) - ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNCS(block_id, hipBlockIdx_) - ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNCS(block_size, hipBlockDim_) - ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNCS(grid_size, hipGridDim_) + ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNCS(block_thread_id, threadIdx.) + ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNCS(block_id, blockIdx.) + ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNCS(block_size, blockDim.) + ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNCS(grid_size, gridDim.) 
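The switch of ROCPRIM_DETAIL_CONCAT from "A ## B" to "A B" goes hand in hand with the prefix change above: pasting the trailing '.' of "threadIdx." onto a dimension suffix does not form a valid preprocessing token, while plain juxtaposition simply leaves an ordinary member access behind. A small illustration; the two macro names are illustrative variants of the single ROCPRIM_DETAIL_CONCAT in the hunk:

#define ROCPRIM_DETAIL_CONCAT_PASTE(A, B) A ## B
#define ROCPRIM_DETAIL_CONCAT_JUXTAPOSE(A, B) A B

// ROCPRIM_DETAIL_CONCAT_PASTE(hipThreadIdx_, x)  -> hipThreadIdx_x   (single identifier, valid)
// ROCPRIM_DETAIL_CONCAT_PASTE(threadIdx., x)     -> ill-formed: '.' ## 'x' is not a preprocessing token
// ROCPRIM_DETAIL_CONCAT_JUXTAPOSE(threadIdx., x) -> threadIdx . x    (ordinary member access, valid)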
#undef ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNCS #undef ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNC diff --git a/rocprim/include/rocprim/intrinsics/warp.hpp b/rocprim/include/rocprim/intrinsics/warp.hpp index 4814bb227..67872821b 100644 --- a/rocprim/include/rocprim/intrinsics/warp.hpp +++ b/rocprim/include/rocprim/intrinsics/warp.hpp @@ -47,20 +47,27 @@ ROCPRIM_DEVICE inline unsigned int masked_bit_count(lane_mask_type x, unsigned int add = 0) { int c; - #if __AMDGCN_WAVEFRONT_SIZE == 32 - #ifdef __HIP__ - c = ::__builtin_amdgcn_mbcnt_lo(x, add); + #ifndef __HIP_CPU_RT__ + #if __AMDGCN_WAVEFRONT_SIZE == 32 + #ifdef __HIP__ + c = ::__builtin_amdgcn_mbcnt_lo(x, add); + #else + c = ::__mbcnt_lo(x, add); + #endif #else - c = ::__mbcnt_lo(x, add); + #ifdef __HIP__ + c = ::__builtin_amdgcn_mbcnt_lo(static_cast(x), add); + c = ::__builtin_amdgcn_mbcnt_hi(static_cast(x >> 32), c); + #else + c = ::__mbcnt_lo(static_cast(x), add); + c = ::__mbcnt_hi(static_cast(x >> 32), c); + #endif #endif #else - #ifdef __HIP__ - c = ::__builtin_amdgcn_mbcnt_lo(static_cast(x), add); - c = ::__builtin_amdgcn_mbcnt_hi(static_cast(x >> 32), c); - #else - c = ::__mbcnt_lo(static_cast(x), add); - c = ::__mbcnt_hi(static_cast(x >> 32), c); - #endif + using namespace hip::detail; + const auto tidx{id(Fiber::this_fiber()) % warpSize}; + std::bitset bits{x >> (warpSize - tidx)}; + c = static_cast(bits.count()) + add; #endif return c; } @@ -71,13 +78,37 @@ namespace detail ROCPRIM_DEVICE inline int warp_any(int predicate) { +#ifndef __HIP_CPU_RT__ return ::__any(predicate); +#else + using namespace hip::detail; + const auto tidx{id(Fiber::this_fiber()) % warpSize}; + auto& lds{Tile::scratchpad, 1>()[0]}; + + lds[tidx] = static_cast(predicate); + + barrier(Tile::this_tile()); + + return lds.any(); +#endif } ROCPRIM_DEVICE inline int warp_all(int predicate) { +#ifndef __HIP_CPU_RT__ return ::__all(predicate); +#else + using namespace hip::detail; + const auto tidx{id(Fiber::this_fiber()) % warpSize}; + auto& lds{Tile::scratchpad, 1>()[0]}; + + lds[tidx] = static_cast(predicate); + + barrier(Tile::this_tile()); + + return lds.all(); +#endif } } // end detail namespace @@ -96,7 +127,7 @@ unsigned int MatchAny(unsigned int label) unsigned int retval; // Extract masks of common threads for each bit - #pragma unroll + ROCPRIM_UNROLL for (int BIT = 0; BIT < LABEL_BITS; ++BIT) { unsigned long long mask; diff --git a/rocprim/include/rocprim/intrinsics/warp_shuffle.hpp b/rocprim/include/rocprim/intrinsics/warp_shuffle.hpp index 5c8734297..1caac2900 100644 --- a/rocprim/include/rocprim/intrinsics/warp_shuffle.hpp +++ b/rocprim/include/rocprim/intrinsics/warp_shuffle.hpp @@ -34,6 +34,26 @@ BEGIN_ROCPRIM_NAMESPACE namespace detail { +#ifdef __HIP_CPU_RT__ +// Taken from the notes of https://en.cppreference.com/w/cpp/numeric/bit_cast +// +// TODO: consider adding macro checks relaying to std::bit_cast when compiled +// using C++20. 
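Regarding the TODO just above about relaying to std::bit_cast: under C++20 the library feature-test macro __cpp_lib_bit_cast can gate such a forwarder, keeping the memcpy emulation that follows as the pre-C++20 fallback. A sketch of one possible shape, not part of the patch:

#if __has_include(<version>)
    #include <version>              // makes library feature-test macros visible
#endif
#if defined(__cpp_lib_bit_cast)
    #include <bit>
    template<class To, class From>
    constexpr To bit_cast(const From& src) noexcept
    {
        return std::bit_cast<To>(src);  // compiler-supported, usable in constant expressions
    }
#else
    // ... fall back to the std::memcpy-based emulation defined below ...
#endif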
+template +typename std::enable_if_t< + sizeof(To) == sizeof(From) && + std::is_trivially_copyable_v && + std::is_trivially_copyable_v, + To> +// constexpr support needs compiler magic +bit_cast(const From& src) noexcept +{ + To dst; + std::memcpy(&dst, &src, sizeof(To)); + return dst; +} +#endif + template ROCPRIM_DEVICE inline typename std::enable_if::value && (sizeof(T) % sizeof(int) == 0), T>::type @@ -42,15 +62,23 @@ warp_shuffle_op(const T& input, ShuffleOp&& op) constexpr int words_no = (sizeof(T) + sizeof(int) - 1) / sizeof(int); struct V { int words[words_no]; }; +#ifdef __HIP_CPU_RT__ + V a = bit_cast(input); +#else V a = __builtin_bit_cast(V, input); +#endif - #pragma unroll + ROCPRIM_UNROLL for(int i = 0; i < words_no; i++) { a.words[i] = op(a.words[i]); } +#ifdef __HIP_CPU_RT__ + return bit_cast(a); +#else return __builtin_bit_cast(T, a); +#endif } template @@ -61,17 +89,26 @@ warp_shuffle_op(const T& input, ShuffleOp&& op) constexpr int words_no = (sizeof(T) + sizeof(int) - 1) / sizeof(int); T output; - #pragma unroll + ROCPRIM_UNROLL for(int i = 0; i < words_no; i++) { const size_t s = std::min(sizeof(int), sizeof(T) - i * sizeof(int)); int word; +#ifdef __HIP_CPU_RT__ + std::memcpy(&word, reinterpret_cast(&input) + i * sizeof(int), s); +#else __builtin_memcpy(&word, reinterpret_cast(&input) + i * sizeof(int), s); +#endif word = op(word); +#ifdef __HIP_CPU_RT__ + std::memcpy(reinterpret_cast(&output) + i * sizeof(int), &word, s); +#else __builtin_memcpy(reinterpret_cast(&output) + i * sizeof(int), &word, s); +#endif } return output; + } template @@ -82,7 +119,17 @@ T warp_move_dpp(const T& input) input, [=](int v) -> int { + // TODO: clean-up, this function activates based ROCPRIM_DETAIL_USE_DPP, however inclusion and + // parsing of the template happens unconditionally. The condition causing compilation to + // fail is ordinary host-compilers looking at the headers. Non-hipcc compilers don't define + // __builtin_amdgcn_update_dpp, hence fail to parse the template altogether. (Except MSVC + // because even using /permissive- they somehow still do delayed parsing of the body of + // function templates, even though they pinky-swear they don't.) 
+#if !defined(__HIP_CPU_RT__) return ::__builtin_amdgcn_update_dpp(0, v, dpp_ctrl, row_mask, bank_mask, bound_ctrl); +#else + return v; +#endif } ); } diff --git a/rocprim/include/rocprim/iterator.hpp b/rocprim/include/rocprim/iterator.hpp index 224f25ea0..41e359ae2 100644 --- a/rocprim/include/rocprim/iterator.hpp +++ b/rocprim/include/rocprim/iterator.hpp @@ -28,7 +28,9 @@ #include "iterator/constant_iterator.hpp" #include "iterator/counting_iterator.hpp" #include "iterator/discard_iterator.hpp" +#ifndef __HIP_CPU_RT__ #include "iterator/texture_cache_iterator.hpp" +#endif #include "iterator/transform_iterator.hpp" #include "iterator/zip_iterator.hpp" diff --git a/rocprim/include/rocprim/iterator/texture_cache_iterator.hpp b/rocprim/include/rocprim/iterator/texture_cache_iterator.hpp index 8e80b7adf..d01612dc7 100644 --- a/rocprim/include/rocprim/iterator/texture_cache_iterator.hpp +++ b/rocprim/include/rocprim/iterator/texture_cache_iterator.hpp @@ -208,7 +208,7 @@ class texture_cache_iterator #else texture_type words[multiple]; - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = 0; i < multiple; i++) { tex1Dfetch( diff --git a/rocprim/include/rocprim/thread/thread_load.hpp b/rocprim/include/rocprim/thread/thread_load.hpp index 46476ea9f..35994f999 100644 --- a/rocprim/include/rocprim/thread/thread_load.hpp +++ b/rocprim/include/rocprim/thread/thread_load.hpp @@ -87,6 +87,8 @@ ROCPRIM_DEVICE __forceinline__ T AsmThreadLoad(void * ptr) ROCPRIM_ASM_THREAD_LOAD(cache_modifier, llvm_cache_modifier, uint64_t, uint64_t, flat_load_dwordx2, v, wait_cmd); \ ROCPRIM_ASM_THREAD_LOAD(cache_modifier, llvm_cache_modifier, double, uint64_t, flat_load_dwordx2, v, wait_cmd); +// [HIP-CPU] MSVC: erroneous inline assembly specification (Triggers error C2059: syntax error: 'volatile') +#ifndef __HIP_CPU_RT__ ROCPRIM_ASM_THREAD_LOAD_GROUP(load_ca, "glc", ""); ROCPRIM_ASM_THREAD_LOAD_GROUP(load_cg, "glc slc", ""); ROCPRIM_ASM_THREAD_LOAD_GROUP(load_cv, "glc", "vmcnt"); @@ -95,6 +97,7 @@ ROCPRIM_ASM_THREAD_LOAD_GROUP(load_volatile, "glc", "vmcnt"); // TODO find correct modifiers to match these ROCPRIM_ASM_THREAD_LOAD_GROUP(load_ldg, "", ""); ROCPRIM_ASM_THREAD_LOAD_GROUP(load_cs, "", ""); +#endif // __HIP_CPU_RT__ #endif @@ -129,7 +132,13 @@ template < ROCPRIM_DEVICE inline T thread_load(T* ptr) { +#ifndef __HIP_CPU_RT__ return detail::AsmThreadLoad(ptr); +#else + T retval; + std::memcpy(&retval, ptr, sizeof(T)); + return retval; +#endif } END_ROCPRIM_NAMESPACE diff --git a/rocprim/include/rocprim/thread/thread_reduce.hpp b/rocprim/include/rocprim/thread/thread_reduce.hpp index 783722f9b..3ce9fdda0 100644 --- a/rocprim/include/rocprim/thread/thread_reduce.hpp +++ b/rocprim/include/rocprim/thread/thread_reduce.hpp @@ -60,7 +60,7 @@ ROCPRIM_DEVICE inline T thread_reduce( else retval = prefix; - #pragma unroll + ROCPRIM_UNROLL for (int i = 0 + NoPrefix; i < LENGTH; ++i) retval = reduction_op(retval, input[i]); diff --git a/rocprim/include/rocprim/thread/thread_scan.hpp b/rocprim/include/rocprim/thread/thread_scan.hpp index 98b0ea98e..8b52f9302 100644 --- a/rocprim/include/rocprim/thread/thread_scan.hpp +++ b/rocprim/include/rocprim/thread/thread_scan.hpp @@ -74,7 +74,7 @@ struct Int2Type ScanOp scan_op, ///< [in] Binary scan operator Int2Type /*length*/) { - #pragma unroll + ROCPRIM_UNROLL for (int i = 0; i < LENGTH; ++i) { inclusive = scan_op(exclusive, input[i]); @@ -166,7 +166,7 @@ struct Int2Type ScanOp scan_op, ///< [in] Binary scan operator Int2Type /*length*/) { - #pragma unroll + ROCPRIM_UNROLL for
(int i = 0; i < LENGTH; ++i) { inclusive = scan_op(inclusive, input[i]); diff --git a/rocprim/include/rocprim/thread/thread_store.hpp b/rocprim/include/rocprim/thread/thread_store.hpp index bce00e963..236ed825a 100644 --- a/rocprim/include/rocprim/thread/thread_store.hpp +++ b/rocprim/include/rocprim/thread/thread_store.hpp @@ -88,6 +88,8 @@ ROCPRIM_DEVICE __forceinline__ void AsmThreadStore(void * ptr, T val) ROCPRIM_ASM_THREAD_STORE(cache_modifier, llvm_cache_modifier, uint64_t, uint64_t, flat_store_dwordx2, v, wait_cmd); \ ROCPRIM_ASM_THREAD_STORE(cache_modifier, llvm_cache_modifier, double, uint64_t, flat_store_dwordx2, v, wait_cmd); +// [HIP-CPU] MSVC: erroneous inline assembly specification (Triggers error C2059: syntax error: 'volatile') +#ifndef __HIP_CPU_RT__ ROCPRIM_ASM_THREAD_STORE_GROUP(store_wb, "glc", ""); ROCPRIM_ASM_THREAD_STORE_GROUP(store_cg, "glc slc", ""); ROCPRIM_ASM_THREAD_STORE_GROUP(store_wt, "glc", "vmcnt"); @@ -95,6 +97,7 @@ ROCPRIM_ASM_THREAD_STORE_GROUP(store_volatile, "glc", "vmcnt"); // TODO find correct modifiers to match these ROCPRIM_ASM_THREAD_STORE_GROUP(store_cs, "", ""); +#endif // __HIP_CPU_RT__ #endif @@ -131,7 +134,11 @@ ROCPRIM_DEVICE inline void thread_store( T *ptr, T val) { +#ifndef __HIP_CPU_RT__ detail::AsmThreadStore(ptr, val); +#else + std::memcpy(ptr, &val, sizeof(T)); +#endif } END_ROCPRIM_NAMESPACE diff --git a/rocprim/include/rocprim/types.hpp b/rocprim/include/rocprim/types.hpp index c849a9089..59ff680d2 100644 --- a/rocprim/include/rocprim/types.hpp +++ b/rocprim/include/rocprim/types.hpp @@ -38,35 +38,62 @@ BEGIN_ROCPRIM_NAMESPACE namespace detail { - // Define vector types that will be used by rocPRIM internally. // We don't use HIP vector types because they don't generate correct // load/store operations, see https://github.com/RadeonOpenCompute/ROCm/issues/341 +#ifndef _MSC_VER #define DEFINE_VECTOR_TYPE(name, base) \ \ -struct name##2 \ +struct alignas(sizeof(base) * 2) name##2 \ { \ typedef base vector_value_type __attribute__((ext_vector_type(2))); \ union { \ vector_value_type data; \ struct { base x, y; }; \ }; \ -} __attribute__((aligned(sizeof(base) * 2))); \ +}; \ \ -struct name##4 \ +struct alignas(sizeof(base) * 4) name##4 \ { \ typedef base vector_value_type __attribute__((ext_vector_type(4))); \ union { \ vector_value_type data; \ struct { base x, y, w, z; }; \ }; \ -} __attribute__((aligned(sizeof(base) * 4))); +}; +#else +#define DEFINE_VECTOR_TYPE(name, base) \ +\ +struct alignas(sizeof(base) * 2) name##2 \ +{ \ + typedef base vector_value_type; \ + union { \ + vector_value_type data; \ + struct { base x, y; }; \ + }; \ +}; \ +\ +struct alignas(sizeof(base) * 4) name##4 \ +{ \ + typedef base vector_value_type; \ + union { \ + vector_value_type data; \ + struct { base x, y, w, z; }; \ + }; \ +}; +#endif +#ifdef _MSC_VER +#pragma warning( push ) +#pragma warning( disable : 4201 ) // nonstandard extension used: nameless struct/union +#endif DEFINE_VECTOR_TYPE(char, char); DEFINE_VECTOR_TYPE(short, short); DEFINE_VECTOR_TYPE(int, int); DEFINE_VECTOR_TYPE(longlong, long long); - +#ifdef _MSC_VER +#pragma warning( pop ) +#endif // Takes a scalar type T and matches to a vector type based on NumElements. template struct make_vector_type @@ -104,21 +131,36 @@ DEFINE_MAKE_VECTOR_TYPE(longlong, long long); /// \brief Empty type used as a placeholder, usually used to flag that given /// template parameter should not be used.
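The hunk below tightens empty_type to a plain empty struct and introduces empty_binary_op, a do-nothing functor the HIP-CPU code paths can substitute where a binary operator is syntactically required but never invoked with meaningful data. A minimal usage sketch, assuming rocprim/types.hpp has been included; the function name is illustrative only:

#include <type_traits>

// rocprim::empty_type carries no state; rocprim::empty_binary_op combines two of
// them into yet another empty_type, so it can stand in for an unused reduction op.
void empty_binary_op_sketch()
{
    rocprim::empty_type a{}, b{};
    rocprim::empty_binary_op combine;
    rocprim::empty_type c = combine(a, b);
    (void)c;
    static_assert(std::is_empty<rocprim::empty_type>::value, "placeholder type has no members");
}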
-struct empty_type -{ +struct empty_type {}; +/// \brief Binary operator that takes two instances of empty_type, usually used +/// as nop replacement for the HIP-CPU back-end +struct empty_binary_op +{ + constexpr empty_type operator()(const empty_type&, const empty_type&) const { return empty_type{}; } }; /// \brief Half-precision floating point type using half = ::__half; // The lane_mask_type only exist at device side +#ifndef __AMDGCN_WAVEFRONT_SIZE +// When not compiling with hipcc, we're compiling with HIP-CPU +// TODO: introduce a ROCPRIM-specific macro to query this +#define __AMDGCN_WAVEFRONT_SIZE 64 +#endif #if __AMDGCN_WAVEFRONT_SIZE == 32 using lane_mask_type = unsigned int; #elif __AMDGCN_WAVEFRONT_SIZE == 64 using lane_mask_type = unsigned long long int; #endif +#ifdef __HIP_CPU_RT__ +using native_half = half; +#else +using native_half = _Float16; +#endif + END_ROCPRIM_NAMESPACE /// @} diff --git a/rocprim/include/rocprim/warp/detail/warp_reduce_shared_mem.hpp b/rocprim/include/rocprim/warp/detail/warp_reduce_shared_mem.hpp index d82247179..43bcf036e 100644 --- a/rocprim/include/rocprim/warp/detail/warp_reduce_shared_mem.hpp +++ b/rocprim/include/rocprim/warp/detail/warp_reduce_shared_mem.hpp @@ -60,7 +60,7 @@ class warp_reduce_shared_mem output = input; store_volatile(&storage_.values[lid], output); - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = ceiling >> 1; i > 0; i >>= 1) { if (lid + i < WarpSize && lid < i) @@ -85,7 +85,7 @@ class warp_reduce_shared_mem output = input; store_volatile(&storage_.values[lid], output); - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = ceiling >> 1; i > 0; i >>= 1) { if((lid + i) < WarpSize && lid < i && (lid + i) < valid_items) @@ -128,7 +128,7 @@ class warp_reduce_shared_mem auto last = last_in_warp_segment(flag); output = input; - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = 1; i < ceiling; i *= 2) { store_volatile(&storage_.values[lid], output); diff --git a/rocprim/include/rocprim/warp/detail/warp_reduce_shuffle.hpp b/rocprim/include/rocprim/warp/detail/warp_reduce_shuffle.hpp index e010c414f..6fcb42392 100644 --- a/rocprim/include/rocprim/warp/detail/warp_reduce_shuffle.hpp +++ b/rocprim/include/rocprim/warp/detail/warp_reduce_shuffle.hpp @@ -54,7 +54,7 @@ class warp_reduce_shuffle output = input; T value; - #pragma unroll + ROCPRIM_UNROLL for(unsigned int offset = 1; offset < WarpSize; offset *= 2) { value = warp_shuffle_down(output, offset, WarpSize); @@ -78,7 +78,7 @@ class warp_reduce_shuffle output = input; T value; - #pragma unroll + ROCPRIM_UNROLL for(unsigned int offset = 1; offset < WarpSize; offset *= 2) { value = warp_shuffle_down(output, offset, WarpSize); diff --git a/rocprim/include/rocprim/warp/detail/warp_scan_shuffle.hpp b/rocprim/include/rocprim/warp/detail/warp_scan_shuffle.hpp index d52e0893b..dc1a9b929 100644 --- a/rocprim/include/rocprim/warp/detail/warp_scan_shuffle.hpp +++ b/rocprim/include/rocprim/warp/detail/warp_scan_shuffle.hpp @@ -53,7 +53,7 @@ class warp_scan_shuffle T value; const unsigned int id = detail::logical_lane_id(); - #pragma unroll + ROCPRIM_UNROLL for(unsigned int offset = 1; offset < WarpSize; offset *= 2) { value = warp_shuffle_up(output, offset, WarpSize); diff --git a/rocprim/include/rocprim/warp/detail/warp_segment_bounds.hpp b/rocprim/include/rocprim/warp/detail/warp_segment_bounds.hpp index 5c07c2fe3..ada1a820b 100644 --- a/rocprim/include/rocprim/warp/detail/warp_segment_bounds.hpp +++ b/rocprim/include/rocprim/warp/detail/warp_segment_bounds.hpp @@ -53,21 +53,24 @@ 
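In the warp_segment_bounds.hpp hunk that follows, the device __ffs/__ffsll call (one-based index of the lowest set bit, hence the minus one) gains host fallbacks for HIP-CPU, and the MSVC branch is explicitly marked "TODO: verify correctness". For reference, a count-trailing-zeros formulation that matches __ffsll(x) - 1 for the non-zero masks this function sees (the caller always sets the last-lane bit first); names and structure are illustrative, not a drop-in patch:

#include <cstdint>
#ifdef _MSC_VER
#include <intrin.h>
#endif

// Index of the least significant set bit; x must be non-zero.
inline unsigned int lowest_set_bit_index(std::uint64_t x)
{
#ifdef _MSC_VER
    unsigned long index = 0;
    _BitScanForward64(&index, x);          // scans from the least significant bit
    return static_cast<unsigned int>(index);
#else
    return static_cast<unsigned int>(__builtin_ctzll(x));  // GCC/Clang count-trailing-zeros
#endif
}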
auto last_in_warp_segment(Flag flag) // Make sure last item in logical warp is marked as a tail warp_flags |= lane_mask_type(1) << (WarpSize - 1U); // Calculate logical lane id of the last valid value in the segment +#ifndef __HIP_CPU_RT__ #if __AMDGCN_WAVEFRONT_SIZE == 32 return ::__ffs(warp_flags) - 1; #else return ::__ffsll(warp_flags) - 1; #endif -} - -// Returns logical warp id of the last thread in thread's segment -template -ROCPRIM_DEVICE inline -auto last_in_warp_segment(Flag) - -> typename std::enable_if<(WarpSize > __AMDGCN_WAVEFRONT_SIZE), unsigned int>::type -{ - ROCPRIM_PRINT_ERROR_ONCE("Specified warp size exceeds current hardware supported warp size . Aborting warp sort."); - return 0; +#else +#ifdef _MSC_VER + // _BitScanForward64 yields the zero-based index of the least significant set bit, matching __ffsll(warp_flags) - 1 on the device path + unsigned long tmp = 0; + _BitScanForward64(&tmp, warp_flags); + return tmp; +#elif defined(__GNUC__) + return __builtin_ctzll(warp_flags); +#else + static_assert(false, "No bit-scan intrinsic is available for this host compiler"); +#endif +#endif } } // end namespace detail diff --git a/rocprim/include/rocprim/warp/warp_reduce.hpp b/rocprim/include/rocprim/warp/warp_reduce.hpp index 1f33f6ae5..3bfd05485 100644 --- a/rocprim/include/rocprim/warp/warp_reduce.hpp +++ b/rocprim/include/rocprim/warp/warp_reduce.hpp @@ -94,7 +94,7 @@ struct select_warp_reduce_impl /// // allocate storage in shared memory /// __shared__ warp_reduce_int::storage_type temp[4]; /// -/// int logical_warp_id = hipThreadIdx_x/16; +/// int logical_warp_id = threadIdx.x/16; /// int value = ...; /// // execute reduce /// warp_reduce_int().reduce( @@ -163,7 +163,7 @@ class warp_reduce /// // allocate storage in shared memory /// __shared__ warp_reduce_int::storage_type temp[4]; /// - /// int logical_warp_id = hipThreadIdx_x/16; + /// int logical_warp_id = threadIdx.x/16; /// int value = ...; /// // execute reduction /// warp_reduce_int().reduce( @@ -234,7 +234,7 @@ class warp_reduce /// // allocate storage in shared memory /// __shared__ warp_reduce_int::storage_type temp[4]; /// - /// int logical_warp_id = hipThreadIdx_x/16; + /// int logical_warp_id = threadIdx.x/16; /// int value = ...; /// int valid_items = 4; /// // execute reduction diff --git a/rocprim/include/rocprim/warp/warp_scan.hpp b/rocprim/include/rocprim/warp/warp_scan.hpp index 22fb28eb2..684b324c9 100644 --- a/rocprim/include/rocprim/warp/warp_scan.hpp +++ b/rocprim/include/rocprim/warp/warp_scan.hpp @@ -92,7 +92,7 @@ struct select_warp_scan_impl /// // allocate storage in shared memory /// __shared__ warp_scan_int::storage_type temp[4]; /// -/// int logical_warp_id = hipThreadIdx_x/16; +/// int logical_warp_id = threadIdx.x/16; /// int value = ...; /// // execute inclusive scan /// warp_scan_int().inclusive_scan( @@ -153,14 +153,14 @@ class warp_scan /// Hardware warp size is 64. Block (tile) size is 256. /// /// \code{.cpp} - /// __global__ void example_kernel(...) // hipBlockDim_x = 256 + /// __global__ void example_kernel(...) // blockDim.x = 256 /// { /// // specialize warp_scan for float and logical warp of 32 threads /// using warp_scan_float = rocprim::warp_scan<float, 32>; /// // allocate storage in shared memory /// __shared__ warp_scan_float::storage_type temp[8]; // 256/32 = 8 /// - /// int logical_warp_id = hipThreadIdx_x/32; + /// int logical_warp_id = threadIdx.x/32; /// float value = ...; /// // execute inclusive min scan /// warp_scan_float().inclusive_scan( @@ -227,14 +227,14 @@ class warp_scan /// each thread provides one \p int value. Hardware warp size is 64. Block (tile) size is 256.
/// /// \code{.cpp} - /// __global__ void example_kernel(...) // hipBlockDim_x = 256 + /// __global__ void example_kernel(...) // blockDim.x = 256 /// { /// // specialize warp_scan for int and logical warp of 64 threads /// using warp_scan_int = rocprim::warp_scan; /// // allocate storage in shared memory /// __shared__ warp_scan_int::storage_type temp[4]; // 256/64 = 4 /// - /// int logical_warp_id = hipThreadIdx_x/64; + /// int logical_warp_id = threadIdx.x/64; /// int input = ...; /// int output, reduction; /// // inclusive prefix sum @@ -306,14 +306,14 @@ class warp_scan /// Hardware warp size is 64. Block (tile) size is 256. /// /// \code{.cpp} - /// __global__ void example_kernel(...) // hipBlockDim_x = 256 + /// __global__ void example_kernel(...) // blockDim.x = 256 /// { /// // specialize warp_scan for float and logical warp of 32 threads /// using warp_scan_f = rocprim::warp_scan; /// // allocate storage in shared memory /// __shared__ warp_scan_float::storage_type temp[8]; // 256/32 = 8 /// - /// int logical_warp_id = hipThreadIdx_x/32; + /// int logical_warp_id = threadIdx.x/32; /// float value = ...; /// // execute exclusive min scan /// warp_scan_float().exclusive_scan( @@ -387,14 +387,14 @@ class warp_scan /// each thread provides one \p int value. Hardware warp size is 64. Block (tile) size is 256. /// /// \code{.cpp} - /// __global__ void example_kernel(...) // hipBlockDim_x = 256 + /// __global__ void example_kernel(...) // blockDim.x = 256 /// { /// // specialize warp_scan for int and logical warp of 64 threads /// using warp_scan_int = rocprim::warp_scan; /// // allocate storage in shared memory /// __shared__ warp_scan_int::storage_type temp[4]; // 256/64 = 4 /// - /// int logical_warp_id = hipThreadIdx_x/64; + /// int logical_warp_id = threadIdx.x/64; /// int input = ...; /// int output, reduction; /// // exclusive prefix sum @@ -471,14 +471,14 @@ class warp_scan /// Hardware warp size is 64. Block (tile) size is 256. /// /// \code{.cpp} - /// __global__ void example_kernel(...) // hipBlockDim_x = 256 + /// __global__ void example_kernel(...) // blockDim.x = 256 /// { /// // specialize warp_scan for float and logical warp of 32 threads /// using warp_scan_f = rocprim::warp_scan; /// // allocate storage in shared memory /// __shared__ warp_scan_float::storage_type temp[8]; // 256/32 = 8 /// - /// int logical_warp_id = hipThreadIdx_x/32; + /// int logical_warp_id = threadIdx.x/32; /// float input = ...; /// float ex_output, in_output; /// // execute exclusive min scan @@ -561,14 +561,14 @@ class warp_scan /// Block (tile) size is 256. /// /// \code{.cpp} - /// __global__ void example_kernel(...) // hipBlockDim_x = 256 + /// __global__ void example_kernel(...) // blockDim.x = 256 /// { /// // specialize warp_scan for int and logical warp of 64 threads /// using warp_scan_int = rocprim::warp_scan; /// // allocate storage in shared memory /// __shared__ warp_scan_int::storage_type temp[4]; // 256/64 = 4 /// - /// int logical_warp_id = hipThreadIdx_x/64; + /// int logical_warp_id = threadIdx.x/64; /// int input = ...; /// int in_output, ex_output, reduction; /// // inclusive and exclusive prefix sum diff --git a/rocprim/include/rocprim/warp/warp_sort.hpp b/rocprim/include/rocprim/warp/warp_sort.hpp index 80762f472..cc8ed32df 100644 --- a/rocprim/include/rocprim/warp/warp_sort.hpp +++ b/rocprim/include/rocprim/warp/warp_sort.hpp @@ -64,7 +64,7 @@ BEGIN_ROCPRIM_NAMESPACE /// \code{.cpp} /// __global__ void example_kernel(...) 
/// { -/// const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; +/// const unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; /// /// int value = input[i]; /// rocprim::warp_sort wsort; @@ -82,7 +82,7 @@ BEGIN_ROCPRIM_NAMESPACE /// ... /// __global__ void example_kernel(...) /// { -/// const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; +/// const unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; /// /// int value = input[i]; /// rocprim::warp_sort wsort; diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 073456d2d..497ae8a6c 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -32,23 +32,48 @@ function(add_hip_test TEST_NAME TEST_SOURCES) target_include_directories(${TEST_TARGET} SYSTEM BEFORE PUBLIC - ${GTEST_INCLUDE_DIRS} ${COMMON_TEST_HEADER_DIRECTORY} ) - target_link_libraries(${TEST_TARGET} PRIVATE hip::device) - - target_link_libraries(${TEST_TARGET} - PRIVATE - ${GTEST_BOTH_LIBRARIES} - ) - - foreach(amdgpu_target ${AMDGPU_TARGETS}) + if(TARGET GTest::GTest) + target_link_libraries(${TEST_TARGET} + PRIVATE + GTest::GTest + GTest::Main + ) + else() + target_link_libraries(${TEST_TARGET} + PRIVATE + GTest::gtest + GTest::gtest_main + ) + endif() + if(NOT USE_HIP_CPU) + target_link_libraries(${TEST_TARGET} + PRIVATE + rocprim_hip + ) + else() target_link_libraries(${TEST_TARGET} PRIVATE - --amdgpu-target=${amdgpu_target} + rocprim + Threads::Threads + hip_cpu_rt::hip_cpu_rt ) - endforeach() + if(STL_DEPENDS_ON_TBB) + target_link_libraries(${TEST_TARGET} + PRIVATE + TBB::tbb + ) + endif() + endif() + + target_compile_options(${TEST_TARGET} + PRIVATE + $<$<CXX_COMPILER_ID:MSVC>: + /bigobj # number of sections exceeded object file format limit: compile with /bigobj + > + ) set_target_properties(${TEST_TARGET} PROPERTIES diff --git a/test/common_test_header.hpp b/test/common_test_header.hpp index 3c1505c78..6f71730bc 100755 --- a/test/common_test_header.hpp +++ b/test/common_test_header.hpp @@ -39,7 +39,9 @@ // HIP API #include #include +#ifndef __HIP_CPU_RT__ #include +#endif #ifndef HIP_CHECK #define HIP_CHECK(condition) \ @@ -61,16 +63,30 @@ namespace test_common_utils int obtain_device_from_ctest() { +#ifdef _MSC_VER +#pragma warning( push ) +#pragma warning( disable : 4996 ) // This function or variable may be unsafe. Consider using _dupenv_s instead. +#endif static const std::string rg0 = "CTEST_RESOURCE_GROUP_0"; if (std::getenv(rg0.c_str()) != nullptr) { std::string amdgpu_target = std::getenv(rg0.c_str()); - std::transform(amdgpu_target.cbegin(), amdgpu_target.cend(), amdgpu_target.begin(), ::toupper); + std::transform( + amdgpu_target.cbegin(), + amdgpu_target.cend(), + amdgpu_target.begin(), + // Passing std::toupper directly causes implicit int-to-char truncating conversions, which trigger warnings.
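+            // Taking the argument as unsigned char and casting the result back to char is the usual fix: it avoids the truncation warning and the undefined behaviour std::toupper has for char values that are not representable as unsigned char.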
+ // See: https://en.cppreference.com/mwiki/index.php?title=cpp/string/byte/toupper&oldid=94327 + [](unsigned char c){ return static_cast<char>(std::toupper(c)); } + ); std::string reqs = std::getenv((rg0 + "_" + amdgpu_target).c_str()); return std::atoi(reqs.substr(reqs.find(':') + 1, reqs.find(',') - (reqs.find(':') + 1)).c_str()); } else return 0; +#ifdef _MSC_VER +#pragma warning( pop ) +#endif } bool use_hmm() @@ -93,11 +109,11 @@ hipError_t hipMallocHelper(T** devPtr, size_t size) { if (use_hmm()) { - return hipMallocManaged((void**)devPtr, size); + return hipMallocManaged(reinterpret_cast<void**>(devPtr), size); } else { - return hipMalloc((void**)devPtr, size); + return hipMalloc(reinterpret_cast<void**>(devPtr), size); } return hipSuccess; } diff --git a/test/extra/test_rocprim_package.cpp b/test/extra/test_rocprim_package.cpp index 911c2f658..afa9f4ae5 100644 --- a/test/extra/test_rocprim_package.cpp +++ b/test/extra/test_rocprim_package.cpp @@ -47,8 +47,8 @@ int main(int, char**) // device input/output T * d_input; T * d_output; - HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T))); - HIP_CHECK(hipMalloc(&d_output, sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&d_input), input.size() * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&d_output), sizeof(T))); HIP_CHECK( hipMemcpy( d_input, input.data(), diff --git a/test/rocprim/CMakeLists.txt b/test/rocprim/CMakeLists.txt index c9993d851..22f1f94a3 100644 --- a/test/rocprim/CMakeLists.txt +++ b/test/rocprim/CMakeLists.txt @@ -23,23 +23,54 @@ function(add_rocprim_test TEST_NAME TEST_SOURCES) list(GET TEST_SOURCES 0 TEST_MAIN_SOURCE) get_filename_component(TEST_TARGET ${TEST_MAIN_SOURCE} NAME_WE) + add_executable(${TEST_TARGET} ${TEST_SOURCES}) + target_include_directories(${TEST_TARGET} SYSTEM BEFORE PUBLIC - ${GTEST_INCLUDE_DIRS} ${COMMON_TEST_HEADER_DIRECTORY} ) - target_link_libraries(${TEST_TARGET} - PRIVATE - rocprim_hip - ${GTEST_BOTH_LIBRARIES} - ) - foreach(amdgpu_target ${AMDGPU_TARGETS}) + + if(TARGET GTest::GTest) + target_link_libraries(${TEST_TARGET} + PRIVATE + GTest::GTest + GTest::Main + ) + else() target_link_libraries(${TEST_TARGET} PRIVATE - --amdgpu-target=${amdgpu_target} + GTest::gtest + GTest::gtest_main ) - endforeach() + endif() + if(NOT USE_HIP_CPU) + target_link_libraries(${TEST_TARGET} + PRIVATE + rocprim_hip + ) + else() + target_link_libraries(${TEST_TARGET} + PRIVATE + rocprim + Threads::Threads + hip_cpu_rt::hip_cpu_rt + ) + if(STL_DEPENDS_ON_TBB) + target_link_libraries(${TEST_TARGET} + PRIVATE + TBB::tbb + ) + endif() + endif() + + target_compile_options(${TEST_TARGET} + PRIVATE + $<$<CXX_COMPILER_ID:MSVC>: + /bigobj # number of sections exceeded object file format limit: compile with /bigobj + > + ) + set_target_properties(${TEST_TARGET} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/test/rocprim" @@ -98,7 +129,9 @@ add_rocprim_test("rocprim.device_segmented_scan" test_device_segmented_scan.cpp) add_rocprim_test("rocprim.device_select" test_device_select.cpp) add_rocprim_test("rocprim.device_transform" test_device_transform.cpp) add_rocprim_test("rocprim.discard_iterator" test_discard_iterator.cpp) -add_rocprim_test("rocprim.texture_cache_iterator" test_texture_cache_iterator.cpp) +if(NOT USE_HIP_CPU) + add_rocprim_test("rocprim.texture_cache_iterator" test_texture_cache_iterator.cpp) +endif() add_rocprim_test("rocprim.thread" test_thread.cpp) add_rocprim_test("rocprim.thread_algos" test_thread_algos.cpp) add_rocprim_test("rocprim.transform_iterator" test_transform_iterator.cpp) diff --git
a/test/rocprim/bounds_checking_iterator.hpp b/test/rocprim/bounds_checking_iterator.hpp index 8fb4240d8..9433517e8 100644 --- a/test/rocprim/bounds_checking_iterator.hpp +++ b/test/rocprim/bounds_checking_iterator.hpp @@ -158,7 +158,7 @@ class out_of_bounds_flag public: out_of_bounds_flag() { - hipMalloc(&device_pointer_, sizeof(bool)); + hipMalloc(reinterpret_cast(&device_pointer_), sizeof(bool)); hipMemset(device_pointer_, 0, sizeof(bool)); } diff --git a/test/rocprim/detail/get_rocprim_version.cpp b/test/rocprim/detail/get_rocprim_version.cpp index 1d2078c05..bd3a11c27 100644 --- a/test/rocprim/detail/get_rocprim_version.cpp +++ b/test/rocprim/detail/get_rocprim_version.cpp @@ -34,7 +34,7 @@ unsigned int get_rocprim_version_on_device() unsigned int version = 0; unsigned int * d_version; - HIP_CHECK(hipMalloc(&d_version, sizeof(unsigned int))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_version), sizeof(unsigned int))); HIP_CHECK(hipDeviceSynchronize()); hipLaunchKernelGGL( diff --git a/test/rocprim/test_block_adjacent_difference.cpp b/test/rocprim/test_block_adjacent_difference.cpp index e004442e5..2f0564d58 100644 --- a/test/rocprim/test_block_adjacent_difference.cpp +++ b/test/rocprim/test_block_adjacent_difference.cpp @@ -111,9 +111,9 @@ __global__ __launch_bounds__(BlockSize, ROCPRIM_DEFAULT_MIN_WARPS_PER_EU) void flag_heads_kernel(Type* device_input, long long* device_heads) { - const unsigned int lid = hipThreadIdx_x; + const unsigned int lid = threadIdx.x; const unsigned int items_per_block = BlockSize * ItemsPerThread; - const unsigned int block_offset = hipBlockIdx_x * items_per_block; + const unsigned int block_offset = blockIdx.x * items_per_block; Type input[ItemsPerThread]; rocprim::block_load_direct_blocked(lid, device_input + block_offset, input); @@ -121,7 +121,7 @@ void flag_heads_kernel(Type* device_input, long long* device_heads) rocprim::block_adjacent_difference bdiscontinuity; FlagType head_flags[ItemsPerThread]; - if(hipBlockIdx_x % 2 == 1) + if(blockIdx.x % 2 == 1) { const Type tile_predecessor_item = device_input[block_offset - 1]; bdiscontinuity.flag_heads(head_flags, tile_predecessor_item, input, FlagOpType()); @@ -145,9 +145,9 @@ __global__ __launch_bounds__(BlockSize, ROCPRIM_DEFAULT_MIN_WARPS_PER_EU) void flag_tails_kernel(Type* device_input, long long* device_tails) { - const unsigned int lid = hipThreadIdx_x; + const unsigned int lid = threadIdx.x; const unsigned int items_per_block = BlockSize * ItemsPerThread; - const unsigned int block_offset = hipBlockIdx_x * items_per_block; + const unsigned int block_offset = blockIdx.x * items_per_block; Type input[ItemsPerThread]; rocprim::block_load_direct_blocked(lid, device_input + block_offset, input); @@ -155,7 +155,7 @@ void flag_tails_kernel(Type* device_input, long long* device_tails) rocprim::block_adjacent_difference bdiscontinuity; FlagType tail_flags[ItemsPerThread]; - if(hipBlockIdx_x % 2 == 0) + if(blockIdx.x % 2 == 0) { const Type tile_successor_item = device_input[block_offset + items_per_block]; bdiscontinuity.flag_tails(tail_flags, tile_successor_item, input, FlagOpType()); @@ -179,9 +179,9 @@ __global__ __launch_bounds__(BlockSize, ROCPRIM_DEFAULT_MIN_WARPS_PER_EU) void flag_heads_and_tails_kernel(Type* device_input, long long* device_heads, long long* device_tails) { - const unsigned int lid = hipThreadIdx_x; + const unsigned int lid = threadIdx.x; const unsigned int items_per_block = BlockSize * ItemsPerThread; - const unsigned int block_offset = hipBlockIdx_x * items_per_block; + const 
unsigned int block_offset = blockIdx.x * items_per_block; Type input[ItemsPerThread]; rocprim::block_load_direct_blocked(lid, device_input + block_offset, input); @@ -190,23 +190,23 @@ void flag_heads_and_tails_kernel(Type* device_input, long long* device_heads, lo FlagType head_flags[ItemsPerThread]; FlagType tail_flags[ItemsPerThread]; - if(hipBlockIdx_x % 4 == 0) + if(blockIdx.x % 4 == 0) { const Type tile_successor_item = device_input[block_offset + items_per_block]; bdiscontinuity.flag_heads_and_tails(head_flags, tail_flags, tile_successor_item, input, FlagOpType()); } - else if(hipBlockIdx_x % 4 == 1) + else if(blockIdx.x % 4 == 1) { const Type tile_predecessor_item = device_input[block_offset - 1]; const Type tile_successor_item = device_input[block_offset + items_per_block]; bdiscontinuity.flag_heads_and_tails(head_flags, tile_predecessor_item, tail_flags, tile_successor_item, input, FlagOpType()); } - else if(hipBlockIdx_x % 4 == 2) + else if(blockIdx.x % 4 == 2) { const Type tile_predecessor_item = device_input[block_offset - 1]; bdiscontinuity.flag_heads_and_tails(head_flags, tile_predecessor_item, tail_flags, input, FlagOpType()); } - else if(hipBlockIdx_x % 4 == 3) + else if(blockIdx.x % 4 == 3) { bdiscontinuity.flag_heads_and_tails(head_flags, tail_flags, input, FlagOpType()); } @@ -236,11 +236,11 @@ auto test_block_adjacent_difference() >::type; using flag_type = FlagType; using flag_op_type = FlagOpType; - constexpr size_t block_size = BlockSize; - constexpr size_t items_per_thread = ItemsPerThread; - constexpr size_t items_per_block = block_size * items_per_thread; - const size_t size = items_per_block * 20; - constexpr size_t grid_size = size / items_per_block; + static constexpr size_t block_size = BlockSize; + static constexpr size_t items_per_thread = ItemsPerThread; + static constexpr size_t items_per_block = block_size * items_per_thread; + static constexpr size_t size = items_per_block * 20; + static constexpr size_t grid_size = size / items_per_block; // Given block size not supported if(block_size > test_utils::get_max_block_size()) @@ -280,9 +280,9 @@ auto test_block_adjacent_difference() // Preparing Device type* device_input; - HIP_CHECK(hipMalloc(&device_input, input.size() * sizeof(typename decltype(input)::value_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&device_input), input.size() * sizeof(typename decltype(input)::value_type))); long long* device_heads; - HIP_CHECK(hipMalloc(&device_heads, heads.size() * sizeof(typename decltype(heads)::value_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&device_heads), heads.size() * sizeof(typename decltype(heads)::value_type))); HIP_CHECK( hipMemcpy( @@ -348,11 +348,11 @@ auto test_block_adjacent_difference() >::type; using flag_type = FlagType; using flag_op_type = FlagOpType; - constexpr size_t block_size = BlockSize; - constexpr size_t items_per_thread = ItemsPerThread; - constexpr size_t items_per_block = block_size * items_per_thread; - const size_t size = items_per_block * 20; - constexpr size_t grid_size = size / items_per_block; + static constexpr size_t block_size = BlockSize; + static constexpr size_t items_per_thread = ItemsPerThread; + static constexpr size_t items_per_block = block_size * items_per_thread; + static constexpr size_t size = items_per_block * 20; + static constexpr size_t grid_size = size / items_per_block; // Given block size not supported if(block_size > test_utils::get_max_block_size()) @@ -392,9 +392,9 @@ auto test_block_adjacent_difference() // Preparing Device type* device_input; 
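+ // hipMalloc's portable signature takes void**, so the explicit reinterpret_cast below keeps the call valid for back-ends (such as HIP-CPU) that may not provide a templated T** convenience overload.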
- HIP_CHECK(hipMalloc(&device_input, input.size() * sizeof(typename decltype(input)::value_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&device_input), input.size() * sizeof(typename decltype(input)::value_type))); long long* device_tails; - HIP_CHECK(hipMalloc(&device_tails, tails.size() * sizeof(typename decltype(tails)::value_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&device_tails), tails.size() * sizeof(typename decltype(tails)::value_type))); HIP_CHECK( hipMemcpy( @@ -460,11 +460,11 @@ auto test_block_adjacent_difference() >::type; using flag_type = FlagType; using flag_op_type = FlagOpType; - constexpr size_t block_size = BlockSize; - constexpr size_t items_per_thread = ItemsPerThread; - constexpr size_t items_per_block = block_size * items_per_thread; - const size_t size = items_per_block * 20; - constexpr size_t grid_size = size / items_per_block; + static constexpr size_t block_size = BlockSize; + static constexpr size_t items_per_thread = ItemsPerThread; + static constexpr size_t items_per_block = block_size * items_per_thread; + static constexpr size_t size = items_per_block * 20; + static constexpr size_t grid_size = size / items_per_block; // Given block size not supported if(block_size > test_utils::get_max_block_size()) @@ -516,11 +516,11 @@ auto test_block_adjacent_difference() // Preparing Device type* device_input; - HIP_CHECK(hipMalloc(&device_input, input.size() * sizeof(typename decltype(input)::value_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&device_input), input.size() * sizeof(typename decltype(input)::value_type))); long long* device_heads; - HIP_CHECK(hipMalloc(&device_heads, tails.size() * sizeof(typename decltype(heads)::value_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&device_heads), tails.size() * sizeof(typename decltype(heads)::value_type))); long long* device_tails; - HIP_CHECK(hipMalloc(&device_tails, tails.size() * sizeof(typename decltype(tails)::value_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&device_tails), tails.size() * sizeof(typename decltype(tails)::value_type))); HIP_CHECK( hipMemcpy( diff --git a/test/rocprim/test_block_discontinuity.cpp b/test/rocprim/test_block_discontinuity.cpp index 825d1dde6..ef9ab915d 100644 --- a/test/rocprim/test_block_discontinuity.cpp +++ b/test/rocprim/test_block_discontinuity.cpp @@ -111,9 +111,9 @@ __global__ __launch_bounds__(BlockSize) void flag_heads_kernel(Type* device_input, long long* device_heads) { - const unsigned int lid = hipThreadIdx_x; + const unsigned int lid = threadIdx.x; const unsigned int items_per_block = BlockSize * ItemsPerThread; - const unsigned int block_offset = hipBlockIdx_x * items_per_block; + const unsigned int block_offset = blockIdx.x * items_per_block; Type input[ItemsPerThread]; rocprim::block_load_direct_blocked(lid, device_input + block_offset, input); @@ -121,7 +121,7 @@ void flag_heads_kernel(Type* device_input, long long* device_heads) rocprim::block_discontinuity bdiscontinuity; FlagType head_flags[ItemsPerThread]; - if(hipBlockIdx_x % 2 == 1) + if(blockIdx.x % 2 == 1) { const Type tile_predecessor_item = device_input[block_offset - 1]; bdiscontinuity.flag_heads(head_flags, tile_predecessor_item, input, FlagOpType()); @@ -145,9 +145,9 @@ __global__ __launch_bounds__(BlockSize) void flag_tails_kernel(Type* device_input, long long* device_tails) { - const unsigned int lid = hipThreadIdx_x; + const unsigned int lid = threadIdx.x; const unsigned int items_per_block = BlockSize * ItemsPerThread; - const unsigned int block_offset = hipBlockIdx_x * 
items_per_block; + const unsigned int block_offset = blockIdx.x * items_per_block; Type input[ItemsPerThread]; rocprim::block_load_direct_blocked(lid, device_input + block_offset, input); @@ -155,7 +155,7 @@ void flag_tails_kernel(Type* device_input, long long* device_tails) rocprim::block_discontinuity bdiscontinuity; FlagType tail_flags[ItemsPerThread]; - if(hipBlockIdx_x % 2 == 0) + if(blockIdx.x % 2 == 0) { const Type tile_successor_item = device_input[block_offset + items_per_block]; bdiscontinuity.flag_tails(tail_flags, tile_successor_item, input, FlagOpType()); @@ -179,9 +179,9 @@ __global__ __launch_bounds__(BlockSize) void flag_heads_and_tails_kernel(Type* device_input, long long* device_heads, long long* device_tails) { - const unsigned int lid = hipThreadIdx_x; + const unsigned int lid = threadIdx.x; const unsigned int items_per_block = BlockSize * ItemsPerThread; - const unsigned int block_offset = hipBlockIdx_x * items_per_block; + const unsigned int block_offset = blockIdx.x * items_per_block; Type input[ItemsPerThread]; rocprim::block_load_direct_blocked(lid, device_input + block_offset, input); @@ -190,23 +190,23 @@ void flag_heads_and_tails_kernel(Type* device_input, long long* device_heads, lo FlagType head_flags[ItemsPerThread]; FlagType tail_flags[ItemsPerThread]; - if(hipBlockIdx_x % 4 == 0) + if(blockIdx.x % 4 == 0) { const Type tile_successor_item = device_input[block_offset + items_per_block]; bdiscontinuity.flag_heads_and_tails(head_flags, tail_flags, tile_successor_item, input, FlagOpType()); } - else if(hipBlockIdx_x % 4 == 1) + else if(blockIdx.x % 4 == 1) { const Type tile_predecessor_item = device_input[block_offset - 1]; const Type tile_successor_item = device_input[block_offset + items_per_block]; bdiscontinuity.flag_heads_and_tails(head_flags, tile_predecessor_item, tail_flags, tile_successor_item, input, FlagOpType()); } - else if(hipBlockIdx_x % 4 == 2) + else if(blockIdx.x % 4 == 2) { const Type tile_predecessor_item = device_input[block_offset - 1]; bdiscontinuity.flag_heads_and_tails(head_flags, tile_predecessor_item, tail_flags, input, FlagOpType()); } - else if(hipBlockIdx_x % 4 == 3) + else if(blockIdx.x % 4 == 3) { bdiscontinuity.flag_heads_and_tails(head_flags, tail_flags, input, FlagOpType()); } @@ -236,11 +236,11 @@ auto test_block_discontinuity() >::type; using flag_type = FlagType; using flag_op_type = FlagOpType; - constexpr size_t block_size = BlockSize; - constexpr size_t items_per_thread = ItemsPerThread; - constexpr size_t items_per_block = block_size * items_per_thread; + static constexpr size_t block_size = BlockSize; + static constexpr size_t items_per_thread = ItemsPerThread; + static constexpr size_t items_per_block = block_size * items_per_thread; const size_t size = items_per_block * 20; - constexpr size_t grid_size = size / items_per_block; + static constexpr size_t grid_size = size / items_per_block; // Given block size not supported if(block_size > test_utils::get_max_block_size()) @@ -348,11 +348,11 @@ auto test_block_discontinuity() >::type; using flag_type = FlagType; using flag_op_type = FlagOpType; - constexpr size_t block_size = BlockSize; - constexpr size_t items_per_thread = ItemsPerThread; - constexpr size_t items_per_block = block_size * items_per_thread; + static constexpr size_t block_size = BlockSize; + static constexpr size_t items_per_thread = ItemsPerThread; + static constexpr size_t items_per_block = block_size * items_per_thread; const size_t size = items_per_block * 20; - constexpr size_t grid_size = size / 
items_per_block; + static constexpr size_t grid_size = size / items_per_block; // Given block size not supported if(block_size > test_utils::get_max_block_size()) @@ -460,11 +460,11 @@ auto test_block_discontinuity() >::type; using flag_type = FlagType; using flag_op_type = FlagOpType; - constexpr size_t block_size = BlockSize; - constexpr size_t items_per_thread = ItemsPerThread; - constexpr size_t items_per_block = block_size * items_per_thread; + static constexpr size_t block_size = BlockSize; + static constexpr size_t items_per_thread = ItemsPerThread; + static constexpr size_t items_per_block = block_size * items_per_thread; const size_t size = items_per_block * 20; - constexpr size_t grid_size = size / items_per_block; + static constexpr size_t grid_size = size / items_per_block; // Given block size not supported if(block_size > test_utils::get_max_block_size()) diff --git a/test/rocprim/test_block_exchange.cpp b/test/rocprim/test_block_exchange.cpp index b8f1cbec2..867dc59d1 100644 --- a/test/rocprim/test_block_exchange.cpp +++ b/test/rocprim/test_block_exchange.cpp @@ -49,8 +49,8 @@ __launch_bounds__(ROCPRIM_DEFAULT_MAX_BLOCK_SIZE) void blocked_to_striped_kernel(Type* device_input, OutputType* device_output) { constexpr unsigned int block_size = (ItemsPerBlock / ItemsPerThread); - const unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = hipBlockIdx_x * ItemsPerBlock; + const unsigned int lid = threadIdx.x; + const unsigned int block_offset = blockIdx.x * ItemsPerBlock; Type input[ItemsPerThread]; OutputType output[ItemsPerThread]; @@ -73,8 +73,8 @@ __launch_bounds__(ROCPRIM_DEFAULT_MAX_BLOCK_SIZE) void striped_to_blocked_kernel(Type* device_input, OutputType* device_output) { constexpr unsigned int block_size = (ItemsPerBlock / ItemsPerThread); - const unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = hipBlockIdx_x * ItemsPerBlock; + const unsigned int lid = threadIdx.x; + const unsigned int block_offset = blockIdx.x * ItemsPerBlock; Type input[ItemsPerThread]; OutputType output[ItemsPerThread]; @@ -97,8 +97,8 @@ __launch_bounds__(ROCPRIM_DEFAULT_MAX_BLOCK_SIZE) void blocked_to_warp_striped_kernel(Type* device_input, OutputType* device_output) { constexpr unsigned int block_size = (ItemsPerBlock / ItemsPerThread); - const unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = hipBlockIdx_x * ItemsPerBlock; + const unsigned int lid = threadIdx.x; + const unsigned int block_offset = blockIdx.x * ItemsPerBlock; Type input[ItemsPerThread]; OutputType output[ItemsPerThread]; @@ -121,8 +121,8 @@ __launch_bounds__(ROCPRIM_DEFAULT_MAX_BLOCK_SIZE) void warp_striped_to_blocked_kernel(Type* device_input, OutputType* device_output) { constexpr unsigned int block_size = (ItemsPerBlock / ItemsPerThread); - const unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = hipBlockIdx_x * ItemsPerBlock; + const unsigned int lid = threadIdx.x; + const unsigned int block_offset = blockIdx.x * ItemsPerBlock; Type input[ItemsPerThread]; OutputType output[ItemsPerThread]; @@ -145,8 +145,8 @@ __launch_bounds__(ROCPRIM_DEFAULT_MAX_BLOCK_SIZE) void scatter_to_blocked_kernel(Type* device_input, OutputType* device_output, unsigned int* device_ranks) { constexpr unsigned int block_size = (ItemsPerBlock / ItemsPerThread); - const unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = hipBlockIdx_x * ItemsPerBlock; + const unsigned int lid = threadIdx.x; + const unsigned int block_offset = blockIdx.x * ItemsPerBlock; Type 
input[ItemsPerThread]; OutputType output[ItemsPerThread]; @@ -171,8 +171,8 @@ __launch_bounds__(ROCPRIM_DEFAULT_MAX_BLOCK_SIZE) void scatter_to_striped_kernel(Type* device_input, OutputType* device_output, unsigned int* device_ranks) { constexpr unsigned int block_size = (ItemsPerBlock / ItemsPerThread); - const unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = hipBlockIdx_x * ItemsPerBlock; + const unsigned int lid = threadIdx.x; + const unsigned int block_offset = blockIdx.x * ItemsPerBlock; Type input[ItemsPerThread]; OutputType output[ItemsPerThread]; @@ -199,9 +199,9 @@ auto test_block_exchange() { using type = T; using output_type = U; - constexpr size_t block_size = BlockSize; - constexpr size_t items_per_thread = ItemsPerThread; - constexpr size_t items_per_block = block_size * items_per_thread; + static constexpr size_t block_size = BlockSize; + static constexpr size_t items_per_thread = ItemsPerThread; + static constexpr size_t items_per_block = block_size * items_per_thread; // Given block size not supported if(block_size > test_utils::get_max_block_size()) { @@ -212,7 +212,7 @@ auto test_block_exchange() // Generate data std::vector input(size); std::vector expected(size); - std::vector output(size, 0); + std::vector output(size, (output_type)0); // Calculate input and expected results on host std::vector values(size); @@ -227,7 +227,7 @@ auto test_block_exchange() const size_t i0 = offset + ti * items_per_thread + ii; const size_t i1 = offset + ii * block_size + ti; input[i1] = values[i1]; - expected[i0] = values[i1]; + expected[i0] = static_cast(values[i1]); } } } @@ -283,9 +283,9 @@ auto test_block_exchange() { using type = T; using output_type = U; - constexpr size_t block_size = BlockSize; - constexpr size_t items_per_thread = ItemsPerThread; - constexpr size_t items_per_block = block_size * items_per_thread; + static constexpr size_t block_size = BlockSize; + static constexpr size_t items_per_thread = ItemsPerThread; + static constexpr size_t items_per_block = block_size * items_per_thread; // Given block size not supported if(block_size > test_utils::get_max_block_size()) { @@ -367,9 +367,9 @@ auto test_block_exchange() { using type = T; using output_type = U; - constexpr size_t block_size = BlockSize; - constexpr size_t items_per_thread = ItemsPerThread; - constexpr size_t items_per_block = block_size * items_per_thread; + static constexpr size_t block_size = BlockSize; + static constexpr size_t items_per_thread = ItemsPerThread; + static constexpr size_t items_per_block = block_size * items_per_thread; // Given block size not supported if(block_size > test_utils::get_max_block_size()) { @@ -463,9 +463,9 @@ auto test_block_exchange() { using type = T; using output_type = U; - constexpr size_t block_size = BlockSize; - constexpr size_t items_per_thread = ItemsPerThread; - constexpr size_t items_per_block = block_size * items_per_thread; + static constexpr size_t block_size = BlockSize; + static constexpr size_t items_per_thread = ItemsPerThread; + static constexpr size_t items_per_block = block_size * items_per_thread; // Given block size not supported if(block_size > test_utils::get_max_block_size()) { @@ -557,9 +557,9 @@ auto test_block_exchange() { using type = T; using output_type = U; - constexpr size_t block_size = BlockSize; - constexpr size_t items_per_thread = ItemsPerThread; - constexpr size_t items_per_block = block_size * items_per_thread; + static constexpr size_t block_size = BlockSize; + static constexpr size_t items_per_thread = 
ItemsPerThread; + static constexpr size_t items_per_block = block_size * items_per_thread; // Given block size not supported if(block_size > test_utils::get_max_block_size()) { @@ -659,9 +659,9 @@ auto test_block_exchange() { using type = T; using output_type = U; - constexpr size_t block_size = BlockSize; - constexpr size_t items_per_thread = ItemsPerThread; - constexpr size_t items_per_block = block_size * items_per_thread; + static constexpr size_t block_size = BlockSize; + static constexpr size_t items_per_thread = ItemsPerThread; + static constexpr size_t items_per_block = block_size * items_per_thread; // Given block size not supported if(block_size > test_utils::get_max_block_size()) { diff --git a/test/rocprim/test_block_histogram.cpp b/test/rocprim/test_block_histogram.cpp index 1797be605..431a83c2a 100644 --- a/test/rocprim/test_block_histogram.cpp +++ b/test/rocprim/test_block_histogram.cpp @@ -84,8 +84,8 @@ __global__ __launch_bounds__(BlockSize) void histogram_kernel(T* device_output, T* device_output_bin) { - const unsigned int index = ((hipBlockIdx_x * BlockSize) + hipThreadIdx_x) * ItemsPerThread; - unsigned int global_offset = hipBlockIdx_x * BinSize; + const unsigned int index = ((blockIdx.x * BlockSize) + threadIdx.x) * ItemsPerThread; + unsigned int global_offset = blockIdx.x * BinSize; __shared__ BinType hist[BinSize]; // load T in_out[ItemsPerThread]; @@ -97,12 +97,12 @@ void histogram_kernel(T* device_output, T* device_output_bin) rocprim::block_histogram bhist; bhist.histogram(in_out, hist); - #pragma unroll + ROCPRIM_UNROLL for (unsigned int offset = 0; offset < BinSize; offset += BlockSize) { - if(offset + hipThreadIdx_x < BinSize) + if(offset + threadIdx.x < BinSize) { - device_output_bin[global_offset + hipThreadIdx_x] = hist[offset + hipThreadIdx_x]; + device_output_bin[global_offset + threadIdx.x] = hist[offset + threadIdx.x]; global_offset += BlockSize; } } @@ -118,10 +118,10 @@ template< > void test_block_histogram_input_arrays() { - constexpr auto algorithm = Algorithm; - constexpr size_t block_size = BlockSize; - constexpr size_t items_per_thread = ItemsPerThread; - constexpr size_t bin = BlockSize; + static constexpr auto algorithm = Algorithm; + static constexpr size_t block_size = BlockSize; + static constexpr size_t items_per_thread = ItemsPerThread; + static constexpr size_t bin = BlockSize; // Given block size not supported if(block_size > test_utils::get_max_block_size()) diff --git a/test/rocprim/test_block_load_store.cpp b/test/rocprim/test_block_load_store.cpp index 6ae326a54..9b48ffa5f 100644 --- a/test/rocprim/test_block_load_store.cpp +++ b/test/rocprim/test_block_load_store.cpp @@ -248,12 +248,12 @@ __global__ __launch_bounds__(BlockSize) void load_store_kernel(Type* device_input, Type* device_output) { - Type items[ItemsPerThread]; - unsigned int offset = hipBlockIdx_x * BlockSize * ItemsPerThread; + Type _items[ItemsPerThread]; + auto offset = blockIdx.x * BlockSize * ItemsPerThread; rocprim::block_load load; rocprim::block_store store; - load.load(device_input + offset, items); - store.store(device_output + offset, items); + load.load(device_input + offset, _items); + store.store(device_output + offset, _items); } TYPED_TEST(RocprimBlockLoadStoreClassTests, LoadStoreClass) @@ -263,11 +263,11 @@ TYPED_TEST(RocprimBlockLoadStoreClassTests, LoadStoreClass) HIP_CHECK(hipSetDevice(device_id)); using Type = typename TestFixture::params::type; - constexpr size_t block_size = TestFixture::params::block_size; - constexpr rocprim::block_load_method 
load_method = TestFixture::params::load_method; - constexpr rocprim::block_store_method store_method = TestFixture::params::store_method; - const size_t items_per_thread = TestFixture::params::items_per_thread; - constexpr auto items_per_block = block_size * items_per_thread; + static constexpr size_t block_size = TestFixture::params::block_size; + static constexpr rocprim::block_load_method load_method = TestFixture::params::load_method; + static constexpr rocprim::block_store_method store_method = TestFixture::params::store_method; + static constexpr size_t items_per_thread = TestFixture::params::items_per_thread; + static constexpr auto items_per_block = block_size * items_per_thread; const size_t size = items_per_block * 113; const auto grid_size = size / items_per_block; // Given block size not supported @@ -283,10 +283,10 @@ TYPED_TEST(RocprimBlockLoadStoreClassTests, LoadStoreClass) // Generate data std::vector input = test_utils::get_random_data(size, -100, 100, seed_value); - std::vector output(input.size(), 0); + std::vector output(input.size(), (Type)0); // Calculate expected results on host - std::vector expected(input.size(), 0); + std::vector expected(input.size(), (Type)0); for (size_t i = 0; i < 113; i++) { size_t block_offset = i * items_per_block; @@ -351,12 +351,12 @@ __global__ __launch_bounds__(BlockSize) void load_store_valid_kernel(Type* device_input, Type* device_output, size_t valid) { - Type items[ItemsPerThread]; - unsigned int offset = hipBlockIdx_x * BlockSize * ItemsPerThread; + Type _items[ItemsPerThread]; + auto offset = blockIdx.x * BlockSize * ItemsPerThread; rocprim::block_load load; rocprim::block_store store; - load.load(device_input + offset, items, valid); - store.store(device_output + offset, items, valid); + load.load(device_input + offset, _items, (unsigned int)valid); + store.store(device_output + offset, _items, (unsigned int)valid); } TYPED_TEST(RocprimBlockLoadStoreClassTests, LoadStoreClassValid) @@ -366,11 +366,11 @@ TYPED_TEST(RocprimBlockLoadStoreClassTests, LoadStoreClassValid) HIP_CHECK(hipSetDevice(device_id)); using Type = typename TestFixture::params::type; - constexpr size_t block_size = TestFixture::params::block_size; - constexpr rocprim::block_load_method load_method = TestFixture::params::load_method; - constexpr rocprim::block_store_method store_method = TestFixture::params::store_method; - const size_t items_per_thread = TestFixture::params::items_per_thread; - constexpr auto items_per_block = block_size * items_per_thread; + static constexpr size_t block_size = TestFixture::params::block_size; + static constexpr rocprim::block_load_method load_method = TestFixture::params::load_method; + static constexpr rocprim::block_store_method store_method = TestFixture::params::store_method; + static constexpr size_t items_per_thread = TestFixture::params::items_per_thread; + static constexpr auto items_per_block = block_size * items_per_thread; const size_t size = items_per_block * 113; const auto grid_size = size / items_per_block; // Given block size not supported @@ -388,10 +388,10 @@ TYPED_TEST(RocprimBlockLoadStoreClassTests, LoadStoreClassValid) // Generate data std::vector input = test_utils::get_random_data(size, -100, 100, seed_value); - std::vector output(input.size(), 0); + std::vector output(input.size(), (Type)0); // Calculate expected results on host - std::vector expected(input.size(), 0); + std::vector expected(input.size(), (Type)0); for (size_t i = 0; i < 113; i++) { size_t block_offset = i * items_per_block; @@ -462,18 
+462,19 @@ template< rocprim::block_load_method LoadMethod, rocprim::block_store_method StoreMethod, unsigned int BlockSize, - unsigned int ItemsPerThread + unsigned int ItemsPerThread, + class Def > __global__ __launch_bounds__(BlockSize) -void load_store_valid_default_kernel(Type* device_input, Type* device_output, size_t valid, int _default) +void load_store_valid_default_kernel(Type* device_input, Type* device_output, size_t valid, Def _default) { - Type items[ItemsPerThread]; - unsigned int offset = hipBlockIdx_x * BlockSize * ItemsPerThread; + Type _items[ItemsPerThread]; + auto offset = blockIdx.x * BlockSize * ItemsPerThread; rocprim::block_load load; rocprim::block_store store; - load.load(device_input + offset, items, valid, _default); - store.store(device_output + offset, items); + load.load(device_input + offset, _items, (unsigned int)valid, _default); + store.store(device_output + offset, _items); } TYPED_TEST(RocprimBlockLoadStoreClassTests, LoadStoreClassDefault) @@ -483,11 +484,11 @@ TYPED_TEST(RocprimBlockLoadStoreClassTests, LoadStoreClassDefault) HIP_CHECK(hipSetDevice(device_id)); using Type = typename TestFixture::params::type; - constexpr size_t block_size = TestFixture::params::block_size; - constexpr rocprim::block_load_method load_method = TestFixture::params::load_method; - constexpr rocprim::block_store_method store_method = TestFixture::params::store_method; - const size_t items_per_thread = TestFixture::params::items_per_thread; - constexpr auto items_per_block = block_size * items_per_thread; + static constexpr size_t block_size = TestFixture::params::block_size; + static constexpr rocprim::block_load_method load_method = TestFixture::params::load_method; + static constexpr rocprim::block_store_method store_method = TestFixture::params::store_method; + static constexpr size_t items_per_thread = TestFixture::params::items_per_thread; + static constexpr auto items_per_block = block_size * items_per_thread; const size_t size = items_per_block * 113; const auto grid_size = size / items_per_block; // Given block size not supported @@ -497,7 +498,7 @@ TYPED_TEST(RocprimBlockLoadStoreClassTests, LoadStoreClassDefault) } const size_t valid = items_per_thread + 1; - int _default = -1; + Type _default = (Type)-1; for (size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) { @@ -506,7 +507,7 @@ TYPED_TEST(RocprimBlockLoadStoreClassTests, LoadStoreClassDefault) // Generate data std::vector input = test_utils::get_random_data(size, -100, 100, seed_value); - std::vector output(input.size(), 0); + std::vector output(input.size(), (Type)0); // Calculate expected results on host std::vector expected(input.size(), _default); diff --git a/test/rocprim/test_block_radix_sort.cpp b/test/rocprim/test_block_radix_sort.cpp index a79f3a52c..507d898f3 100644 --- a/test/rocprim/test_block_radix_sort.cpp +++ b/test/rocprim/test_block_radix_sort.cpp @@ -30,6 +30,7 @@ // required test headers #include "test_utils_types.hpp" +#include "test_sort_comparator.hpp" template class RocprimBlockRadixSort : public ::testing::Test { @@ -56,48 +57,6 @@ static constexpr unsigned int end_radix[n_sizes] = { TYPED_TEST_SUITE(RocprimBlockRadixSort, BlockParams); -template -struct key_comparator -{ - static_assert(rocprim::is_unsigned::value, "Test supports start and end bits only for unsigned integers"); - - bool operator()(const Key& lhs, const Key& rhs) - { - auto mask = (1ull << (EndBit - StartBit)) - 1; - auto l = (static_cast(lhs) >> StartBit) & mask; - auto r = 
(static_cast(rhs) >> StartBit) & mask; - return Descending ? (r < l) : (l < r); - } -}; - -template -struct key_comparator -{ - bool operator()(const Key& lhs, const Key& rhs) - { - return Descending ? (rhs < lhs) : (lhs < rhs); - } -}; - -template -struct key_comparator -{ - bool operator()(const rocprim::half& lhs, const rocprim::half& rhs) - { - // HIP's half doesn't have __host__ comparison operators, use floats instead - return key_comparator()(lhs, rhs); - } -}; - -template -struct key_value_comparator -{ - bool operator()(const std::pair& lhs, const std::pair& rhs) - { - return key_comparator()(lhs.first, rhs.first); - } -}; - template< unsigned int BlockSize, unsigned int ItemsPerThread, @@ -113,10 +72,15 @@ void sort_key_kernel( unsigned int end_bit) { constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; - const unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = hipBlockIdx_x * items_per_block; + const unsigned int lid = threadIdx.x; + const unsigned int block_offset = blockIdx.x * items_per_block; key_type keys[ItemsPerThread]; +#ifdef __HIP_CPU_RT__ + // TODO: check if it's really neccessary + // Initialize contents, as non-hipcc compilers don't unconditionally zero out allocated memory + std::memset(keys, 0, ItemsPerThread * sizeof(key_type)); +#endif rocprim::block_load_direct_blocked(lid, device_keys_output + block_offset, keys); rocprim::block_radix_sort bsort; @@ -158,8 +122,8 @@ void sort_key_value_kernel( unsigned int end_bit) { constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; - const unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = hipBlockIdx_x * items_per_block; + const unsigned int lid = threadIdx.x; + const unsigned int block_offset = blockIdx.x * items_per_block; key_type keys[ItemsPerThread]; value_type values[ItemsPerThread]; @@ -205,13 +169,13 @@ auto test_block_radix_sort() -> typename std::enable_if::type { using key_type = Key; - constexpr size_t block_size = BlockSize; - constexpr size_t items_per_thread = ItemsPerThread; - constexpr bool descending = Descending; - constexpr bool to_striped = ToStriped; - constexpr unsigned int start_bit = (rocprim::is_unsigned::value == false) ? 0 : StartBit; - constexpr unsigned int end_bit = (rocprim::is_unsigned::value == false) ? sizeof(Key) * 8 : EndBit; - constexpr size_t items_per_block = block_size * items_per_thread; + static constexpr size_t block_size = BlockSize; + static constexpr size_t items_per_thread = ItemsPerThread; + static constexpr bool descending = Descending; + static constexpr bool to_striped = ToStriped; + static constexpr unsigned int start_bit = (rocprim::is_unsigned::value == false) ? 0 : StartBit; + static constexpr unsigned int end_bit = (rocprim::is_unsigned::value == false) ? 
sizeof(Key) * 8 : EndBit; + static constexpr size_t items_per_block = block_size * items_per_thread; // Given block size not supported if(block_size > test_utils::get_max_block_size()) @@ -231,7 +195,7 @@ auto test_block_radix_sort() std::vector keys_output; if(rocprim::is_floating_point::value) { - keys_output = test_utils::get_random_data(size, (key_type)-1000, (key_type)+1000, seed_value); + keys_output = test_utils::get_random_data(size, -100, +100, seed_value); } else { @@ -306,13 +270,13 @@ auto test_block_radix_sort() { using key_type = Key; using value_type = Value; - constexpr size_t block_size = BlockSize; - constexpr size_t items_per_thread = ItemsPerThread; - constexpr bool descending = Descending; - constexpr bool to_striped = ToStriped; - constexpr unsigned int start_bit = (rocprim::is_unsigned::value == false) ? 0 : StartBit; - constexpr unsigned int end_bit = (rocprim::is_unsigned::value == false) ? sizeof(Key) * 8 : EndBit; - constexpr size_t items_per_block = block_size * items_per_thread; + static constexpr size_t block_size = BlockSize; + static constexpr size_t items_per_thread = ItemsPerThread; + static constexpr bool descending = Descending; + static constexpr bool to_striped = ToStriped; + static constexpr unsigned int start_bit = (rocprim::is_unsigned::value == false) ? 0 : StartBit; + static constexpr unsigned int end_bit = (rocprim::is_unsigned::value == false) ? sizeof(Key) * 8 : EndBit; + static constexpr size_t items_per_block = block_size * items_per_thread; // Given block size not supported if(block_size > test_utils::get_max_block_size()) @@ -325,14 +289,14 @@ auto test_block_radix_sort() for (size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) { - unsigned int seed_value = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + seed_type seed_value = seed_index < random_seeds_count ? 
rand() : seeds[seed_index - random_seeds_count]; SCOPED_TRACE(testing::Message() << "with seed= " << seed_value); // Generate data std::vector keys_output; if(rocprim::is_floating_point::value) { - keys_output = test_utils::get_random_data(size, (key_type)-1000, (key_type)+1000, seed_value); + keys_output = test_utils::get_random_data(size, -100, +100, seed_value); } else { @@ -340,7 +304,7 @@ auto test_block_radix_sort() size, std::numeric_limits::min(), std::numeric_limits::max(), - seed_index + seed_value ); } diff --git a/test/rocprim/test_block_reduce.cpp b/test/rocprim/test_block_reduce.cpp index 5ef032d3a..911e68b27 100644 --- a/test/rocprim/test_block_reduce.cpp +++ b/test/rocprim/test_block_reduce.cpp @@ -53,13 +53,13 @@ __global__ __launch_bounds__(BlockSize) void reduce_kernel(T* device_output, T* device_output_reductions) { - const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; + const unsigned int index = (blockIdx.x * BlockSize) + threadIdx.x; T value = device_output[index]; rocprim::block_reduce breduce; breduce.reduce(value, value, BinaryOp()); - if(hipThreadIdx_x == 0) + if(threadIdx.x == 0) { - device_output_reductions[hipBlockIdx_x] = value; + device_output_reductions[blockIdx.x] = value; } } @@ -140,15 +140,15 @@ TYPED_TEST(RocprimBlockReduceSingleValueTests, Reduce) SCOPED_TRACE(testing::Message() << "with seed= " << seed_value); // Generate data - std::vector output = test_utils::get_random_data(size, 2, 50, seed_value); + std::vector output = test_utils::get_random_data(size, (T)2, (T)50, seed_value); std::vector output_reductions(size / block_size); // Calculate expected results on host - std::vector expected_reductions(output_reductions.size(), 0); + std::vector expected_reductions(output_reductions.size(), (T)0); binary_op_type binary_op; for(size_t i = 0; i < output.size() / block_size; i++) { - T value = 0; + T value = (T)0; for(size_t j = 0; j < block_size; j++) { auto idx = i * block_size + j; @@ -207,7 +207,7 @@ TYPED_TEST(RocprimBlockReduceSingleValueTests, ReduceMultiplies) SCOPED_TRACE(testing::Message() << "with seed= " << seed_value); // Generate data - std::vector output(size, 1); + std::vector output(size, (T)1); auto two_places = test_utils::get_random_data(size/32, 0, size-1, seed_value); for(auto i : two_places) { @@ -216,11 +216,11 @@ TYPED_TEST(RocprimBlockReduceSingleValueTests, ReduceMultiplies) std::vector output_reductions(size / block_size); // Calculate expected results on host - std::vector expected_reductions(output_reductions.size(), 0); + std::vector expected_reductions(output_reductions.size(), (T)0); binary_op_type binary_op; for(size_t i = 0; i < output.size() / block_size; i++) { - T value = 1; + T value = (T)1; for(size_t j = 0; j < block_size; j++) { auto idx = i * block_size + j; @@ -265,13 +265,13 @@ __global__ __launch_bounds__(BlockSize) void reduce_valid_kernel(T* device_output, T* device_output_reductions, const unsigned int valid_items) { - const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; + const unsigned int index = (blockIdx.x * BlockSize) + threadIdx.x; T value = device_output[index]; rocprim::block_reduce breduce; breduce.reduce(value, value, valid_items, BinaryOp()); - if(hipThreadIdx_x == 0) + if(threadIdx.x == 0) { - device_output_reductions[hipBlockIdx_x] = value; + device_output_reductions[blockIdx.x] = value; } } @@ -335,7 +335,7 @@ TYPED_TEST(RocprimBlockReduceSingleValueTests, ReduceValid) unsigned int seed_value = seed_index < random_seeds_count ? 
rand() : seeds[seed_index - random_seeds_count]; SCOPED_TRACE(testing::Message() << "with seed= " << seed_value); - const unsigned int valid_items = test_utils::get_random_value(block_size - 10, block_size, seed_value); + const size_t valid_items = test_utils::get_random_value(block_size - 10, block_size, seed_value); // Given block size not supported if(block_size > test_utils::get_max_block_size()) @@ -350,11 +350,11 @@ TYPED_TEST(RocprimBlockReduceSingleValueTests, ReduceValid) std::vector output_reductions(size / block_size); // Calculate expected results on host - std::vector expected_reductions(output_reductions.size(), 0); + std::vector expected_reductions(output_reductions.size(), (T)0); binary_op_type binary_op; for(size_t i = 0; i < output.size() / block_size; i++) { - T value = 0; + T value = static_cast(0); for(size_t j = 0; j < valid_items; j++) { auto idx = i * block_size + j; @@ -410,7 +410,7 @@ __global__ __launch_bounds__(BlockSize) void reduce_array_kernel(T* device_output, T* device_output_reductions) { - const unsigned int index = ((hipBlockIdx_x * BlockSize) + hipThreadIdx_x) * ItemsPerThread; + const unsigned int index = ((blockIdx.x * BlockSize) + threadIdx.x) * ItemsPerThread; // load T in_out[ItemsPerThread]; for(unsigned int j = 0; j < ItemsPerThread; j++) @@ -422,9 +422,9 @@ void reduce_array_kernel(T* device_output, T* device_output_reductions) T reduction; breduce.reduce(in_out, reduction, BinaryOp()); - if(hipThreadIdx_x == 0) + if(threadIdx.x == 0) { - device_output_reductions[hipBlockIdx_x] = reduction; + device_output_reductions[blockIdx.x] = reduction; } } @@ -438,9 +438,9 @@ template< void test_block_reduce_input_arrays() { using binary_op_type = typename std::conditional::value, test_utils::half_maximum, rocprim::maximum>::type; - constexpr auto algorithm = Algorithm; - constexpr size_t block_size = BlockSize; - constexpr size_t items_per_thread = ItemsPerThread; + static constexpr auto algorithm = Algorithm; + static constexpr size_t block_size = BlockSize; + static constexpr size_t items_per_thread = ItemsPerThread; // Given block size not supported if(block_size > test_utils::get_max_block_size()) @@ -461,14 +461,14 @@ void test_block_reduce_input_arrays() std::vector output = test_utils::get_random_data(size, 0, 100, seed_value); // Output reduce results - std::vector output_reductions(size / block_size, 0); + std::vector output_reductions(size / block_size, (T)0); // Calculate expected results on host - std::vector expected_reductions(output_reductions.size(), 0); + std::vector expected_reductions(output_reductions.size(), (T)0); binary_op_type binary_op; for(size_t i = 0; i < output.size() / items_per_block; i++) { - T value = 0; + T value = (T)0; for(size_t j = 0; j < items_per_block; j++) { auto idx = i * items_per_block + j; diff --git a/test/rocprim/test_block_scan.cpp b/test/rocprim/test_block_scan.cpp index 6c3510ad1..4becee3c7 100644 --- a/test/rocprim/test_block_scan.cpp +++ b/test/rocprim/test_block_scan.cpp @@ -58,7 +58,7 @@ void scan_kernel(T* device_output, T* device_output_b, T init) { (void)init; (void)device_output_b; - const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; + const unsigned int index = (blockIdx.x * BlockSize) + threadIdx.x; T value = device_output[index]; rocprim::block_scan bscan; bscan.inclusive_scan(value, value); @@ -77,15 +77,15 @@ __launch_bounds__(BlockSize) void scan_kernel(T* device_output, T* device_output_b, T init) { (void)init; - const unsigned int index = (hipBlockIdx_x * BlockSize) + 
hipThreadIdx_x; + const unsigned int index = (blockIdx.x * BlockSize) + threadIdx.x; T value = device_output[index]; T reduction; rocprim::block_scan bscan; bscan.inclusive_scan(value, value, reduction); device_output[index] = value; - if(hipThreadIdx_x == 0) + if(threadIdx.x == 0) { - device_output_b[hipBlockIdx_x] = reduction; + device_output_b[blockIdx.x] = reduction; } } @@ -100,7 +100,7 @@ __global__ __launch_bounds__(BlockSize) void scan_kernel(T* device_output, T* device_output_b, T init) { - const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; + const unsigned int index = (blockIdx.x * BlockSize) + threadIdx.x; T prefix_value = init; auto prefix_callback = [&prefix_value](T reduction) { @@ -116,9 +116,9 @@ void scan_kernel(T* device_output, T* device_output_b, T init) bscan_t().inclusive_scan(value, value, storage, prefix_callback, rocprim::plus()); device_output[index] = value; - if(hipThreadIdx_x == 0) + if(threadIdx.x == 0) { - device_output_b[hipBlockIdx_x] = prefix_value; + device_output_b[blockIdx.x] = prefix_value; } } @@ -134,7 +134,7 @@ __launch_bounds__(BlockSize) void scan_kernel(T* device_output, T* device_output_b, T init) { (void)device_output_b; - const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; + const unsigned int index = (blockIdx.x * BlockSize) + threadIdx.x; T value = device_output[index]; rocprim::block_scan bscan; bscan.exclusive_scan(value, value, init); @@ -152,15 +152,15 @@ __global__ __launch_bounds__(BlockSize) void scan_kernel(T* device_output, T* device_output_b, T init) { - const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; + const unsigned int index = (blockIdx.x * BlockSize) + threadIdx.x; T value = device_output[index]; T reduction; rocprim::block_scan bscan; bscan.exclusive_scan(value, value, init, reduction); device_output[index] = value; - if(hipThreadIdx_x == 0) + if(threadIdx.x == 0) { - device_output_b[hipBlockIdx_x] = reduction; + device_output_b[blockIdx.x] = reduction; } } @@ -175,7 +175,7 @@ __global__ __launch_bounds__(BlockSize) void scan_kernel(T* device_output, T* device_output_b, T init) { - const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; + const unsigned int index = (blockIdx.x * BlockSize) + threadIdx.x; T prefix_value = init; auto prefix_callback = [&prefix_value](T reduction) { @@ -191,9 +191,9 @@ void scan_kernel(T* device_output, T* device_output_b, T init) bscan_t().exclusive_scan(value, value, storage, prefix_callback, rocprim::plus()); device_output[index] = value; - if(hipThreadIdx_x == 0) + if(threadIdx.x == 0) { - device_output_b[hipBlockIdx_x] = prefix_value; + device_output_b[blockIdx.x] = prefix_value; } } @@ -287,7 +287,7 @@ TYPED_TEST(RocprimBlockScanSingleValueTests, InclusiveScan) std::vector output2 = output; // Calculate expected results on host - std::vector expected(output.size(), 0); + std::vector expected(output.size(), (T)0); binary_op_type binary_op; for(size_t i = 0; i < output.size() / block_size; i++) { @@ -347,8 +347,8 @@ TYPED_TEST(RocprimBlockScanSingleValueTests, InclusiveScanReduce) std::vector output_reductions(size / block_size); // Calculate expected results on host - std::vector expected(output.size(), 0); - std::vector expected_reductions(output_reductions.size(), 0); + std::vector expected(output.size(), (T)0); + std::vector expected_reductions(output_reductions.size(), (T)0); binary_op_type binary_op; for(size_t i = 0; i < output.size() / block_size; i++) { @@ -418,8 +418,8 @@ 
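// Illustrative sketch, not from the patch itself: the hunks around here replace HIP's
// hipBlockIdx_x / hipThreadIdx_x macros with the standard blockIdx.x / threadIdx.x
// built-ins, presumably so the same kernels also compile under the header-only HIP-CPU
// host runtime. A minimal kernel using the migrated indexing pattern:
#include <hip/hip_runtime.h>

template<unsigned int BlockSize, class T>
__global__ __launch_bounds__(BlockSize)
void copy_kernel(const T* input, T* output)
{
    // Flat thread index written with the built-ins the patch standardises on.
    const unsigned int index = (blockIdx.x * BlockSize) + threadIdx.x;
    output[index] = input[index];
}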
TYPED_TEST(RocprimBlockScanSingleValueTests, InclusiveScanPrefixCallback) T block_prefix = test_utils::get_random_value(0, 5, seed_value); // Calculate expected results on host - std::vector expected(output.size(), 0); - std::vector expected_block_prefixes(output_block_prefixes.size(), 0); + std::vector expected(output.size(), (T)0); + std::vector expected_block_prefixes(output_block_prefixes.size(), (T)0); binary_op_type binary_op; for(size_t i = 0; i < output.size() / block_size; i++) { @@ -489,7 +489,7 @@ TYPED_TEST(RocprimBlockScanSingleValueTests, ExclusiveScan) const T init = test_utils::get_random_value(0, 5, seed_value); // Calculate expected results on host - std::vector expected(output.size(), 0); + std::vector expected(output.size(), (T)0); binary_op_type binary_op; for(size_t i = 0; i < output.size() / block_size; i++) { @@ -552,8 +552,8 @@ TYPED_TEST(RocprimBlockScanSingleValueTests, ExclusiveScanReduce) std::vector output_reductions(size / block_size); // Calculate expected results on host - std::vector expected(output.size(), 0); - std::vector expected_reductions(output_reductions.size(), 0); + std::vector expected(output.size(), (T)0); + std::vector expected_reductions(output_reductions.size(), (T)0); binary_op_type binary_op; for(size_t i = 0; i < output.size() / block_size; i++) { @@ -630,8 +630,8 @@ TYPED_TEST(RocprimBlockScanSingleValueTests, ExclusiveScanPrefixCallback) T block_prefix = test_utils::get_random_value(0, 5, seed_value); // Calculate expected results on host - std::vector expected(output.size(), 0); - std::vector expected_block_prefixes(output_block_prefixes.size(), 0); + std::vector expected(output.size(), (T)0); + std::vector expected_block_prefixes(output_block_prefixes.size(), (T)0); binary_op_type binary_op; for(size_t i = 0; i < output.size() / block_size; i++) { @@ -702,7 +702,7 @@ __global__ __launch_bounds__(BlockSize) void inclusive_scan_array_kernel(T* device_output) { - const unsigned int index = ((hipBlockIdx_x * BlockSize ) + hipThreadIdx_x) * ItemsPerThread; + const unsigned int index = ((blockIdx.x * BlockSize ) + threadIdx.x) * ItemsPerThread; // load T in_out[ItemsPerThread]; @@ -732,7 +732,7 @@ __global__ __launch_bounds__(BlockSize) void inclusive_scan_reduce_array_kernel(T* device_output, T* device_output_reductions) { - const unsigned int index = ((hipBlockIdx_x * BlockSize ) + hipThreadIdx_x) * ItemsPerThread; + const unsigned int index = ((blockIdx.x * BlockSize ) + threadIdx.x) * ItemsPerThread; // load T in_out[ItemsPerThread]; @@ -751,9 +751,9 @@ void inclusive_scan_reduce_array_kernel(T* device_output, T* device_output_reduc device_output[index + j] = in_out[j]; } - if(hipThreadIdx_x == 0) + if(threadIdx.x == 0) { - device_output_reductions[hipBlockIdx_x] = reduction; + device_output_reductions[blockIdx.x] = reduction; } } @@ -768,7 +768,7 @@ __global__ __launch_bounds__(BlockSize) void inclusive_scan_array_prefix_callback_kernel(T* device_output, T* device_output_bp, T block_prefix) { - const unsigned int index = ((hipBlockIdx_x * BlockSize) + hipThreadIdx_x) * ItemsPerThread; + const unsigned int index = ((blockIdx.x * BlockSize) + threadIdx.x) * ItemsPerThread; T prefix_value = block_prefix; auto prefix_callback = [&prefix_value](T reduction) { @@ -794,9 +794,9 @@ void inclusive_scan_array_prefix_callback_kernel(T* device_output, T* device_out device_output[index + j] = in_out[j]; } - if(hipThreadIdx_x == 0) + if(threadIdx.x == 0) { - device_output_bp[hipBlockIdx_x] = prefix_value; + device_output_bp[blockIdx.x] = 
prefix_value; } } @@ -811,7 +811,7 @@ __global__ __launch_bounds__(BlockSize) void exclusive_scan_array_kernel(T* device_output, T init) { - const unsigned int index = ((hipBlockIdx_x * BlockSize) + hipThreadIdx_x) * ItemsPerThread; + const unsigned int index = ((blockIdx.x * BlockSize) + threadIdx.x) * ItemsPerThread; // load T in_out[ItemsPerThread]; for(unsigned int j = 0; j < ItemsPerThread; j++) @@ -840,7 +840,7 @@ __global__ __launch_bounds__(BlockSize) void exclusive_scan_reduce_array_kernel(T* device_output, T* device_output_reductions, T init) { - const unsigned int index = ((hipBlockIdx_x * BlockSize) + hipThreadIdx_x) * ItemsPerThread; + const unsigned int index = ((blockIdx.x * BlockSize) + threadIdx.x) * ItemsPerThread; // load T in_out[ItemsPerThread]; for(unsigned int j = 0; j < ItemsPerThread; j++) @@ -858,9 +858,9 @@ void exclusive_scan_reduce_array_kernel(T* device_output, T* device_output_reduc device_output[index + j] = in_out[j]; } - if(hipThreadIdx_x == 0) + if(threadIdx.x == 0) { - device_output_reductions[hipBlockIdx_x] = reduction; + device_output_reductions[blockIdx.x] = reduction; } } @@ -879,7 +879,7 @@ void exclusive_scan_prefix_callback_array_kernel( T block_prefix ) { - const unsigned int index = ((hipBlockIdx_x * BlockSize) + hipThreadIdx_x) * ItemsPerThread; + const unsigned int index = ((blockIdx.x * BlockSize) + threadIdx.x) * ItemsPerThread; T prefix_value = block_prefix; auto prefix_callback = [&prefix_value](T reduction) { @@ -905,9 +905,9 @@ void exclusive_scan_prefix_callback_array_kernel( device_output[index + j] = in_out[j]; } - if(hipThreadIdx_x == 0) + if(threadIdx.x == 0) { - device_output_bp[hipBlockIdx_x] = prefix_value; + device_output_bp[blockIdx.x] = prefix_value; } } @@ -923,9 +923,9 @@ auto test_block_scan_input_arrays() -> typename std::enable_if::type { using binary_op_type = typename std::conditional::value, test_utils::half_maximum, rocprim::maximum>::type; - constexpr auto algorithm = Algorithm; - constexpr size_t block_size = BlockSize; - constexpr size_t items_per_thread = ItemsPerThread; + static constexpr auto algorithm = Algorithm; + static constexpr size_t block_size = BlockSize; + static constexpr size_t items_per_thread = ItemsPerThread; // Given block size not supported if(block_size > test_utils::get_max_block_size()) @@ -946,7 +946,7 @@ auto test_block_scan_input_arrays() std::vector output = test_utils::get_random_data(size, 2, 100, seed_value); // Calculate expected results on host - std::vector expected(output.size(), 0); + std::vector expected(output.size(), (T)0); binary_op_type binary_op; for(size_t i = 0; i < output.size() / items_per_block; i++) { @@ -1006,9 +1006,9 @@ auto test_block_scan_input_arrays() -> typename std::enable_if::type { using binary_op_type = typename std::conditional::value, test_utils::half_maximum, rocprim::maximum>::type; - constexpr auto algorithm = Algorithm; - constexpr size_t block_size = BlockSize; - constexpr size_t items_per_thread = ItemsPerThread; + static constexpr auto algorithm = Algorithm; + static constexpr size_t block_size = BlockSize; + static constexpr size_t items_per_thread = ItemsPerThread; // Given block size not supported if(block_size > test_utils::get_max_block_size()) @@ -1029,11 +1029,11 @@ auto test_block_scan_input_arrays() std::vector output = test_utils::get_random_data(size, 2, 100, seed_value); // Output reduce results - std::vector output_reductions(size / block_size, 0); + std::vector output_reductions(size / block_size, (T)0); // Calculate expected results 
on host - std::vector expected(output.size(), 0); - std::vector expected_reductions(output_reductions.size(), 0); + std::vector expected(output.size(), (T)0); + std::vector expected_reductions(output_reductions.size(), (T)0); binary_op_type binary_op; for(size_t i = 0; i < output.size() / items_per_block; i++) { @@ -1120,9 +1120,9 @@ auto test_block_scan_input_arrays() -> typename std::enable_if::type { using binary_op_type = typename std::conditional::value, test_utils::half_maximum, rocprim::maximum>::type; - constexpr auto algorithm = Algorithm; - constexpr size_t block_size = BlockSize; - constexpr size_t items_per_thread = ItemsPerThread; + static constexpr auto algorithm = Algorithm; + static constexpr size_t block_size = BlockSize; + static constexpr size_t items_per_thread = ItemsPerThread; // Given block size not supported if(block_size > test_utils::get_max_block_size()) @@ -1141,12 +1141,12 @@ auto test_block_scan_input_arrays() // Generate data std::vector output = test_utils::get_random_data(size, 2, 100, seed_value); - std::vector output_block_prefixes(size / items_per_block, 0); + std::vector output_block_prefixes(size / items_per_block, (T)0); T block_prefix = test_utils::get_random_value(0, 100, seed_value); // Calculate expected results on host - std::vector expected(output.size(), 0); - std::vector expected_block_prefixes(output_block_prefixes.size(), 0); + std::vector expected(output.size(), (T)0); + std::vector expected_block_prefixes(output_block_prefixes.size(), (T)0); binary_op_type binary_op; for(size_t i = 0; i < output.size() / items_per_block; i++) { @@ -1236,9 +1236,9 @@ auto test_block_scan_input_arrays() -> typename std::enable_if::type { using binary_op_type = typename std::conditional::value, test_utils::half_maximum, rocprim::maximum>::type; - constexpr auto algorithm = Algorithm; - constexpr size_t block_size = BlockSize; - constexpr size_t items_per_thread = ItemsPerThread; + static constexpr auto algorithm = Algorithm; + static constexpr size_t block_size = BlockSize; + static constexpr size_t items_per_thread = ItemsPerThread; // Given block size not supported if(block_size > test_utils::get_max_block_size()) @@ -1260,7 +1260,7 @@ auto test_block_scan_input_arrays() const T init = test_utils::get_random_value(0, 100, seed_value); // Calculate expected results on host - std::vector expected(output.size(), 0); + std::vector expected(output.size(), (T)0); binary_op_type binary_op; for(size_t i = 0; i < output.size() / items_per_block; i++) { @@ -1322,9 +1322,9 @@ auto test_block_scan_input_arrays() -> typename std::enable_if::type { using binary_op_type = typename std::conditional::value, test_utils::half_maximum, rocprim::maximum>::type; - constexpr auto algorithm = Algorithm; - constexpr size_t block_size = BlockSize; - constexpr size_t items_per_thread = ItemsPerThread; + static constexpr auto algorithm = Algorithm; + static constexpr size_t block_size = BlockSize; + static constexpr size_t items_per_thread = ItemsPerThread; // Given block size not supported if(block_size > test_utils::get_max_block_size()) @@ -1349,8 +1349,8 @@ auto test_block_scan_input_arrays() const T init = test_utils::get_random_value(0, 100, seed_value); // Calculate expected results on host - std::vector expected(output.size(), 0); - std::vector expected_reductions(output_reductions.size(), 0); + std::vector expected(output.size(), (T)0); + std::vector expected_reductions(output_reductions.size(), (T)0); binary_op_type binary_op; for(size_t i = 0; i < output.size() / 
items_per_block; i++) { @@ -1433,9 +1433,9 @@ auto test_block_scan_input_arrays() -> typename std::enable_if::type { using binary_op_type = typename std::conditional::value, test_utils::half_maximum, rocprim::maximum>::type; - constexpr auto algorithm = Algorithm; - constexpr size_t block_size = BlockSize; - constexpr size_t items_per_thread = ItemsPerThread; + static constexpr auto algorithm = Algorithm; + static constexpr size_t block_size = BlockSize; + static constexpr size_t items_per_thread = ItemsPerThread; // Given block size not supported if(block_size > test_utils::get_max_block_size()) @@ -1458,8 +1458,8 @@ auto test_block_scan_input_arrays() T block_prefix = test_utils::get_random_value(0, 100, seed_value); // Calculate expected results on host - std::vector expected(output.size(), 0); - std::vector expected_block_prefixes(output_block_prefixes.size(), 0); + std::vector expected(output.size(), (T)0); + std::vector expected_block_prefixes(output_block_prefixes.size(), (T)0); binary_op_type binary_op; for(size_t i = 0; i < output.size() / items_per_block; i++) { diff --git a/test/rocprim/test_block_shuffle.cpp b/test/rocprim/test_block_shuffle.cpp index 55edd615b..bd8c2ddf6 100644 --- a/test/rocprim/test_block_shuffle.cpp +++ b/test/rocprim/test_block_shuffle.cpp @@ -48,7 +48,7 @@ __global__ __launch_bounds__(BlockSize, ROCPRIM_DEFAULT_MIN_WARPS_PER_EU) void shuffle_offset_kernel(T* device_input, T* device_output, int distance) { - const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; + const unsigned int index = (blockIdx.x * BlockSize) + threadIdx.x; rocprim::block_shuffle b_shuffle; b_shuffle.offset(device_input[index],device_output[index],distance); } @@ -60,13 +60,13 @@ TYPED_TEST(RocprimBlockShuffleTests, BlockOffset) HIP_CHECK(hipSetDevice(device_id)); using type = typename TestFixture::type; - const size_t block_size = TestFixture::block_size; - const size_t size = block_size * 11; - const size_t grid_size = size / block_size; + static constexpr size_t block_size = TestFixture::block_size; + static constexpr size_t size = block_size * 11; + static constexpr size_t grid_size = size / block_size; for (size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) { unsigned int seed_value = seed_index < random_seeds_count ? 
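// Illustrative sketch, not from the patch itself: the hunks above change host-side fill
// values from a bare 0 to (T)0 / static_cast<T>(0), presumably because types such as
// rocprim::half do not convert implicitly from integer literals on the host, and because
// the explicit cast avoids implicit-conversion warnings for the arithmetic types.
#include <cstddef>
#include <vector>

template<class T>
std::vector<T> make_zero_filled(std::size_t n)
{
    // The cast is a no-op for built-in arithmetic types but needed for wrapper types
    // that only convert explicitly on the host.
    return std::vector<T>(n, static_cast<T>(0));
}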
rand() : seeds[seed_index - random_seeds_count]; - int distance = (rand()%min(10,block_size/2))-min(10,block_size/2); + int distance = (rand()%std::min(10,block_size/2))-std::min(10,block_size/2); SCOPED_TRACE(testing::Message() << "with seed= " << seed_value <<" & distance = "< input_data = test_utils::get_random_data(size, -100, 100, seed_value); @@ -76,8 +76,8 @@ TYPED_TEST(RocprimBlockShuffleTests, BlockOffset) type * device_input; type * device_output; - HIP_CHECK(hipMalloc(&device_input, input_data.size() * sizeof(type))); - HIP_CHECK(hipMalloc(&device_output, input_data.size() * sizeof(type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&device_input), input_data.size() * sizeof(type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&device_output), input_data.size() * sizeof(type))); HIP_CHECK( hipMemcpy( @@ -132,7 +132,7 @@ __global__ __launch_bounds__(BlockSize, ROCPRIM_DEFAULT_MIN_WARPS_PER_EU) void shuffle_rotate_kernel(T* device_input, T* device_output, int distance) { - const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; + const unsigned int index = (blockIdx.x * BlockSize) + threadIdx.x; rocprim::block_shuffle b_shuffle; b_shuffle.rotate(device_input[index],device_output[index],distance); } @@ -144,13 +144,13 @@ TYPED_TEST(RocprimBlockShuffleTests, BlockRotate) HIP_CHECK(hipSetDevice(device_id)); using type = typename TestFixture::type; - const size_t block_size = TestFixture::block_size; - const size_t size = block_size * 11; - const size_t grid_size = size / block_size; + static constexpr size_t block_size = TestFixture::block_size; + static constexpr size_t size = block_size * 11; + static constexpr size_t grid_size = size / block_size; for (size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) { unsigned int seed_value = seed_index < random_seeds_count ? 
rand() : seeds[seed_index - random_seeds_count]; - int distance = (rand()%min(5,block_size/2)); + int distance = (rand()%std::min(5,block_size/2)); SCOPED_TRACE(testing::Message() << "with seed= " << seed_value <<" & distance = "< input_data = test_utils::get_random_data(size, -100, 100, seed_value); @@ -160,8 +160,8 @@ TYPED_TEST(RocprimBlockShuffleTests, BlockRotate) type * device_input; type * device_output; - HIP_CHECK(hipMalloc(&device_input, input_data.size() * sizeof(type))); - HIP_CHECK(hipMalloc(&device_output, input_data.size() * sizeof(type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&device_input), input_data.size() * sizeof(type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&device_output), input_data.size() * sizeof(type))); HIP_CHECK( hipMemcpy( @@ -216,7 +216,7 @@ __global__ __launch_bounds__(BlockSize, ROCPRIM_DEFAULT_MIN_WARPS_PER_EU) void shuffle_up_kernel(T (*device_input), T (*device_output)) { - const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; + const unsigned int index = (blockIdx.x * BlockSize) + threadIdx.x; rocprim::block_shuffle b_shuffle; b_shuffle.template up(reinterpret_cast(device_input[index*ItemsPerThread]),reinterpret_cast(device_output[index*ItemsPerThread])); } @@ -228,10 +228,10 @@ TYPED_TEST(RocprimBlockShuffleTests, BlockUp) HIP_CHECK(hipSetDevice(device_id)); using type = typename TestFixture::type; - const size_t block_size = TestFixture::block_size; - const size_t size = block_size * 11; - const size_t grid_size = size / block_size; - constexpr unsigned int ItemsPerThread = 128; + static constexpr size_t block_size = TestFixture::block_size; + static constexpr size_t size = block_size * 11; + static constexpr size_t grid_size = size / block_size; + static constexpr unsigned int ItemsPerThread = 128; for (size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) { unsigned int seed_value = seed_index < random_seeds_count ? 
rand() : seeds[seed_index - random_seeds_count]; @@ -248,8 +248,8 @@ TYPED_TEST(RocprimBlockShuffleTests, BlockUp) type * device_output; - HIP_CHECK(hipMalloc(&device_input, input_data.size() * sizeof(type))); - HIP_CHECK(hipMalloc(&device_output, input_data.size() * sizeof(type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&device_input), input_data.size() * sizeof(type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&device_output), input_data.size() * sizeof(type))); @@ -310,7 +310,7 @@ __global__ __launch_bounds__(BlockSize, ROCPRIM_DEFAULT_MIN_WARPS_PER_EU) void shuffle_down_kernel(T (*device_input), T (*device_output)) { - const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; + const unsigned int index = (blockIdx.x * BlockSize) + threadIdx.x; rocprim::block_shuffle b_shuffle; b_shuffle.template down(reinterpret_cast(device_input[index*ItemsPerThread]),reinterpret_cast(device_output[index*ItemsPerThread])); } @@ -322,10 +322,10 @@ TYPED_TEST(RocprimBlockShuffleTests, BlockDown) HIP_CHECK(hipSetDevice(device_id)); using type = typename TestFixture::type; - const size_t block_size = TestFixture::block_size; - const size_t size = block_size * 11; - const size_t grid_size = size / block_size; - constexpr unsigned int ItemsPerThread = 128; + static constexpr size_t block_size = TestFixture::block_size; + static constexpr size_t size = block_size * 11; + static constexpr size_t grid_size = size / block_size; + static constexpr unsigned int ItemsPerThread = 128; for (size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) { unsigned int seed_value = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; @@ -343,8 +343,8 @@ TYPED_TEST(RocprimBlockShuffleTests, BlockDown) type * device_output; - HIP_CHECK(hipMalloc(&device_input, input_data.size() * sizeof(type))); - HIP_CHECK(hipMalloc(&device_output, input_data.size() * sizeof(type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&device_input), input_data.size() * sizeof(type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&device_output), input_data.size() * sizeof(type))); diff --git a/test/rocprim/test_block_sort.cpp b/test/rocprim/test_block_sort.cpp index bb4853ac2..712c15439 100644 --- a/test/rocprim/test_block_sort.cpp +++ b/test/rocprim/test_block_sort.cpp @@ -48,7 +48,7 @@ __global__ __launch_bounds__(BlockSize) void sort_key_kernel(key_type * device_key_output) { - const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; + const unsigned int index = (blockIdx.x * BlockSize) + threadIdx.x; key_type key = device_key_output[index]; rocprim::block_sort bsort; bsort.sort(key); @@ -63,7 +63,7 @@ TYPED_TEST(RocprimBlockSortTests, SortKey) using key_type = typename TestFixture::key_type; using binary_op_type = typename std::conditional::value, test_utils::half_less, rocprim::less>::type; - const size_t block_size = TestFixture::block_size; + static constexpr size_t block_size = TestFixture::block_size; const size_t size = block_size * 1134; const size_t grid_size = size / block_size; @@ -144,7 +144,7 @@ __global__ __launch_bounds__(BlockSize) void sort_key_value_kernel(key_type * device_key_output, value_type * device_value_output) { - const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; + const unsigned int index = (blockIdx.x * BlockSize) + threadIdx.x; key_type key = device_key_output[index]; value_type value = device_value_output[index]; rocprim::block_sort bsort; @@ -163,9 +163,9 @@ TYPED_TEST(RocprimBlockSortTests, SortKeyValue) using 
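// Illustrative sketch, not from the patch itself: the hipMalloc calls above now route the
// pointer through a reinterpret_cast whose template argument is not visible in this
// rendering; since hipMalloc's strict signature is hipError_t hipMalloc(void**, size_t),
// the intended cast is presumably reinterpret_cast<void**>, as in:
#include <cstddef>
#include <hip/hip_runtime.h>

inline float* alloc_floats(std::size_t count)
{
    float* device_ptr = nullptr;
    // Cast float** to void** explicitly instead of relying on the templated overload.
    (void)hipMalloc(reinterpret_cast<void**>(&device_ptr), count * sizeof(float));
    return device_ptr;
}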
value_type = typename TestFixture::value_type; using value_op_type = typename std::conditional::value, test_utils::half_less, rocprim::less>::type; using eq_op_type = typename std::conditional::value, test_utils::half_equal_to, rocprim::equal_to>::type; - const size_t block_size = TestFixture::block_size; - const size_t size = block_size * 1134; - const size_t grid_size = size / block_size; + static constexpr size_t block_size = TestFixture::block_size; + static constexpr size_t size = block_size * 1134; + static constexpr size_t grid_size = size / block_size; for (size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) { @@ -288,7 +288,7 @@ __global__ __launch_bounds__(BlockSize) void custom_sort_key_value_kernel(key_type * device_key_output, value_type * device_value_output) { - const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; + const unsigned int index = (blockIdx.x * BlockSize) + threadIdx.x; key_type key = device_key_output[index]; value_type value = device_value_output[index]; rocprim::block_sort bsort; @@ -307,9 +307,9 @@ TYPED_TEST(RocprimBlockSortTests, CustomSortKeyValue) using value_type = typename TestFixture::value_type; using value_op_type = typename std::conditional::value, test_utils::half_less, rocprim::less>::type; using eq_op_type = typename std::conditional::value, test_utils::half_equal_to, rocprim::equal_to>::type; - const size_t block_size = TestFixture::block_size; - const size_t size = block_size * 1134; - const size_t grid_size = size / block_size; + static constexpr size_t block_size = TestFixture::block_size; + static constexpr size_t size = block_size * 1134; + static constexpr size_t grid_size = size / block_size; for (size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) { diff --git a/test/rocprim/test_constant_iterator.cpp b/test/rocprim/test_constant_iterator.cpp index e3e970b3e..45957eaef 100644 --- a/test/rocprim/test_constant_iterator.cpp +++ b/test/rocprim/test_constant_iterator.cpp @@ -123,7 +123,8 @@ TYPED_TEST(RocprimConstantIteratorTests, Transform) } else if(std::is_floating_point::value) { - auto tolerance = std::max(std::abs(0.1f * expected[i]), T(test_utils::precision_threshold::percentage)); + float percentage = test_utils::precision_threshold::percentage; + auto tolerance = std::max(std::abs(0.1f * (float)expected[i]), (float)percentage); ASSERT_NEAR(output[i], expected[i], tolerance) << "where index = " << i; } } diff --git a/test/rocprim/test_device_binary_search.cpp b/test/rocprim/test_device_binary_search.cpp index 22449b1c7..52151028f 100644 --- a/test/rocprim/test_device_binary_search.cpp +++ b/test/rocprim/test_device_binary_search.cpp @@ -106,7 +106,7 @@ TYPED_TEST(RocprimDeviceBinarySearch, LowerBound) SCOPED_TRACE(testing::Message() << "with size = " << size); const size_t haystack_size = size; - const size_t needles_size = std::sqrt(size); + const size_t needles_size = (size_t)std::sqrt(size); // cast promises no data loss, silences warning const size_t d = haystack_size / 100; // Generate data @@ -216,7 +216,7 @@ TYPED_TEST(RocprimDeviceBinarySearch, UpperBound) for (size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) { - unsigned int seed_value = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + seed_type seed_value = seed_index < random_seeds_count ? 
rand() : seeds[seed_index - random_seeds_count]; SCOPED_TRACE(testing::Message() << "with seed= " << seed_value); for(size_t size : get_sizes(seed_value)) @@ -228,7 +228,7 @@ TYPED_TEST(RocprimDeviceBinarySearch, UpperBound) } SCOPED_TRACE(testing::Message() << "with size = " << size); const size_t haystack_size = size; - const size_t needles_size = std::sqrt(size); + const size_t needles_size = (size_t)std::sqrt(size); // cast promises no data loss, silences warning const size_t d = haystack_size / 100; // Generate data @@ -351,7 +351,7 @@ TYPED_TEST(RocprimDeviceBinarySearch, BinarySearch) SCOPED_TRACE(testing::Message() << "with size = " << size); const size_t haystack_size = size; - const size_t needles_size = std::sqrt(size); + const size_t needles_size = (size_t)std::sqrt(size); // cast promises no data loss, silences warning const size_t d = haystack_size / 100; // Generate data diff --git a/test/rocprim/test_device_histogram.cpp b/test/rocprim/test_device_histogram.cpp index ff535e283..6a01cb639 100644 --- a/test/rocprim/test_device_histogram.cpp +++ b/test/rocprim/test_device_histogram.cpp @@ -221,7 +221,7 @@ TYPED_TEST(RocprimDeviceHistogramEven, Even) const level_type s = static_cast(sample); if(s >= lower_level && s < upper_level) { - const int bin = (s - lower_level) / scale; + const level_type bin = (s - lower_level) / scale; histogram_expected[bin]++; } } @@ -235,7 +235,7 @@ TYPED_TEST(RocprimDeviceHistogramEven, Even) HIP_CHECK( rocprim::histogram_even( nullptr, temporary_storage_bytes, - d_input, columns, + d_input, static_cast(columns), d_histogram, bins + 1, lower_level, upper_level, stream, debug_synchronous diff --git a/test/rocprim/test_device_merge.cpp b/test/rocprim/test_device_merge.cpp index b60972417..37d87d887 100644 --- a/test/rocprim/test_device_merge.cpp +++ b/test/rocprim/test_device_merge.cpp @@ -130,7 +130,7 @@ TYPED_TEST(RocprimDeviceMergeTests, MergeKey) std::vector keys_input2 = test_utils::get_random_data(size2, 0, size2, seed_value); std::sort(keys_input1.begin(), keys_input1.end(), compare_op); std::sort(keys_input2.begin(), keys_input2.end(), compare_op); - std::vector keys_output(size1 + size2, 0); + std::vector keys_output(size1 + size2, (key_type)0); // Calculate expected results on host std::vector expected(keys_output.size()); @@ -276,8 +276,8 @@ TYPED_TEST(RocprimDeviceMergeTests, MergeKeyValue) std::vector values_input2(size2); std::iota(values_input1.begin(), values_input1.end(), 0); std::iota(values_input2.begin(), values_input2.end(), size1); - std::vector keys_output(size1 + size2, 0); - std::vector values_output(size1 + size2, 0); + std::vector keys_output(size1 + size2, (key_type)0); + std::vector values_output(size1 + size2, (value_type)0); // Calculate expected results on host std::vector vector1(size1); diff --git a/test/rocprim/test_device_partition.cpp b/test/rocprim/test_device_partition.cpp index 6c981921a..531ef7baa 100644 --- a/test/rocprim/test_device_partition.cpp +++ b/test/rocprim/test_device_partition.cpp @@ -396,11 +396,11 @@ TYPED_TEST(RocprimDevicePartitionTests, Predicate) { if(select_op(input[i])) { - expected_selected.push_back(input[i]); + expected_selected.push_back((U)input[i]); } else { - expected_rejected.push_back(input[i]); + expected_rejected.push_back((U)input[i]); } } std::reverse(expected_rejected.begin(), expected_rejected.end()); diff --git a/test/rocprim/test_device_radix_sort.cpp b/test/rocprim/test_device_radix_sort.cpp index 238aec2d8..3cf70fdf8 100644 --- a/test/rocprim/test_device_radix_sort.cpp 
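// Illustrative sketch, not from the patch itself: std::sqrt returns double, so storing
// the result in a size_t is a narrowing conversion; the explicit (size_t) cast added in
// the binary search tests documents that the truncation is intended and silences the
// warning, roughly equivalent to:
#include <cmath>
#include <cstddef>

inline std::size_t needles_for(std::size_t haystack_size)
{
    return static_cast<std::size_t>(std::sqrt(static_cast<double>(haystack_size)));
}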
+++ b/test/rocprim/test_device_radix_sort.cpp @@ -27,6 +27,7 @@ // required test headers #include "test_utils_types.hpp" +#include "test_sort_comparator.hpp" template< class Key, @@ -84,48 +85,6 @@ typedef ::testing::Types< TYPED_TEST_SUITE(RocprimDeviceRadixSort, Params); -template -struct key_comparator -{ - static_assert(rocprim::is_unsigned::value, "Test supports start and end bits only for unsigned integers"); - - bool operator()(const Key& lhs, const Key& rhs) - { - auto mask = (1ull << (EndBit - StartBit)) - 1; - auto l = (static_cast(lhs) >> StartBit) & mask; - auto r = (static_cast(rhs) >> StartBit) & mask; - return Descending ? (r < l) : (l < r); - } -}; - -template -struct key_comparator -{ - bool operator()(const Key& lhs, const Key& rhs) - { - return Descending ? (rhs < lhs) : (lhs < rhs); - } -}; - -template -struct key_comparator -{ - bool operator()(const rocprim::half& lhs, const rocprim::half& rhs) - { - // HIP's half doesn't have __host__ comparison operators, use floats instead - return key_comparator()(lhs, rhs); - } -}; - -template -struct key_value_comparator -{ - bool operator()(const std::pair& lhs, const std::pair& rhs) - { - return key_comparator()(lhs.first, rhs.first); - } -}; - std::vector get_sizes(int seed_value) { std::vector sizes = { 0, 1, 10, 53, 211, 1024, 2345, 4096, 34567, (1 << 16) - 1220, (1 << 23) - 76543 }; diff --git a/test/rocprim/test_device_reduce.cpp b/test/rocprim/test_device_reduce.cpp index 95c0902d9..836098029 100644 --- a/test/rocprim/test_device_reduce.cpp +++ b/test/rocprim/test_device_reduce.cpp @@ -174,7 +174,7 @@ TYPED_TEST(RocprimDeviceReduceTests, Reduce) // Generate data std::vector input = test_utils::get_random_data(size, 1, 100, seed_value); - std::vector output(1, 0); + std::vector output(1, (U)0); // reduce function binary_op_type plus_op; @@ -278,7 +278,7 @@ TYPED_TEST(RocprimDeviceReduceTests, ReduceMinimum) // Generate data std::vector input = test_utils::get_random_data(size, 1, 100, seed_value); - std::vector output(1, 0); + std::vector output(1, (U)0); T * d_input; U * d_output; @@ -415,8 +415,8 @@ TYPED_TEST(RocprimDeviceReduceTests, ReduceArgMinimum) std::vector input(size); for (size_t i = 0; i < size; i++) { - input[i].key = i; - input[i].value = test_utils::get_random_value(1, 100, seed_value); + input[i].key = (int)i; + input[i].value = test_utils::get_random_data(1, 1, 100, seed_value)[0]; } std::vector output(1); diff --git a/test/rocprim/test_device_reduce_by_key.cpp b/test/rocprim/test_device_reduce_by_key.cpp index bcfeedfdc..cb5cf38a1 100644 --- a/test/rocprim/test_device_reduce_by_key.cpp +++ b/test/rocprim/test_device_reduce_by_key.cpp @@ -132,7 +132,14 @@ TYPED_TEST(RocprimDeviceReduceByKey, ReduceByKey) using key_distribution_type = typename std::conditional< std::is_floating_point::value, std::uniform_real_distribution, - std::uniform_int_distribution + typename std::conditional< + test_utils::is_valid_for_int_distribution::value, + std::uniform_int_distribution, + typename std::conditional::value, + std::uniform_int_distribution, + std::uniform_int_distribution + >::type + >::type >::type; constexpr bool use_identity_iterator = TestFixture::params::use_identity_iterator; diff --git a/test/rocprim/test_device_run_length_encode.cpp b/test/rocprim/test_device_run_length_encode.cpp index c3125a15d..986167478 100644 --- a/test/rocprim/test_device_run_length_encode.cpp +++ b/test/rocprim/test_device_run_length_encode.cpp @@ -104,7 +104,14 @@ TYPED_TEST(RocprimDeviceRunLengthEncode, Encode) using 
key_distribution_type = typename std::conditional< std::is_floating_point::value, std::uniform_real_distribution, - std::uniform_int_distribution + typename std::conditional< + test_utils::is_valid_for_int_distribution::value, + std::uniform_int_distribution, + typename std::conditional::value, + std::uniform_int_distribution, + std::uniform_int_distribution + >::type + >::type >::type; constexpr bool use_identity_iterator = TestFixture::params::use_identity_iterator; @@ -159,7 +166,7 @@ TYPED_TEST(RocprimDeviceRunLengthEncode, Encode) unique_expected.push_back(current_key); runs_count_expected++; - counts_expected.push_back(key_count); + counts_expected.push_back(static_cast(key_count)); offset += key_count; } @@ -268,7 +275,14 @@ TYPED_TEST(RocprimDeviceRunLengthEncode, NonTrivialRuns) using key_distribution_type = typename std::conditional< std::is_floating_point::value, std::uniform_real_distribution, - std::uniform_int_distribution + typename std::conditional< + test_utils::is_valid_for_int_distribution::value, + std::uniform_int_distribution, + typename std::conditional::value, + std::uniform_int_distribution, + std::uniform_int_distribution + >::type + >::type >::type; constexpr bool use_identity_iterator = TestFixture::params::use_identity_iterator; @@ -333,9 +347,9 @@ TYPED_TEST(RocprimDeviceRunLengthEncode, NonTrivialRuns) if(key_count > 1) { - offsets_expected.push_back(offset); + offsets_expected.push_back(static_cast(offset)); runs_count_expected++; - counts_expected.push_back(key_count); + counts_expected.push_back(static_cast(key_count)); } offset += key_count; diff --git a/test/rocprim/test_device_segmented_radix_sort.cpp b/test/rocprim/test_device_segmented_radix_sort.cpp index 513e81812..6a462dec0 100644 --- a/test/rocprim/test_device_segmented_radix_sort.cpp +++ b/test/rocprim/test_device_segmented_radix_sort.cpp @@ -27,6 +27,7 @@ // required test headers #include "test_utils_types.hpp" +#include "test_sort_comparator.hpp" template< class Key, @@ -83,48 +84,6 @@ typedef ::testing::Types< TYPED_TEST_SUITE(RocprimDeviceSegmentedRadixSort, Params); -template -struct key_comparator -{ - static_assert(rocprim::is_unsigned::value, "Test supports start and end bits only for unsigned integers"); - - bool operator()(const Key& lhs, const Key& rhs) - { - auto mask = (1ull << (EndBit - StartBit)) - 1; - auto l = (static_cast(lhs) >> StartBit) & mask; - auto r = (static_cast(rhs) >> StartBit) & mask; - return Descending ? (r < l) : (l < r); - } -}; - -template -struct key_comparator -{ - bool operator()(const Key& lhs, const Key& rhs) - { - return Descending ? 
(rhs < lhs) : (lhs < rhs); - } -}; - -template -struct key_comparator -{ - bool operator()(const rocprim::half& lhs, const rocprim::half& rhs) - { - // HIP's half doesn't have __host__ comparison operators, use floats instead - return key_comparator()(lhs, rhs); - } -}; - -template -struct key_value_comparator -{ - bool operator()(const std::pair& lhs, const std::pair& rhs) - { - return key_comparator()(lhs.first, rhs.first); - } -}; - std::vector get_sizes(int seed_value) { std::vector sizes = { @@ -146,9 +105,9 @@ TYPED_TEST(RocprimDeviceSegmentedRadixSort, SortKeys) HIP_CHECK(hipSetDevice(device_id)); using key_type = typename TestFixture::params::key_type; - constexpr bool descending = TestFixture::params::descending; - constexpr unsigned int start_bit = TestFixture::params::start_bit; - constexpr unsigned int end_bit = TestFixture::params::end_bit; + static constexpr bool descending = TestFixture::params::descending; + static constexpr unsigned int start_bit = TestFixture::params::start_bit; + static constexpr unsigned int end_bit = TestFixture::params::end_bit; using offset_type = unsigned int; diff --git a/test/rocprim/test_device_segmented_reduce.cpp b/test/rocprim/test_device_segmented_reduce.cpp index 3feda27ff..7f6f6b792 100644 --- a/test/rocprim/test_device_segmented_reduce.cpp +++ b/test/rocprim/test_device_segmented_reduce.cpp @@ -109,7 +109,7 @@ TYPED_TEST(RocprimDeviceSegmentedReduce, Reduce) using result_type = output_type; using offset_type = unsigned int; - const input_type init = TestFixture::params::init; + const input_type init = (input_type)TestFixture::params::init; const bool debug_synchronous = false; reduce_op_type reduce_op; diff --git a/test/rocprim/test_device_select.cpp b/test/rocprim/test_device_select.cpp index 57a1e18de..2c5cab4a3 100644 --- a/test/rocprim/test_device_select.cpp +++ b/test/rocprim/test_device_select.cpp @@ -405,7 +405,7 @@ TYPED_TEST(RocprimDeviceSelectTests, UniqueEmptyInput) rocprim::unique( nullptr, temp_storage_size_bytes, - rocprim::make_constant_iterator(123), + rocprim::make_constant_iterator((T)123), rocprim::make_discard_iterator(), d_selected_count_output, 0, @@ -423,7 +423,7 @@ TYPED_TEST(RocprimDeviceSelectTests, UniqueEmptyInput) rocprim::unique( d_temp_storage, temp_storage_size_bytes, - rocprim::make_constant_iterator(123), + rocprim::make_constant_iterator((T)123), rocprim::make_discard_iterator(), d_selected_count_output, 0, @@ -658,10 +658,10 @@ TEST(RocprimDeviceSelectTests, UniqueGuardedOperator) F * d_flag; U * d_output; unsigned int * d_selected_count_output; - HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T))); - HIP_CHECK(hipMalloc(&d_flag, input_flag.size() * sizeof(F))); - HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(U))); - HIP_CHECK(hipMalloc(&d_selected_count_output, sizeof(unsigned int))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), input.size() * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_flag), input_flag.size() * sizeof(F))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), input.size() * sizeof(U))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_selected_count_output), sizeof(unsigned int))); HIP_CHECK( hipMemcpy( d_input, input.data(), diff --git a/test/rocprim/test_device_transform.cpp b/test/rocprim/test_device_transform.cpp index 34f86c3f4..a91c69087 100644 --- a/test/rocprim/test_device_transform.cpp +++ b/test/rocprim/test_device_transform.cpp @@ -141,7 +141,7 @@ TYPED_TEST(RocprimDeviceTransformTests, Transform) // Generate data std::vector input = 
test_utils::get_random_data(size, 1, 100, seed_value); - std::vector output(input.size(), 0); + std::vector output(input.size(), (U)0); T * d_input; U * d_output; @@ -247,7 +247,7 @@ TYPED_TEST(RocprimDeviceTransformTests, BinaryTransform) // Generate data std::vector input1 = test_utils::get_random_data(size, 1, 100, seed_value); std::vector input2 = test_utils::get_random_data(size, 1, 100, seed_value); - std::vector output(input1.size(), 0); + std::vector output(input1.size(), (U)0); T1 * d_input1; T2 * d_input2; diff --git a/test/rocprim/test_intrinsics.cpp b/test/rocprim/test_intrinsics.cpp index a33e2b681..4d5391231 100644 --- a/test/rocprim/test_intrinsics.cpp +++ b/test/rocprim/test_intrinsics.cpp @@ -52,7 +52,7 @@ inline bool operator==(const custom_notaligned& lhs, } // Custom structure aligned to 16 bytes -struct custom_16aligned +struct alignas(16) custom_16aligned { int i; unsigned int u; @@ -62,7 +62,7 @@ struct custom_16aligned custom_16aligned() {}; ROCPRIM_HOST_DEVICE ~custom_16aligned() {}; -} __attribute__((aligned(16))); +}; inline ROCPRIM_HOST_DEVICE bool operator==(const custom_16aligned& lhs, const custom_16aligned& rhs) @@ -98,7 +98,7 @@ __global__ __launch_bounds__(ROCPRIM_DEFAULT_MAX_BLOCK_SIZE) void shuffle_up_kernel(T* data, unsigned int delta, unsigned int width) { - const unsigned int index = (hipBlockIdx_x * hipBlockDim_x) + hipThreadIdx_x; + const unsigned int index = (blockIdx.x * blockDim.x) + threadIdx.x; T value = data[index]; value = rocprim::warp_shuffle_up(value, delta, width); data[index] = value; @@ -201,7 +201,7 @@ __global__ __launch_bounds__(ROCPRIM_DEFAULT_MAX_BLOCK_SIZE) void shuffle_down_kernel(T* data, unsigned int delta, unsigned int width) { - const unsigned int index = (hipBlockIdx_x * hipBlockDim_x) + hipThreadIdx_x; + const unsigned int index = (blockIdx.x * blockDim.x) + threadIdx.x; T value = data[index]; value = rocprim::warp_shuffle_down(value, delta, width); data[index] = value; @@ -304,10 +304,10 @@ __global__ __launch_bounds__(ROCPRIM_DEFAULT_MAX_BLOCK_SIZE) void shuffle_index_kernel(T* data, int* src_lanes, unsigned int width) { - const unsigned int index = (hipBlockIdx_x * hipBlockDim_x) + hipThreadIdx_x; + const unsigned int index = (blockIdx.x * blockDim.x) + threadIdx.x; T value = data[index]; value = rocprim::warp_shuffle( - value, src_lanes[hipThreadIdx_x/width], width + value, src_lanes[threadIdx.x/width], width ); data[index] = value; } @@ -436,10 +436,10 @@ TEST(RocprimIntrinsicsTests, ShuffleUpCustomStruct) std::vector output(input.size()); for(size_t i = 0; i < 4 * input.size(); i+=4) { - input[i/4].i = random_data[i]; + input[i/4].i = (short)random_data[i]; input[i/4].d = random_data[i+1]; - input[i/4].f = random_data[i+2]; - input[i/4].u = random_data[i+3]; + input[i/4].f = (float)random_data[i+2]; + input[i/4].u = (unsigned int)random_data[i+3]; } T* device_data; @@ -536,9 +536,9 @@ TEST(RocprimIntrinsicsTests, ShuffleUpCustomAlignedStruct) std::vector output(input.size()); for(size_t i = 0; i < 3 * input.size(); i+=3) { - input[i/3].i = random_data[i]; - input[i/3].u = random_data[i+1]; - input[i/3].f = random_data[i+2]; + input[i/3].i = (int)random_data[i]; + input[i/3].u = (unsigned int)random_data[i+1]; + input[i/3].f = (float)random_data[i+2]; } T* device_data; diff --git a/test/rocprim/test_seed.hpp b/test/rocprim/test_seed.hpp index c968d94e1..f1eeb10d5 100644 --- a/test/rocprim/test_seed.hpp +++ b/test/rocprim/test_seed.hpp @@ -21,8 +21,11 @@ #ifndef TEST_SEED_HPP_ #define TEST_SEED_HPP_ -static constexpr 
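// Illustrative sketch, not from the patch itself: the test_intrinsics hunk above swaps
// the GCC-specific __attribute__((aligned(16))) for standard alignas(16), which MSVC
// also accepts; the alignment requirement itself is unchanged and can be checked
// statically, e.g.:
struct alignas(16) aligned_like_custom_16aligned
{
    int i;
    unsigned int u;
    float f;
};
static_assert(alignof(aligned_like_custom_16aligned) == 16, "expected 16-byte alignment");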
int random_seeds_count = 1; -static constexpr unsigned int seeds [] = {0, 1997132004}; +using engine_type = std::default_random_engine; +using seed_type = typename engine_type::result_type; + +static constexpr size_t random_seeds_count = 1; +static constexpr seed_type seeds [] = {0, 1997132004}; static constexpr size_t seed_size = sizeof(seeds) / sizeof(seeds[0]); #endif // TEST_SEED_HPP_ diff --git a/test/rocprim/test_sort_comparator.hpp b/test/rocprim/test_sort_comparator.hpp new file mode 100644 index 000000000..118faa7e1 --- /dev/null +++ b/test/rocprim/test_sort_comparator.hpp @@ -0,0 +1,177 @@ +// MIT License +// +// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#ifndef TEST_SORT_COMPARATOR_HPP_ +#define TEST_SORT_COMPARATOR_HPP_ + +#include + +// Original C++17 logic +// +//template +//struct key_comparator +//{ +// bool operator()(const Key& lhs, const Key& rhs) +// { +// if constexpr (rocprim::is_unsigned::value) +// { +// if constexpr (StartBit == 0 && (EndBit == sizeof(Key) * 8)) +// { +// return Descending ? (rhs < lhs) : (lhs < rhs); +// } +// else +// { +// auto mask = (1ull << (EndBit - StartBit)) - 1; +// auto l = (static_cast(lhs) >> StartBit) & mask; +// auto r = (static_cast(rhs) >> StartBit) & mask; +// return Descending ? (r < l) : (l < r); +// } +// } +// else +// { +// if constexpr (std::is_same_v) +// { +// float l = static_cast(lhs); +// float r = static_cast(rhs); +// return Descending ? (r < l) : (l < r); +// } +// else +// { +// return Descending ? (rhs < lhs) : (lhs < rhs); +// } +// } +// } +//}; + +// Faulty C++14 backported logic (consider fixing) +// +//template +//bool generic_key_compare(const Key& lhs, const Key& rhs) { return Descending ? (rhs < lhs) : (lhs < rhs); } +// +//template +//auto discriminate_bits(const Key& lhs, const Key& rhs) -> typename std::enable_if::type +//{ +// // TODO: pick adequately sized integral type (instead of 1ull) based on Key. 
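// Illustrative sketch, not from the patch itself: with the test_seed.hpp change above, a
// seed is now an engine_type::result_type and seeds the engine directly (no
// std::random_device involved); the per-test seed loop presumably consumes it like this:
#include <cstddef>
#include <cstdlib>
#include <random>

using engine_type = std::default_random_engine;
using seed_type = engine_type::result_type;

inline void for_each_seed()
{
    static constexpr std::size_t random_seeds_count = 1;
    static constexpr seed_type seeds[] = {0, 1997132004};
    static constexpr std::size_t seed_size = sizeof(seeds) / sizeof(seeds[0]);

    for(std::size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++)
    {
        // The first random_seeds_count iterations use rand(), the rest use the fixed seeds.
        const seed_type seed_value = seed_index < random_seeds_count
            ? static_cast<seed_type>(rand())
            : seeds[seed_index - random_seeds_count];
        engine_type gen{seed_value};
        (void)gen; // generate the test data from gen here
    }
}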
+// // Needed to safely silence "'argument': conversion from 'unsigned __int64' to 'const Key', possible loss of data" +// auto mask = (1ull << (EndBit - StartBit)) - 1; +// auto l = (static_cast(lhs) >> StartBit) & mask; +// auto r = (static_cast(rhs) >> StartBit) & mask; +// return generic_key_compare(l, r); +//} +// +//template +//auto discriminate_bits(const Key& lhs, const Key& rhs) -> typename std::enable_if::type +//{ +// return generic_key_compare(lhs, rhs); +//} +// +//template +//auto discriminate_half(const Key& lhs, const Key& rhs) -> typename std::enable_if::value, bool>::type +//{ +// // HIP's half doesn't have __host__ comparison operators, use floats instead +// return generic_key_compare((float)lhs, (float)rhs); +//} +// +//template +//auto discriminate_half(const Key& lhs, const Key& rhs) -> typename std::enable_if::value, bool>::type +//{ +// return generic_key_compare(lhs, rhs); +//} +// +//template +//auto discriminate_unsigned(const Key& lhs, const Key& rhs) -> typename std::enable_if::value, bool>::type +//{ +// return discriminate_bits(lhs, rhs); +//} +// +//template +//auto discriminate_unsigned(const Key& lhs, const Key& rhs) -> typename std::enable_if::value, bool>::type +//{ +// return discriminate_half(lhs, rhs); +//} +// +//template +//struct key_comparator +//{ +// bool operator()(const Key& lhs, const Key& rhs) +// { +// return discriminate_unsigned(lhs, rhs); +// } +//}; +// +//template +//struct key_value_comparator +//{ +// bool operator()(const std::pair& lhs, const std::pair& rhs) +// { +// return key_comparator()(lhs.first, rhs.first); +// } +//}; + +// Original code with ISO-conforming overload control +// +// NOTE: ShiftLess helper is needed, because partial specializations cannot refer to the free template args. +// See: https://stackoverflow.com/questions/2615905/c-template-nontype-parameter-arithmetic + +template +struct key_comparator +{ + static_assert(rocprim::is_unsigned::value, "Test supports start and end bits only for unsigned integers"); + + bool operator()(const Key& lhs, const Key& rhs) + { + auto mask = (1ull << (EndBit - StartBit)) - 1; + auto l = (static_cast(lhs) >> StartBit) & mask; + auto r = (static_cast(rhs) >> StartBit) & mask; + return Descending ? (r < l) : (l < r); + } +}; + +template +struct key_comparator +{ + bool operator()(const Key& lhs, const Key& rhs) + { + return Descending ? 
(rhs < lhs) : (lhs < rhs); + } +}; + +template +struct key_comparator +{ + bool operator()(const rocprim::half& lhs, const rocprim::half& rhs) + { + // HIP's half doesn't have __host__ comparison operators, use floats instead + return key_comparator()(lhs, rhs); + } +}; + +template +struct key_value_comparator +{ + bool operator()(const std::pair& lhs, const std::pair& rhs) + { + return key_comparator()(lhs.first, rhs.first); + } +}; + +#endif // TEST_SORT_COMPARATOR_HPP_ diff --git a/test/rocprim/test_texture_cache_iterator.cpp b/test/rocprim/test_texture_cache_iterator.cpp index 0fa927b68..3613eb0bc 100644 --- a/test/rocprim/test_texture_cache_iterator.cpp +++ b/test/rocprim/test_texture_cache_iterator.cpp @@ -89,7 +89,7 @@ TYPED_TEST(RocprimTextureCacheIteratorTests, Transform) for(size_t i = 0; i < size; i++) { - input[i] = T(test_utils::get_random_value(1, 200, seed_value)); + input[i] = test_utils::get_random_value(1, 200, seed_value); } std::vector output(size); diff --git a/test/rocprim/test_thread.cpp b/test/rocprim/test_thread.cpp index b32504cd1..72dd5b557 100644 --- a/test/rocprim/test_thread.cpp +++ b/test/rocprim/test_thread.cpp @@ -86,10 +86,10 @@ void flat_id_kernel(unsigned int* device_output) TYPED_TEST(RocprimThreadTests, FlatBlockThreadID) { using Type = unsigned int; - constexpr size_t block_size_x = TestFixture::params::block_size_x; - constexpr size_t block_size_y = TestFixture::params::block_size_y; - constexpr size_t block_size_z = TestFixture::params::block_size_z; - constexpr size_t block_size = block_size_x * block_size_y * block_size_z; + static constexpr size_t block_size_x = TestFixture::params::block_size_x; + static constexpr size_t block_size_y = TestFixture::params::block_size_y; + static constexpr size_t block_size_z = TestFixture::params::block_size_z; + static constexpr size_t block_size = block_size_x * block_size_y * block_size_z; // Given block size not supported if(block_size > test_utils::get_max_block_size() || (block_size & (block_size - 1)) != 0) { @@ -151,7 +151,7 @@ __launch_bounds__(1024) void block_id_kernel(unsigned int* device_output) { unsigned int block_id = rocprim::flat_block_id(); - if(hipThreadIdx_x) + if(threadIdx.x) { device_output[block_id] = block_id; } @@ -160,10 +160,10 @@ void block_id_kernel(unsigned int* device_output) TYPED_TEST(RocprimThreadTests, FlatBlockID) { using Type = unsigned int; - constexpr size_t block_size_x = TestFixture::params::block_size_x; - constexpr size_t block_size_y = TestFixture::params::block_size_y; - constexpr size_t block_size_z = TestFixture::params::block_size_z; - constexpr size_t block_size = block_size_x * block_size_y * block_size_z; + static constexpr size_t block_size_x = TestFixture::params::block_size_x; + static constexpr size_t block_size_y = TestFixture::params::block_size_y; + static constexpr size_t block_size_z = TestFixture::params::block_size_z; + static constexpr size_t block_size = block_size_x * block_size_y * block_size_z; const size_t size = block_size * block_size; const auto grid_size = size / block_size; diff --git a/test/rocprim/test_thread_algos.cpp b/test/rocprim/test_thread_algos.cpp index edcfe647d..48b14109a 100644 --- a/test/rocprim/test_thread_algos.cpp +++ b/test/rocprim/test_thread_algos.cpp @@ -65,16 +65,16 @@ template __global__ void thread_load_kernel(Type* volatile const device_input, Type* device_output) { - size_t index = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + size_t index = blockIdx.x * blockDim.x + threadIdx.x; device_output[index] = 
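// Illustrative sketch, not from the patch itself: the comparator extracted into
// test_sort_comparator.hpp is used by the radix sort tests to build host-side reference
// results; the template parameter order shown here (Key, Descending, StartBit, EndBit)
// is an assumption reconstructed from the bit-masking logic, since the angle brackets
// are not visible in this rendering.
#include <algorithm>
#include <cstdint>
#include <vector>
// #include "test_sort_comparator.hpp" // provides key_comparator

inline void sort_expected_keys(std::vector<std::uint32_t>& expected)
{
    // Ascending order, comparing only bits [4, 20) of each 32-bit key.
    std::stable_sort(expected.begin(), expected.end(),
                     key_comparator<std::uint32_t, false, 4, 20>{});
}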
rocprim::thread_load(device_input + index); } TYPED_TEST(RocprimThreadOperationTests, Load) { using T = typename TestFixture::type; - constexpr uint32_t block_size = 256; - constexpr uint32_t grid_size = 128; - constexpr uint32_t size = block_size * grid_size; + static constexpr uint32_t block_size = 256; + static constexpr uint32_t grid_size = 128; + static constexpr uint32_t size = block_size * grid_size; for (size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) { @@ -90,9 +90,9 @@ TYPED_TEST(RocprimThreadOperationTests, Load) // Preparing device T* device_input; - HIP_CHECK(hipMalloc(&device_input, input.size() * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&device_input), input.size() * sizeof(T))); T* device_output; - HIP_CHECK(hipMalloc(&device_output, output.size() * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&device_output), output.size() * sizeof(T))); HIP_CHECK( hipMemcpy( @@ -102,7 +102,11 @@ TYPED_TEST(RocprimThreadOperationTests, Load) ) ); - thread_load_kernel<<>>(device_input, device_output); + hipLaunchKernelGGL( + HIP_KERNEL_NAME(thread_load_kernel), + grid_size, block_size, 0, 0, + device_input, device_output + ); // Reading results back HIP_CHECK( @@ -128,16 +132,16 @@ template __global__ void thread_store_kernel(Type* const device_input, Type* device_output) { - size_t index = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + size_t index = blockIdx.x * blockDim.x + threadIdx.x; rocprim::thread_store(device_output + index, device_input[index]); } TYPED_TEST(RocprimThreadOperationTests, Store) { using T = typename TestFixture::type; - constexpr uint32_t block_size = 256; - constexpr uint32_t grid_size = 128; - constexpr uint32_t size = block_size * grid_size; + static constexpr uint32_t block_size = 256; + static constexpr uint32_t grid_size = 128; + static constexpr uint32_t size = block_size * grid_size; for (size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) { @@ -153,9 +157,9 @@ TYPED_TEST(RocprimThreadOperationTests, Store) // Preparing device T* device_input; - HIP_CHECK(hipMalloc(&device_input, input.size() * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&device_input), input.size() * sizeof(T))); T* device_output; - HIP_CHECK(hipMalloc(&device_output, output.size() * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&device_output), output.size() * sizeof(T))); HIP_CHECK( hipMemcpy( @@ -165,7 +169,11 @@ TYPED_TEST(RocprimThreadOperationTests, Store) ) ); - thread_store_kernel<<>>(device_input, device_output); + hipLaunchKernelGGL( + HIP_KERNEL_NAME(thread_store_kernel), + grid_size, block_size, 0, 0, + device_input, device_output + ); // Reading results back HIP_CHECK( @@ -201,18 +209,18 @@ template __global__ void thread_reduce_kernel(Type* const device_input, Type* device_output) { - size_t input_index = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * Length; - size_t output_index = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * Length; + size_t input_index = (blockIdx.x * blockDim.x + threadIdx.x) * Length; + size_t output_index = (blockIdx.x * blockDim.x + threadIdx.x) * Length; device_output[output_index] = rocprim::thread_reduce(&device_input[input_index], sum_op()); } TYPED_TEST(RocprimThreadOperationTests, Reduction) { using T = typename TestFixture::type; - constexpr uint32_t length = 4; - constexpr uint32_t block_size = 128 / length; - constexpr uint32_t grid_size = 128; - constexpr uint32_t size = block_size * grid_size * length; + static constexpr uint32_t 
length = 4; + static constexpr uint32_t block_size = 128 / length; + static constexpr uint32_t grid_size = 128; + static constexpr uint32_t size = block_size * grid_size * length; sum_op operation; for (size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) @@ -243,9 +251,9 @@ TYPED_TEST(RocprimThreadOperationTests, Reduction) // Preparing device T* device_input; - HIP_CHECK(hipMalloc(&device_input, input.size() * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&device_input), input.size() * sizeof(T))); T* device_output; - HIP_CHECK(hipMalloc(&device_output, output.size() * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&device_output), output.size() * sizeof(T))); HIP_CHECK( hipMemcpy( @@ -255,7 +263,11 @@ TYPED_TEST(RocprimThreadOperationTests, Reduction) ) ); - thread_reduce_kernel<<>>(device_input, device_output); + hipLaunchKernelGGL( + HIP_KERNEL_NAME(thread_reduce_kernel), + grid_size, block_size, 0, 0, + device_input, device_output + ); // Reading results back HIP_CHECK( @@ -282,8 +294,8 @@ template __global__ void thread_scan_kernel(Type* const device_input, Type* device_output) { - size_t input_index = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * Length; - size_t output_index = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * Length; + size_t input_index = (blockIdx.x * blockDim.x + threadIdx.x) * Length; + size_t output_index = (blockIdx.x * blockDim.x + threadIdx.x) * Length; rocprim::thread_scan_inclusive(&device_input[input_index], &device_output[output_index], @@ -293,10 +305,10 @@ void thread_scan_kernel(Type* const device_input, Type* device_output) TYPED_TEST(RocprimThreadOperationTests, Scan) { using T = typename TestFixture::type; - constexpr uint32_t length = 4; - constexpr uint32_t block_size = 128 / length; - constexpr uint32_t grid_size = 128; - constexpr uint32_t size = block_size * grid_size * length; + static constexpr uint32_t length = 4; + static constexpr uint32_t block_size = 128 / length; + static constexpr uint32_t grid_size = 128; + static constexpr uint32_t size = block_size * grid_size * length; sum_op operation; for (size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) @@ -327,9 +339,9 @@ TYPED_TEST(RocprimThreadOperationTests, Scan) // Preparing device T* device_input; - HIP_CHECK(hipMalloc(&device_input, input.size() * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&device_input), input.size() * sizeof(T))); T* device_output; - HIP_CHECK(hipMalloc(&device_output, output.size() * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&device_output), output.size() * sizeof(T))); HIP_CHECK( hipMemcpy( @@ -339,7 +351,11 @@ TYPED_TEST(RocprimThreadOperationTests, Scan) ) ); - thread_scan_kernel<<>>(device_input, device_output); + hipLaunchKernelGGL( + HIP_KERNEL_NAME(thread_scan_kernel), + grid_size, block_size, 0, 0, + device_input, device_output + ); // Reading results back HIP_CHECK( diff --git a/test/rocprim/test_utils.hpp b/test/rocprim/test_utils.hpp index 702ffc71e..a0e2f76ff 100644 --- a/test/rocprim/test_utils.hpp +++ b/test/rocprim/test_utils.hpp @@ -61,13 +61,13 @@ struct precision_threshold // Support half operators on host side ROCPRIM_HOST inline -_Float16 half_to_native(const rocprim::half& x) +rocprim::native_half half_to_native(const rocprim::half& x) { - return *reinterpret_cast(&x); + return *reinterpret_cast(&x); } ROCPRIM_HOST inline -rocprim::half native_to_half(const _Float16& x) +rocprim::half native_to_half(const rocprim::native_half& x) { return 
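// Illustrative sketch, not from the patch itself: the thread tests above move from the
// triple-chevron launch syntax to hipLaunchKernelGGL, presumably because the macro form
// also works when the tests are built by a plain host compiler against the HIP-CPU
// runtime; HIP_KERNEL_NAME protects the template commas from the preprocessor. The
// <float> instantiation is illustrative only; thread_load_kernel is the kernel defined
// in test_thread_algos.cpp above.
#include <hip/hip_runtime.h>

inline void launch_thread_load(float* device_input, float* device_output,
                               unsigned int grid_size, unsigned int block_size)
{
    hipLaunchKernelGGL(
        HIP_KERNEL_NAME(thread_load_kernel<float>),
        dim3(grid_size), dim3(block_size),
        0 /* dynamic shared memory */, 0 /* default stream */,
        device_input, device_output);
}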
*reinterpret_cast(&x); } @@ -215,23 +215,43 @@ struct half_minimum } }; -template -inline auto get_random_data(size_t size, T min, T max, int seed_value) +// std::uniform_int_distribution is undefined for anything other than listed +// https://en.cppreference.com/w/cpp/numeric/random/uniform_int_distribution +template +struct is_valid_for_int_distribution : + std::integral_constant::value || + std::is_same::value || + std::is_same::value || + std::is_same::value || + std::is_same::value || + std::is_same::value || + std::is_same::value || + std::is_same::value + > {}; + +template +inline auto get_random_data(size_t size, U min, V max, seed_type seed_value) -> typename std::enable_if::value, std::vector>::type { - std::random_device rd; - std::default_random_engine gen(rd()); - gen.seed(seed_value); - std::uniform_int_distribution distribution(min, max); + engine_type gen{seed_value}; + using dis_type = typename std::conditional< + is_valid_for_int_distribution::value, + T, + typename std::conditional::value, + int, + unsigned int>::type + >::type; + std::uniform_int_distribution distribution((dis_type)min, (dis_type)max); std::vector data(size); - uint32_t segment_size = size / random_data_generation_segments; + size_t segment_size = size / random_data_generation_segments; if(segment_size != 0) { for(uint32_t segment_index = 0; segment_index < random_data_generation_segments; segment_index++) { if(segment_index % random_data_generation_repeat_strides == 0) { - T repeated_value = distribution(gen); + T repeated_value = static_cast(distribution(gen)); std::fill( data.begin() + segment_size * segment_index, data.begin() + segment_size * (segment_index + 1), @@ -243,36 +263,34 @@ inline auto get_random_data(size_t size, T min, T max, int seed_value) std::generate( data.begin() + segment_size * segment_index, data.begin() + segment_size * (segment_index + 1), - [&]() { return distribution(gen); }); + [&]() { return static_cast(distribution(gen)); }); } } } else { - std::generate(data.begin(), data.end(), [&]() { return distribution(gen); }); + std::generate(data.begin(), data.end(), [&]() { return static_cast(distribution(gen)); }); } return data; } -template -inline auto get_random_data(size_t size, T min, T max, int seed_value) +template +inline auto get_random_data(size_t size, U min, V max, seed_type seed_value) -> typename std::enable_if::value, std::vector>::type { - std::random_device rd; - std::default_random_engine gen(rd()); - gen.seed(seed_value); + engine_type gen{seed_value}; // Generate floats when T is half using dis_type = typename std::conditional::value, float, T>::type; - std::uniform_real_distribution distribution(min, max); + std::uniform_real_distribution distribution((dis_type)min, (dis_type)max); std::vector data(size); - uint32_t segment_size = size / random_data_generation_segments; + size_t segment_size = size / random_data_generation_segments; if(segment_size != 0) { for(uint32_t segment_index = 0; segment_index < random_data_generation_segments; segment_index++) { if(segment_index % random_data_generation_repeat_strides == 0) { - T repeated_value = distribution(gen); + T repeated_value = static_cast(distribution(gen)); std::fill( data.begin() + segment_size * segment_index, data.begin() + segment_size * (segment_index + 1), @@ -284,25 +302,23 @@ inline auto get_random_data(size_t size, T min, T max, int seed_value) std::generate( data.begin() + segment_size * segment_index, data.begin() + segment_size * (segment_index + 1), - [&]() { return distribution(gen); }); + 
[&]() { return static_cast(distribution(gen)); }); } } } else { - std::generate(data.begin(), data.end(), [&]() { return distribution(gen); }); + std::generate(data.begin(), data.end(), [&]() { return static_cast(distribution(gen)); }); } return data; } template -inline std::vector get_random_data01(size_t size, float p, int seed_value) +inline std::vector get_random_data01(size_t size, float p, seed_type seed_value) { const size_t max_random_size = 1024 * 1024; - std::random_device rd; - std::default_random_engine gen(rd()); - gen.seed(seed_value); + engine_type gen{seed_value}; std::bernoulli_distribution distribution(p); std::vector data(size); std::generate( @@ -316,11 +332,11 @@ inline std::vector get_random_data01(size_t size, float p, int seed_value) return data; } -template -inline auto get_random_value(T min, T max, int seed_value) +template +inline auto get_random_value(U min, V max, seed_type seed_value) -> typename std::enable_if::value, T>::type { - return get_random_data(random_data_generation_segments, min, max, seed_value)[0]; + return get_random_data(random_data_generation_segments, min, max, seed_value)[0]; } // Can't use std::prefix_sum for inclusive/exclusive scan, because @@ -772,18 +788,16 @@ struct numeric_limits : public std::conditional< }; template -inline auto get_random_data(size_t size, typename T::value_type min, typename T::value_type max, int seed_value) +inline auto get_random_data(size_t size, T min, T max, seed_type seed_value) -> typename std::enable_if< is_custom_test_type::value && std::is_integral::value, std::vector >::type { - std::random_device rd; - std::default_random_engine gen(rd()); - gen.seed(seed_value); - std::uniform_int_distribution distribution(min, max); + engine_type gen(seed_value); + std::uniform_int_distribution distribution(min.x, max.x); std::vector data(size); - uint32_t segment_size = size / random_data_generation_segments; + size_t segment_size = size / random_data_generation_segments; if(segment_size != 0) { for(uint32_t segment_index = 0; segment_index < random_data_generation_segments; segment_index++) @@ -814,18 +828,16 @@ inline auto get_random_data(size_t size, typename T::value_type min, typename T: } template -inline auto get_random_data(size_t size, typename T::value_type min, typename T::value_type max, int seed_value) +inline auto get_random_data(size_t size, T min, T max, seed_type seed_value) -> typename std::enable_if< is_custom_test_type::value && std::is_floating_point::value, std::vector >::type { - std::random_device rd; - std::default_random_engine gen(rd()); - gen.seed(seed_value); - std::uniform_real_distribution distribution(min, max); + engine_type gen(seed_value); + std::uniform_real_distribution distribution(min.x, max.x); std::vector data(size); - uint32_t segment_size = size / random_data_generation_segments; + size_t segment_size = size / random_data_generation_segments; if(segment_size != 0) { for(uint32_t segment_index = 0; segment_index < random_data_generation_segments; segment_index++) @@ -856,15 +868,13 @@ inline auto get_random_data(size_t size, typename T::value_type min, typename T: } template -inline auto get_random_data(size_t size, typename T::value_type min, typename T::value_type max, int seed_value) +inline auto get_random_data(size_t size, typename T::value_type min, typename T::value_type max, seed_type seed_value) -> typename std::enable_if< is_custom_test_array_type::value && std::is_integral::value, std::vector >::type { - std::random_device rd; - std::default_random_engine 
gen(rd()); - gen.seed(seed_value); + engine_type gen(seed_value); std::uniform_int_distribution distribution(min, max); std::vector data(size); std::generate( @@ -883,10 +893,10 @@ inline auto get_random_data(size_t size, typename T::value_type min, typename T: } template -inline auto get_random_value(typename T::value_type min, typename T::value_type max, int seed_value) +inline auto get_random_value(typename T::value_type min, typename T::value_type max, seed_type seed_value) -> typename std::enable_if::value || is_custom_test_array_type::value, T>::type { - return get_random_data(random_data_generation_segments, min, max, seed_value)[0]; + return get_random_data(random_data_generation_segments, min, max, seed_value)[0]; } template diff --git a/test/rocprim/test_warp_reduce.cpp b/test/rocprim/test_warp_reduce.cpp index 1f938ddb4..dc07a3bb1 100644 --- a/test/rocprim/test_warp_reduce.cpp +++ b/test/rocprim/test_warp_reduce.cpp @@ -45,9 +45,9 @@ __global__ __launch_bounds__(BlockSize) void warp_reduce_sum_kernel(T* device_input, T* device_output) { - constexpr unsigned int warps_no = BlockSize / LogicalWarpSize; + static constexpr unsigned int warps_no = BlockSize / LogicalWarpSize; const unsigned int warp_id = rocprim::detail::logical_warp_id(); - unsigned int index = hipThreadIdx_x + (hipBlockIdx_x * hipBlockDim_x); + unsigned int index = threadIdx.x + (blockIdx.x * blockDim.x); T value = device_input[index]; @@ -55,7 +55,7 @@ void warp_reduce_sum_kernel(T* device_input, T* device_output) __shared__ typename wreduce_t::storage_type storage[warps_no]; wreduce_t().reduce(value, value, storage[warp_id]); - if(hipThreadIdx_x%LogicalWarpSize == 0) + if(threadIdx.x%LogicalWarpSize == 0) { device_output[index/LogicalWarpSize] = value; } @@ -70,28 +70,28 @@ TYPED_TEST(RocprimWarpReduceTests, ReduceSum) // logical warp side for warp primitive, execution warp size is always rocprim::warp_size() using T = typename TestFixture::params::type; using binary_op_type = typename std::conditional::value, test_utils::half_plus, rocprim::plus>::type; - constexpr size_t logical_warp_size = TestFixture::params::warp_size; + static constexpr size_t logical_warp_size = TestFixture::params::warp_size; // The different warp sizes - constexpr size_t ws32 = size_t(ROCPRIM_WARP_SIZE_32); - constexpr size_t ws64 = size_t(ROCPRIM_WARP_SIZE_64); + static constexpr size_t ws32 = size_t(ROCPRIM_WARP_SIZE_32); + static constexpr size_t ws64 = size_t(ROCPRIM_WARP_SIZE_64); // Block size of warp size 32 - constexpr size_t block_size_ws32 = + static constexpr size_t block_size_ws32 = rocprim::detail::is_power_of_two(logical_warp_size) ? rocprim::max(ws32, logical_warp_size * 4) - : rocprim::max((ws32/logical_warp_size) * logical_warp_size, 1); + : rocprim::max((ws32/logical_warp_size), 1) * logical_warp_size; // Block size of warp size 64 - constexpr size_t block_size_ws64 = + static constexpr size_t block_size_ws64 = rocprim::detail::is_power_of_two(logical_warp_size) ? rocprim::max(ws64, logical_warp_size * 4) - : rocprim::max((ws64/logical_warp_size) * logical_warp_size, 1); + : rocprim::max((ws64/logical_warp_size), 1) * logical_warp_size; const unsigned int current_device_warp_size = rocprim::host_warp_size(); const size_t block_size = current_device_warp_size == ws32 ? 
block_size_ws32 : block_size_ws64; - constexpr unsigned int grid_size = 4; + static constexpr unsigned int grid_size = 4; const size_t size = block_size * grid_size; // Check if warp size is supported @@ -109,15 +109,15 @@ TYPED_TEST(RocprimWarpReduceTests, ReduceSum) SCOPED_TRACE(testing::Message() << "with seed= " << seed_value); // Generate data - std::vector input = test_utils::get_random_data(size, 2, 50, seed_value); // used for input - std::vector output(input.size() / logical_warp_size, 0); + std::vector input = test_utils::get_random_data(size, 2, 50, seed_value); + std::vector output(input.size() / logical_warp_size, (T)0); // Calculate expected results on host - std::vector expected(output.size(), 1); + std::vector expected(output.size(), (T)1); binary_op_type binary_op; for(size_t i = 0; i < output.size(); i++) { - T value = 0; + T value = (T)0; for(size_t j = 0; j < logical_warp_size; j++) { auto idx = i * logical_warp_size + j; @@ -186,9 +186,9 @@ __global__ __launch_bounds__(BlockSize) void warp_allreduce_sum_kernel(T* device_input, T* device_output) { - constexpr unsigned int warps_no = BlockSize / LogicalWarpSize; + static constexpr unsigned int warps_no = BlockSize / LogicalWarpSize; const unsigned int warp_id = rocprim::detail::logical_warp_id(); - unsigned int index = hipThreadIdx_x + (hipBlockIdx_x * hipBlockDim_x); + unsigned int index = threadIdx.x + (blockIdx.x * blockDim.x); T value = device_input[index]; @@ -208,28 +208,28 @@ TYPED_TEST(RocprimWarpReduceTests, AllReduceSum) // logical warp side for warp primitive, execution warp size is always rocprim::warp_size() using T = typename TestFixture::params::type; using binary_op_type = typename std::conditional::value, test_utils::half_plus, rocprim::plus>::type; - constexpr size_t logical_warp_size = TestFixture::params::warp_size; + static constexpr size_t logical_warp_size = TestFixture::params::warp_size; // The different warp sizes - constexpr size_t ws32 = size_t(ROCPRIM_WARP_SIZE_32); - constexpr size_t ws64 = size_t(ROCPRIM_WARP_SIZE_64); + static constexpr size_t ws32 = size_t(ROCPRIM_WARP_SIZE_32); + static constexpr size_t ws64 = size_t(ROCPRIM_WARP_SIZE_64); // Block size of warp size 32 - constexpr size_t block_size_ws32 = + static constexpr size_t block_size_ws32 = rocprim::detail::is_power_of_two(logical_warp_size) ? rocprim::max(ws32, logical_warp_size * 4) - : rocprim::max((ws32/logical_warp_size) * logical_warp_size, 1); + : rocprim::max((ws32/logical_warp_size), 1) * logical_warp_size; // Block size of warp size 64 - constexpr size_t block_size_ws64 = + static constexpr size_t block_size_ws64 = rocprim::detail::is_power_of_two(logical_warp_size) ? rocprim::max(ws64, logical_warp_size * 4) - : rocprim::max((ws64/logical_warp_size) * logical_warp_size, 1); + : rocprim::max((ws64/logical_warp_size), 1) * logical_warp_size; const unsigned int current_device_warp_size = rocprim::host_warp_size(); const size_t block_size = current_device_warp_size == ws32 ? 
block_size_ws32 : block_size_ws64; - constexpr unsigned int grid_size = 4; + static constexpr unsigned int grid_size = 4; const size_t size = block_size * grid_size; // Check if warp size is supported @@ -247,15 +247,15 @@ TYPED_TEST(RocprimWarpReduceTests, AllReduceSum) SCOPED_TRACE(testing::Message() << "with seed= " << seed_value); // Generate data - std::vector input = test_utils::get_random_data(size, 2, 50, seed_value); // used for input - std::vector output(input.size(), 0); + std::vector input = test_utils::get_random_data(size, 2, 50, seed_value); + std::vector output(input.size(), (T)0); // Calculate expected results on host - std::vector expected(output.size(), 0); + std::vector expected(output.size(), (T)0); binary_op_type binary_op; for(size_t i = 0; i < output.size() / logical_warp_size; i++) { - T value = 0; + T value = (T)0; for(size_t j = 0; j < logical_warp_size; j++) { auto idx = i * logical_warp_size + j; @@ -328,9 +328,9 @@ __global__ __launch_bounds__(BlockSize) void warp_reduce_sum_kernel(T* device_input, T* device_output, size_t valid) { - constexpr unsigned int warps_no = BlockSize / LogicalWarpSize; + static constexpr unsigned int warps_no = BlockSize / LogicalWarpSize; const unsigned int warp_id = rocprim::detail::logical_warp_id(); - unsigned int index = hipThreadIdx_x + (hipBlockIdx_x * hipBlockDim_x); + unsigned int index = threadIdx.x + (blockIdx.x * blockDim.x); T value = device_input[index]; @@ -338,7 +338,7 @@ void warp_reduce_sum_kernel(T* device_input, T* device_output, size_t valid) __shared__ typename wreduce_t::storage_type storage[warps_no]; wreduce_t().reduce(value, value, valid, storage[warp_id]); - if(hipThreadIdx_x%LogicalWarpSize == 0) + if(threadIdx.x%LogicalWarpSize == 0) { device_output[index/LogicalWarpSize] = value; } @@ -353,28 +353,28 @@ TYPED_TEST(RocprimWarpReduceTests, ReduceSumValid) // logical warp side for warp primitive, execution warp size is always rocprim::warp_size() using T = typename TestFixture::params::type; using binary_op_type = typename std::conditional::value, test_utils::half_plus, rocprim::plus>::type; - constexpr size_t logical_warp_size = TestFixture::params::warp_size; + static constexpr size_t logical_warp_size = TestFixture::params::warp_size; // The different warp sizes - constexpr size_t ws32 = size_t(ROCPRIM_WARP_SIZE_32); - constexpr size_t ws64 = size_t(ROCPRIM_WARP_SIZE_64); + static constexpr size_t ws32 = size_t(ROCPRIM_WARP_SIZE_32); + static constexpr size_t ws64 = size_t(ROCPRIM_WARP_SIZE_64); // Block size of warp size 32 - constexpr size_t block_size_ws32 = + static constexpr size_t block_size_ws32 = rocprim::detail::is_power_of_two(logical_warp_size) ? rocprim::max(ws32, logical_warp_size * 4) - : rocprim::max((ws32/logical_warp_size) * logical_warp_size, 1); + : rocprim::max((ws32/logical_warp_size), 1) * logical_warp_size; // Block size of warp size 64 - constexpr size_t block_size_ws64 = + static constexpr size_t block_size_ws64 = rocprim::detail::is_power_of_two(logical_warp_size) ? rocprim::max(ws64, logical_warp_size * 4) - : rocprim::max((ws64/logical_warp_size) * logical_warp_size, 1); + : rocprim::max((ws64/logical_warp_size), 1) * logical_warp_size; const unsigned int current_device_warp_size = rocprim::host_warp_size(); const size_t block_size = current_device_warp_size == ws32 ? 
block_size_ws32 : block_size_ws64; - constexpr unsigned int grid_size = 4; + static constexpr unsigned int grid_size = 4; const size_t size = block_size * grid_size; const size_t valid = logical_warp_size - 1; @@ -393,15 +393,15 @@ TYPED_TEST(RocprimWarpReduceTests, ReduceSumValid) SCOPED_TRACE(testing::Message() << "with seed= " << seed_value); // Generate data - std::vector input = test_utils::get_random_data(size, 2, 50, seed_value); // used for input - std::vector output(input.size() / logical_warp_size, 0); + std::vector input = test_utils::get_random_data(size, 2, 50, seed_value); + std::vector output(input.size() / logical_warp_size, (T)0); // Calculate expected results on host - std::vector expected(output.size(), 1); + std::vector expected(output.size(), (T)1); binary_op_type binary_op; for(size_t i = 0; i < output.size(); i++) { - T value = 0; + T value = (T)0; for(size_t j = 0; j < valid; j++) { auto idx = i * logical_warp_size + j; @@ -472,7 +472,7 @@ void warp_allreduce_sum_kernel(T* device_input, T* device_output, size_t valid) { constexpr unsigned int warps_no = BlockSize / LogicalWarpSize; const unsigned int warp_id = rocprim::detail::logical_warp_id(); - unsigned int index = hipThreadIdx_x + (hipBlockIdx_x * hipBlockDim_x); + unsigned int index = threadIdx.x + (blockIdx.x * blockDim.x); T value = device_input[index]; @@ -492,28 +492,28 @@ TYPED_TEST(RocprimWarpReduceTests, AllReduceSumValid) // logical warp side for warp primitive, execution warp size is always rocprim::warp_size() using T = typename TestFixture::params::type; using binary_op_type = typename std::conditional::value, test_utils::half_plus, rocprim::plus>::type; - constexpr size_t logical_warp_size = TestFixture::params::warp_size; + static constexpr size_t logical_warp_size = TestFixture::params::warp_size; // The different warp sizes - constexpr size_t ws32 = size_t(ROCPRIM_WARP_SIZE_32); - constexpr size_t ws64 = size_t(ROCPRIM_WARP_SIZE_64); + static constexpr size_t ws32 = size_t(ROCPRIM_WARP_SIZE_32); + static constexpr size_t ws64 = size_t(ROCPRIM_WARP_SIZE_64); // Block size of warp size 32 - constexpr size_t block_size_ws32 = + static constexpr size_t block_size_ws32 = rocprim::detail::is_power_of_two(logical_warp_size) ? rocprim::max(ws32, logical_warp_size * 4) - : rocprim::max((ws32/logical_warp_size) * logical_warp_size, 1); + : rocprim::max((ws32/logical_warp_size), 1) * logical_warp_size; // Block size of warp size 64 - constexpr size_t block_size_ws64 = + static constexpr size_t block_size_ws64 = rocprim::detail::is_power_of_two(logical_warp_size) ? rocprim::max(ws64, logical_warp_size * 4) - : rocprim::max((ws64/logical_warp_size) * logical_warp_size, 1); + : rocprim::max((ws64/logical_warp_size), 1) * logical_warp_size; const unsigned int current_device_warp_size = rocprim::host_warp_size(); const size_t block_size = current_device_warp_size == ws32 ? 
block_size_ws32 : block_size_ws64; - constexpr unsigned int grid_size = 4; + static constexpr unsigned int grid_size = 4; const size_t size = block_size * grid_size; const size_t valid = logical_warp_size - 1; @@ -532,15 +532,15 @@ TYPED_TEST(RocprimWarpReduceTests, AllReduceSumValid) SCOPED_TRACE(testing::Message() << "with seed= " << seed_value); // Generate data - std::vector input = test_utils::get_random_data(size, 2, 50, seed_value); // used for input - std::vector output(input.size(), 0); + std::vector input = test_utils::get_random_data(size, 2, 50, seed_value); + std::vector output(input.size(), (T)0); // Calculate expected results on host - std::vector expected(output.size(), 0); + std::vector expected(output.size(), (T)0); binary_op_type binary_op; for(size_t i = 0; i < output.size() / logical_warp_size; i++) { - T value = 0; + T value = (T)0; for(size_t j = 0; j < valid; j++) { auto idx = i * logical_warp_size + j; @@ -614,28 +614,28 @@ TYPED_TEST(RocprimWarpReduceTests, ReduceSumCustomStruct) using T = test_utils::custom_test_type; // logical warp side for warp primitive, execution warp size is always rocprim::warp_size() - constexpr size_t logical_warp_size = TestFixture::params::warp_size; + static constexpr size_t logical_warp_size = TestFixture::params::warp_size; // The different warp sizes - constexpr size_t ws32 = size_t(ROCPRIM_WARP_SIZE_32); - constexpr size_t ws64 = size_t(ROCPRIM_WARP_SIZE_64); + static constexpr size_t ws32 = size_t(ROCPRIM_WARP_SIZE_32); + static constexpr size_t ws64 = size_t(ROCPRIM_WARP_SIZE_64); // Block size of warp size 32 - constexpr size_t block_size_ws32 = + static constexpr size_t block_size_ws32 = rocprim::detail::is_power_of_two(logical_warp_size) ? rocprim::max(ws32, logical_warp_size * 4) - : rocprim::max((ws32/logical_warp_size) * logical_warp_size, 1); + : rocprim::max((ws32/logical_warp_size), 1) * logical_warp_size; // Block size of warp size 64 - constexpr size_t block_size_ws64 = + static constexpr size_t block_size_ws64 = rocprim::detail::is_power_of_two(logical_warp_size) ? rocprim::max(ws64, logical_warp_size * 4) - : rocprim::max((ws64/logical_warp_size) * logical_warp_size, 1); + : rocprim::max((ws64/logical_warp_size), 1) * logical_warp_size; const unsigned int current_device_warp_size = rocprim::host_warp_size(); const size_t block_size = current_device_warp_size == ws32 ? 
block_size_ws32 : block_size_ws64; - constexpr unsigned int grid_size = 4; + static constexpr unsigned int grid_size = 4; const size_t size = block_size * grid_size; // Check if warp size is supported @@ -669,7 +669,7 @@ TYPED_TEST(RocprimWarpReduceTests, ReduceSumCustomStruct) std::vector expected(output.size()); for(size_t i = 0; i < output.size(); i++) { - T value(0, 0); + T value{(base_type)0, (base_type)0}; for(size_t j = 0; j < logical_warp_size; j++) { auto idx = i * logical_warp_size + j; @@ -741,7 +741,7 @@ void head_segmented_warp_reduce_kernel(T* input, Flag* flags, T* output) { constexpr unsigned int warps_no = BlockSize / LogicalWarpSize; const unsigned int warp_id = rocprim::detail::logical_warp_id(); - unsigned int index = hipThreadIdx_x + (hipBlockIdx_x * hipBlockDim_x); + unsigned int index = threadIdx.x + (blockIdx.x * blockDim.x); T value = input[index]; auto flag = flags[index]; @@ -763,28 +763,28 @@ TYPED_TEST(RocprimWarpReduceTests, HeadSegmentedReduceSum) using T = typename TestFixture::params::type; using binary_op_type = typename std::conditional::value, test_utils::half_plus, rocprim::plus>::type; using flag_type = unsigned char; - constexpr size_t logical_warp_size = TestFixture::params::warp_size; + static constexpr size_t logical_warp_size = TestFixture::params::warp_size; // The different warp sizes - constexpr size_t ws32 = size_t(ROCPRIM_WARP_SIZE_32); - constexpr size_t ws64 = size_t(ROCPRIM_WARP_SIZE_64); + static constexpr size_t ws32 = size_t(ROCPRIM_WARP_SIZE_32); + static constexpr size_t ws64 = size_t(ROCPRIM_WARP_SIZE_64); // Block size of warp size 32 - constexpr size_t block_size_ws32 = + static constexpr size_t block_size_ws32 = rocprim::detail::is_power_of_two(logical_warp_size) ? rocprim::max(ws32, logical_warp_size * 4) - : rocprim::max((ws32/logical_warp_size) * logical_warp_size, 1); + : rocprim::max((ws32/logical_warp_size), 1) * logical_warp_size; // Block size of warp size 64 - constexpr size_t block_size_ws64 = + static constexpr size_t block_size_ws64 = rocprim::detail::is_power_of_two(logical_warp_size) ? rocprim::max(ws64, logical_warp_size * 4) - : rocprim::max((ws64/logical_warp_size) * logical_warp_size, 1); + : rocprim::max((ws64/logical_warp_size), 1) * logical_warp_size; const unsigned int current_device_warp_size = rocprim::host_warp_size(); const size_t block_size = current_device_warp_size == ws32 ? 
block_size_ws32 : block_size_ws64; - constexpr unsigned int grid_size = 4; + static constexpr unsigned int grid_size = 4; const size_t size = block_size * grid_size; // Check if warp size is supported @@ -802,7 +802,7 @@ TYPED_TEST(RocprimWarpReduceTests, HeadSegmentedReduceSum) SCOPED_TRACE(testing::Message() << "with seed= " << seed_value); // Generate data - std::vector input = test_utils::get_random_data(size, 1, 10, seed_value); // used for input + std::vector input = test_utils::get_random_data(size, 1, 10, seed_value); std::vector flags = test_utils::get_random_data01(size, 0.25f, seed_value); for(size_t i = 0; i < flags.size(); i+= logical_warp_size) { @@ -883,8 +883,8 @@ TYPED_TEST(RocprimWarpReduceTests, HeadSegmentedReduceSum) ); HIP_CHECK(hipDeviceSynchronize()); - std::vector output_segment(output.size(), 0); - std::vector expected_segment(output.size(), 0); + std::vector output_segment(output.size(), (T)0); + std::vector expected_segment(output.size(), (T)0); for(size_t i = 0; i < output.size(); i++) { if(flags[i]) @@ -914,7 +914,7 @@ void tail_segmented_warp_reduce_kernel(T* input, Flag* flags, T* output) { constexpr unsigned int warps_no = BlockSize / LogicalWarpSize; const unsigned int warp_id = rocprim::detail::logical_warp_id(); - unsigned int index = hipThreadIdx_x + (hipBlockIdx_x * hipBlockDim_x); + unsigned int index = threadIdx.x + (blockIdx.x * blockDim.x); T value = input[index]; auto flag = flags[index]; @@ -936,28 +936,28 @@ TYPED_TEST(RocprimWarpReduceTests, TailSegmentedReduceSum) using T = typename TestFixture::params::type; using binary_op_type = typename std::conditional::value, test_utils::half_plus, rocprim::plus>::type; using flag_type = unsigned char; - constexpr size_t logical_warp_size = TestFixture::params::warp_size; + static constexpr size_t logical_warp_size = TestFixture::params::warp_size; // The different warp sizes - constexpr size_t ws32 = size_t(ROCPRIM_WARP_SIZE_32); - constexpr size_t ws64 = size_t(ROCPRIM_WARP_SIZE_64); + static constexpr size_t ws32 = size_t(ROCPRIM_WARP_SIZE_32); + static constexpr size_t ws64 = size_t(ROCPRIM_WARP_SIZE_64); // Block size of warp size 32 - constexpr size_t block_size_ws32 = + static constexpr size_t block_size_ws32 = rocprim::detail::is_power_of_two(logical_warp_size) ? rocprim::max(ws32, logical_warp_size * 4) - : rocprim::max((ws32/logical_warp_size) * logical_warp_size, 1); + : rocprim::max((ws32/logical_warp_size), 1) * logical_warp_size; // Block size of warp size 64 - constexpr size_t block_size_ws64 = + static constexpr size_t block_size_ws64 = rocprim::detail::is_power_of_two(logical_warp_size) ? rocprim::max(ws64, logical_warp_size * 4) - : rocprim::max((ws64/logical_warp_size) * logical_warp_size, 1); + : rocprim::max((ws64/logical_warp_size), 1) * logical_warp_size; const unsigned int current_device_warp_size = rocprim::host_warp_size(); const size_t block_size = current_device_warp_size == ws32 ? 
block_size_ws32 : block_size_ws64; - constexpr unsigned int grid_size = 4; + static constexpr unsigned int grid_size = 4; const size_t size = block_size * grid_size; // Check if warp size is supported @@ -975,7 +975,7 @@ TYPED_TEST(RocprimWarpReduceTests, TailSegmentedReduceSum) SCOPED_TRACE(testing::Message() << "with seed= " << seed_value); // Generate data - std::vector input = test_utils::get_random_data(size, 1, 10, seed_value); // used for input + std::vector input = test_utils::get_random_data(size, 1, 10, seed_value); std::vector flags = test_utils::get_random_data01(size, 0.25f, seed_value); for(size_t i = logical_warp_size - 1; i < flags.size(); i+= logical_warp_size) { diff --git a/test/rocprim/test_warp_scan.cpp b/test/rocprim/test_warp_scan.cpp index 5406e07f6..371d72f62 100644 --- a/test/rocprim/test_warp_scan.cpp +++ b/test/rocprim/test_warp_scan.cpp @@ -51,7 +51,7 @@ void warp_inclusive_scan_kernel(T* device_input, T* device_output) { constexpr unsigned int warps_no = BlockSize / LogicalWarpSize; const unsigned int warp_id = rocprim::detail::logical_warp_id(); - unsigned int index = hipThreadIdx_x + (hipBlockIdx_x * hipBlockDim_x); + unsigned int index = threadIdx.x + (blockIdx.x * blockDim.x); T value = device_input[index]; @@ -71,23 +71,23 @@ TYPED_TEST(RocprimWarpScanTests, InclusiveScan) using T = typename TestFixture::params::type; using binary_op_type = typename std::conditional::value, test_utils::half_plus, rocprim::plus>::type; // logical warp side for warp primitive, execution warp size is always rocprim::warp_size() - constexpr size_t logical_warp_size = TestFixture::params::warp_size; + static constexpr size_t logical_warp_size = TestFixture::params::warp_size; // The different warp sizes - constexpr size_t ws32 = size_t(ROCPRIM_WARP_SIZE_32); - constexpr size_t ws64 = size_t(ROCPRIM_WARP_SIZE_64); + static constexpr size_t ws32 = size_t(ROCPRIM_WARP_SIZE_32); + static constexpr size_t ws64 = size_t(ROCPRIM_WARP_SIZE_64); // Block size of warp size 32 - constexpr size_t block_size_ws32 = + static constexpr size_t block_size_ws32 = rocprim::detail::is_power_of_two(logical_warp_size) ? rocprim::max(ws32, logical_warp_size * 4) - : rocprim::max((ws32/logical_warp_size) * logical_warp_size, 1); + : rocprim::max((ws32/logical_warp_size), 1) * logical_warp_size; // Block size of warp size 64 - constexpr size_t block_size_ws64 = + static constexpr size_t block_size_ws64 = rocprim::detail::is_power_of_two(logical_warp_size) ? 
rocprim::max(ws64, logical_warp_size * 4) - : rocprim::max((ws64/logical_warp_size) * logical_warp_size, 1); + : rocprim::max((ws64/logical_warp_size), 1) * logical_warp_size; const unsigned int current_device_warp_size = rocprim::host_warp_size(); @@ -112,7 +112,7 @@ TYPED_TEST(RocprimWarpScanTests, InclusiveScan) // Generate data std::vector input = test_utils::get_random_data(size, 2, 50, seed_value); std::vector output(size); - std::vector expected(output.size(), 0); + std::vector expected(output.size(), (T)0); // Calculate expected results on host binary_op_type binary_op; @@ -192,7 +192,7 @@ void warp_inclusive_scan_reduce_kernel( { constexpr unsigned int warps_no = BlockSize / LogicalWarpSize; const unsigned int warp_id = rocprim::detail::logical_warp_id(); - unsigned int index = hipThreadIdx_x + ( hipBlockIdx_x * BlockSize ); + unsigned int index = threadIdx.x + ( blockIdx.x * BlockSize ); T value = device_input[index]; T reduction; @@ -202,7 +202,7 @@ void warp_inclusive_scan_reduce_kernel( wscan_t().inclusive_scan(value, value, reduction, storage[warp_id]); device_output[index] = value; - if((hipThreadIdx_x % LogicalWarpSize) == 0) + if((threadIdx.x % LogicalWarpSize) == 0) { device_output_reductions[index / LogicalWarpSize] = reduction; } @@ -217,23 +217,23 @@ TYPED_TEST(RocprimWarpScanTests, InclusiveScanReduce) using T = typename TestFixture::params::type; using binary_op_type = typename std::conditional::value, test_utils::half_plus, rocprim::plus>::type; // logical warp side for warp primitive, execution warp size is always rocprim::warp_size() - constexpr size_t logical_warp_size = TestFixture::params::warp_size; + static constexpr size_t logical_warp_size = TestFixture::params::warp_size; // The different warp sizes - constexpr size_t ws32 = size_t(ROCPRIM_WARP_SIZE_32); - constexpr size_t ws64 = size_t(ROCPRIM_WARP_SIZE_64); + static constexpr size_t ws32 = size_t(ROCPRIM_WARP_SIZE_32); + static constexpr size_t ws64 = size_t(ROCPRIM_WARP_SIZE_64); // Block size of warp size 32 - constexpr size_t block_size_ws32 = + static constexpr size_t block_size_ws32 = rocprim::detail::is_power_of_two(logical_warp_size) ? rocprim::max(ws32, logical_warp_size * 4) - : rocprim::max((ws32/logical_warp_size) * logical_warp_size, 1); + : rocprim::max((ws32/logical_warp_size), 1) * logical_warp_size; // Block size of warp size 64 - constexpr size_t block_size_ws64 = + static constexpr size_t block_size_ws64 = rocprim::detail::is_power_of_two(logical_warp_size) ? 
rocprim::max(ws64, logical_warp_size * 4) - : rocprim::max((ws64/logical_warp_size) * logical_warp_size, 1); + : rocprim::max((ws64/logical_warp_size), 1) * logical_warp_size; const unsigned int current_device_warp_size = rocprim::host_warp_size(); @@ -259,8 +259,8 @@ TYPED_TEST(RocprimWarpScanTests, InclusiveScanReduce) std::vector input = test_utils::get_random_data(size, 2, 50, seed_value); std::vector output(size); std::vector output_reductions(size / logical_warp_size); - std::vector expected(output.size(), 0); - std::vector expected_reductions(output_reductions.size(), 0); + std::vector expected(output.size(), (T)0); + std::vector expected_reductions(output_reductions.size(), (T)0); // Calculate expected results on host binary_op_type binary_op; @@ -355,7 +355,7 @@ void warp_exclusive_scan_kernel(T* device_input, T* device_output, T init) { constexpr unsigned int warps_no = BlockSize / LogicalWarpSize; const unsigned int warp_id = rocprim::detail::logical_warp_id(); - unsigned int index = hipThreadIdx_x + (hipBlockIdx_x * hipBlockDim_x); + unsigned int index = threadIdx.x + (blockIdx.x * blockDim.x); T value = device_input[index]; @@ -375,23 +375,23 @@ TYPED_TEST(RocprimWarpScanTests, ExclusiveScan) using T = typename TestFixture::params::type; using binary_op_type = typename std::conditional::value, test_utils::half_plus, rocprim::plus>::type; // logical warp side for warp primitive, execution warp size is always rocprim::warp_size() - constexpr size_t logical_warp_size = TestFixture::params::warp_size; + static constexpr size_t logical_warp_size = TestFixture::params::warp_size; // The different warp sizes - constexpr size_t ws32 = size_t(ROCPRIM_WARP_SIZE_32); - constexpr size_t ws64 = size_t(ROCPRIM_WARP_SIZE_64); + static constexpr size_t ws32 = size_t(ROCPRIM_WARP_SIZE_32); + static constexpr size_t ws64 = size_t(ROCPRIM_WARP_SIZE_64); // Block size of warp size 32 - constexpr size_t block_size_ws32 = + static constexpr size_t block_size_ws32 = rocprim::detail::is_power_of_two(logical_warp_size) ? rocprim::max(ws32, logical_warp_size * 4) - : rocprim::max((ws32/logical_warp_size) * logical_warp_size, 1); + : rocprim::max((ws32/logical_warp_size), 1) * logical_warp_size; // Block size of warp size 64 - constexpr size_t block_size_ws64 = + static constexpr size_t block_size_ws64 = rocprim::detail::is_power_of_two(logical_warp_size) ? 
rocprim::max(ws64, logical_warp_size * 4) - : rocprim::max((ws64/logical_warp_size) * logical_warp_size, 1); + : rocprim::max((ws64/logical_warp_size), 1) * logical_warp_size; const unsigned int current_device_warp_size = rocprim::host_warp_size(); @@ -416,8 +416,8 @@ TYPED_TEST(RocprimWarpScanTests, ExclusiveScan) // Generate data std::vector input = test_utils::get_random_data(size, 2, 50, seed_value); std::vector output(size); - std::vector expected(input.size(), 0); - const T init = test_utils::get_random_value(0, 100, seed_value); + std::vector expected(input.size(), (T)0); + const T init = test_utils::get_random_value(0, 100, seed_value); // Calculate expected results on host binary_op_type binary_op; @@ -499,7 +499,7 @@ void warp_exclusive_scan_reduce_kernel( { constexpr unsigned int warps_no = BlockSize / LogicalWarpSize; const unsigned int warp_id = rocprim::detail::logical_warp_id(); - unsigned int index = hipThreadIdx_x + (hipBlockIdx_x * hipBlockDim_x); + unsigned int index = threadIdx.x + (blockIdx.x * blockDim.x); T value = device_input[index]; T reduction; @@ -509,7 +509,7 @@ void warp_exclusive_scan_reduce_kernel( wscan_t().exclusive_scan(value, value, init, reduction, storage[warp_id]); device_output[index] = value; - if((hipThreadIdx_x % LogicalWarpSize) == 0) + if((threadIdx.x % LogicalWarpSize) == 0) { device_output_reductions[index / LogicalWarpSize] = reduction; } @@ -524,23 +524,23 @@ TYPED_TEST(RocprimWarpScanTests, ExclusiveReduceScan) using T = typename TestFixture::params::type; using binary_op_type = typename std::conditional::value, test_utils::half_plus, rocprim::plus>::type; // logical warp side for warp primitive, execution warp size is always rocprim::warp_size() - constexpr size_t logical_warp_size = TestFixture::params::warp_size; + static constexpr size_t logical_warp_size = TestFixture::params::warp_size; // The different warp sizes - constexpr size_t ws32 = size_t(ROCPRIM_WARP_SIZE_32); - constexpr size_t ws64 = size_t(ROCPRIM_WARP_SIZE_64); + static constexpr size_t ws32 = size_t(ROCPRIM_WARP_SIZE_32); + static constexpr size_t ws64 = size_t(ROCPRIM_WARP_SIZE_64); // Block size of warp size 32 - constexpr size_t block_size_ws32 = + static constexpr size_t block_size_ws32 = rocprim::detail::is_power_of_two(logical_warp_size) ? rocprim::max(ws32, logical_warp_size * 4) - : rocprim::max((ws32/logical_warp_size) * logical_warp_size, 1); + : rocprim::max((ws32/logical_warp_size), 1) * logical_warp_size; // Block size of warp size 64 - constexpr size_t block_size_ws64 = + static constexpr size_t block_size_ws64 = rocprim::detail::is_power_of_two(logical_warp_size) ? 
rocprim::max(ws64, logical_warp_size * 4) - : rocprim::max((ws64/logical_warp_size) * logical_warp_size, 1); + : rocprim::max((ws64/logical_warp_size), 1) * logical_warp_size; const unsigned int current_device_warp_size = rocprim::host_warp_size(); @@ -566,9 +566,9 @@ TYPED_TEST(RocprimWarpScanTests, ExclusiveReduceScan) std::vector input = test_utils::get_random_data(size, 2, 50, seed_value); std::vector output(size); std::vector output_reductions(size / logical_warp_size); - std::vector expected(input.size(), 0); - std::vector expected_reductions(output_reductions.size(), 0); - const T init = test_utils::get_random_value(0, 100, seed_value); + std::vector expected(input.size(), (T)0); + std::vector expected_reductions(output_reductions.size(), (T)0); + const T init = test_utils::get_random_value(0, 100, seed_value); // Calculate expected results on host binary_op_type binary_op; @@ -673,7 +673,7 @@ void warp_scan_kernel( { constexpr unsigned int warps_no = BlockSize / LogicalWarpSize; const unsigned int warp_id = rocprim::detail::logical_warp_id(); - unsigned int index = hipThreadIdx_x + (hipBlockIdx_x * hipBlockDim_x); + unsigned int index = threadIdx.x + (blockIdx.x * blockDim.x); T input = device_input[index]; T inclusive_output, exclusive_output; @@ -695,23 +695,23 @@ TYPED_TEST(RocprimWarpScanTests, Scan) using T = typename TestFixture::params::type; using binary_op_type = typename std::conditional::value, test_utils::half_plus, rocprim::plus>::type; // logical warp side for warp primitive, execution warp size is always rocprim::warp_size() - constexpr size_t logical_warp_size = TestFixture::params::warp_size; + static constexpr size_t logical_warp_size = TestFixture::params::warp_size; // The different warp sizes - constexpr size_t ws32 = size_t(ROCPRIM_WARP_SIZE_32); - constexpr size_t ws64 = size_t(ROCPRIM_WARP_SIZE_64); + static constexpr size_t ws32 = size_t(ROCPRIM_WARP_SIZE_32); + static constexpr size_t ws64 = size_t(ROCPRIM_WARP_SIZE_64); // Block size of warp size 32 - constexpr size_t block_size_ws32 = + static constexpr size_t block_size_ws32 = rocprim::detail::is_power_of_two(logical_warp_size) ? rocprim::max(ws32, logical_warp_size * 4) - : rocprim::max((ws32/logical_warp_size) * logical_warp_size, 1); + : rocprim::max((ws32/logical_warp_size), 1) * logical_warp_size; // Block size of warp size 64 - constexpr size_t block_size_ws64 = + static constexpr size_t block_size_ws64 = rocprim::detail::is_power_of_two(logical_warp_size) ? 
rocprim::max(ws64, logical_warp_size * 4) - : rocprim::max((ws64/logical_warp_size) * logical_warp_size, 1); + : rocprim::max((ws64/logical_warp_size), 1) * logical_warp_size; const unsigned int current_device_warp_size = rocprim::host_warp_size(); @@ -737,9 +737,9 @@ TYPED_TEST(RocprimWarpScanTests, Scan) std::vector input = test_utils::get_random_data(size, 2, 50, seed_value); std::vector output_inclusive(size); std::vector output_exclusive(size); - std::vector expected_inclusive(output_inclusive.size(), 0); - std::vector expected_exclusive(output_exclusive.size(), 0); - const T init = test_utils::get_random_value(0, 100, seed_value); + std::vector expected_inclusive(output_inclusive.size(), (T)0); + std::vector expected_exclusive(output_exclusive.size(), (T)0); + const T init = test_utils::get_random_value(0, 100, seed_value); // Calculate expected results on host binary_op_type binary_op; @@ -848,7 +848,7 @@ void warp_scan_reduce_kernel( { constexpr unsigned int warps_no = BlockSize / LogicalWarpSize; const unsigned int warp_id = rocprim::detail::logical_warp_id(); - unsigned int index = hipThreadIdx_x + (hipBlockIdx_x * hipBlockDim_x); + unsigned int index = threadIdx.x + (blockIdx.x * blockDim.x); T input = device_input[index]; T inclusive_output, exclusive_output, reduction; @@ -859,7 +859,7 @@ void warp_scan_reduce_kernel( device_inclusive_output[index] = inclusive_output; device_exclusive_output[index] = exclusive_output; - if((hipThreadIdx_x % LogicalWarpSize) == 0) + if((threadIdx.x % LogicalWarpSize) == 0) { device_output_reductions[index / LogicalWarpSize] = reduction; } @@ -874,23 +874,23 @@ TYPED_TEST(RocprimWarpScanTests, ScanReduce) using T = typename TestFixture::params::type; using binary_op_type = typename std::conditional::value, test_utils::half_plus, rocprim::plus>::type; // logical warp side for warp primitive, execution warp size is always rocprim::warp_size() - constexpr size_t logical_warp_size = TestFixture::params::warp_size; + static constexpr size_t logical_warp_size = TestFixture::params::warp_size; // The different warp sizes - constexpr size_t ws32 = size_t(ROCPRIM_WARP_SIZE_32); - constexpr size_t ws64 = size_t(ROCPRIM_WARP_SIZE_64); + static constexpr size_t ws32 = size_t(ROCPRIM_WARP_SIZE_32); + static constexpr size_t ws64 = size_t(ROCPRIM_WARP_SIZE_64); // Block size of warp size 32 - constexpr size_t block_size_ws32 = + static constexpr size_t block_size_ws32 = rocprim::detail::is_power_of_two(logical_warp_size) ? rocprim::max(ws32, logical_warp_size * 4) - : rocprim::max((ws32/logical_warp_size) * logical_warp_size, 1); + : rocprim::max((ws32/logical_warp_size), 1) * logical_warp_size; // Block size of warp size 64 - constexpr size_t block_size_ws64 = + static constexpr size_t block_size_ws64 = rocprim::detail::is_power_of_two(logical_warp_size) ? 
rocprim::max(ws64, logical_warp_size * 4) - : rocprim::max((ws64/logical_warp_size) * logical_warp_size, 1); + : rocprim::max((ws64/logical_warp_size), 1) * logical_warp_size; const unsigned int current_device_warp_size = rocprim::host_warp_size(); @@ -917,10 +917,10 @@ TYPED_TEST(RocprimWarpScanTests, ScanReduce) std::vector output_inclusive(size); std::vector output_exclusive(size); std::vector output_reductions(size / logical_warp_size); - std::vector expected_inclusive(output_inclusive.size(), 0); - std::vector expected_exclusive(output_exclusive.size(), 0); - std::vector expected_reductions(output_reductions.size(), 0); - const T init = test_utils::get_random_value(0, 100, seed_value); + std::vector expected_inclusive(output_inclusive.size(), (T)0); + std::vector expected_exclusive(output_exclusive.size(), (T)0); + std::vector expected_reductions(output_reductions.size(), (T)0); + const T init = test_utils::get_random_value(0, 100, seed_value); // Calculate expected results on host binary_op_type binary_op; @@ -1042,23 +1042,23 @@ TYPED_TEST(RocprimWarpScanTests, InclusiveScanCustomType) using base_type = typename TestFixture::params::type; using T = test_utils::custom_test_type; // logical warp side for warp primitive, execution warp size is always rocprim::warp_size() - constexpr size_t logical_warp_size = TestFixture::params::warp_size; + static constexpr size_t logical_warp_size = TestFixture::params::warp_size; // The different warp sizes - constexpr size_t ws32 = size_t(ROCPRIM_WARP_SIZE_32); - constexpr size_t ws64 = size_t(ROCPRIM_WARP_SIZE_64); + static constexpr size_t ws32 = size_t(ROCPRIM_WARP_SIZE_32); + static constexpr size_t ws64 = size_t(ROCPRIM_WARP_SIZE_64); // Block size of warp size 32 - constexpr size_t block_size_ws32 = + static constexpr size_t block_size_ws32 = rocprim::detail::is_power_of_two(logical_warp_size) ? rocprim::max(ws32, logical_warp_size * 4) - : rocprim::max((ws32/logical_warp_size) * logical_warp_size, 1); + : rocprim::max((ws32/logical_warp_size), 1) * logical_warp_size; // Block size of warp size 64 - constexpr size_t block_size_ws64 = + static constexpr size_t block_size_ws64 = rocprim::detail::is_power_of_two(logical_warp_size) ? 
rocprim::max(ws64, logical_warp_size * 4) - : rocprim::max((ws64/logical_warp_size) * logical_warp_size, 1); + : rocprim::max((ws64/logical_warp_size), 1) * logical_warp_size; const unsigned int current_device_warp_size = rocprim::host_warp_size(); @@ -1083,7 +1083,7 @@ TYPED_TEST(RocprimWarpScanTests, InclusiveScanCustomType) // Generate data std::vector input(size); std::vector output(size); - std::vector expected(output.size(), T(0)); + std::vector expected(output.size(), (base_type)0); // Initializing input data { auto random_values = diff --git a/test/rocprim/test_warp_sort.cpp b/test/rocprim/test_warp_sort.cpp index f144747a1..4cfb859d0 100644 --- a/test/rocprim/test_warp_sort.cpp +++ b/test/rocprim/test_warp_sort.cpp @@ -60,7 +60,7 @@ __global__ __launch_bounds__(ROCPRIM_DEFAULT_MAX_BLOCK_SIZE) void test_hip_warp_sort(T* d_output) { - unsigned int i = hipThreadIdx_x + (hipBlockIdx_x * hipBlockDim_x); + unsigned int i = threadIdx.x + (blockIdx.x * blockDim.x); T value = d_output[i]; rocprim::warp_sort wsort; wsort.sort(value); @@ -76,16 +76,16 @@ TYPED_TEST(RocprimWarpSortShuffleBasedTests, Sort) // logical warp side for warp primitive, execution warp size is always rocprim::warp_size() using T = typename TestFixture::params::type; using binary_op_type = typename std::conditional::value, test_utils::half_less, rocprim::less>::type; - constexpr size_t logical_warp_size = TestFixture::params::warp_size; + static constexpr size_t logical_warp_size = TestFixture::params::warp_size; // The different warp sizes - constexpr size_t ws32 = size_t(ROCPRIM_WARP_SIZE_32); - constexpr size_t ws64 = size_t(ROCPRIM_WARP_SIZE_64); + static constexpr size_t ws32 = size_t(ROCPRIM_WARP_SIZE_32); + static constexpr size_t ws64 = size_t(ROCPRIM_WARP_SIZE_64); const unsigned int current_device_warp_size = rocprim::host_warp_size(); const size_t block_size = std::max(current_device_warp_size, logical_warp_size * 4); - constexpr unsigned int grid_size = 4; + static constexpr unsigned int grid_size = 4; const size_t size = block_size * grid_size; // Check if warp size is supported @@ -163,7 +163,7 @@ __global__ __launch_bounds__(ROCPRIM_DEFAULT_MAX_BLOCK_SIZE) void test_hip_sort_key_value_kernel(KeyType* d_output_key, ValueType* d_output_value) { - unsigned int i = hipThreadIdx_x + (hipBlockIdx_x * hipBlockDim_x); + unsigned int i = threadIdx.x + (blockIdx.x * blockDim.x); KeyType key = d_output_key[i]; ValueType value = d_output_value[i]; rocprim::warp_sort wsort; @@ -183,16 +183,16 @@ TYPED_TEST(RocprimWarpSortShuffleBasedTests, SortKeyInt) using pair = test_utils::custom_test_type; using value_op_type = typename std::conditional::value, test_utils::half_less, rocprim::less>::type; using eq_op_type = typename std::conditional::value, test_utils::half_equal_to, rocprim::equal_to>::type; - constexpr size_t logical_warp_size = TestFixture::params::warp_size; + static constexpr size_t logical_warp_size = TestFixture::params::warp_size; // The different warp sizes - constexpr size_t ws32 = size_t(ROCPRIM_WARP_SIZE_32); - constexpr size_t ws64 = size_t(ROCPRIM_WARP_SIZE_64); + static constexpr size_t ws32 = size_t(ROCPRIM_WARP_SIZE_32); + static constexpr size_t ws64 = size_t(ROCPRIM_WARP_SIZE_64); const unsigned int current_device_warp_size = rocprim::host_warp_size(); const size_t block_size = std::max(current_device_warp_size, logical_warp_size * 4); - constexpr unsigned int grid_size = 4; + static constexpr unsigned int grid_size = 4; const size_t size = block_size * grid_size; // Check if warp size is 
supported
diff --git a/test/test_hip_api.cpp b/test/test_hip_api.cpp
index c4ac83f8a..6f8264ade 100644
--- a/test/test_hip_api.cpp
+++ b/test/test_hip_api.cpp
@@ -23,7 +23,7 @@
 #include "common_test_header.hpp"
 
 template<class T>
-T ax(const T a, const T x) __device__
+__device__ T ax(const T a, const T x)
 {
     return x * a;
 }
@@ -32,7 +32,7 @@ template<class T>
 __global__
 void saxpy_kernel(const T * x, T * y, const T a, const size_t size)
 {
-    const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
+    const unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
     if(i < size)
     {
         y[i] += ax(a, x[i]);
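A few notes on the recurring patterns in the test hunks above.

The thread operation test hunks replace the triple angle bracket kernel launches (whose launch configuration and template arguments are not visible in this copy of the patch) with hipLaunchKernelGGL, which also compiles with a plain host compiler when the HIP-CPU runtime is used. A minimal sketch of the pattern, assuming the kernel instance is thread_load_kernel<T>:

    // Sketch: portable launch of a templated HIP kernel.
    // HIP_KERNEL_NAME keeps commas inside a template argument list from being
    // treated as macro argument separators.
    hipLaunchKernelGGL(
        HIP_KERNEL_NAME(thread_load_kernel<T>),
        dim3(grid_size), dim3(block_size),   // grid and block dimensions
        0, 0,                                // dynamic shared memory bytes, stream
        device_input, device_output);        // kernel arguments
    HIP_CHECK(hipGetLastError());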
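The hipMalloc calls in the same hunks gain an explicit pointer cast whose template argument is likewise not shown here; the conventional form is reinterpret_cast<void**>(&ptr), which matches the hipError_t hipMalloc(void**, size_t) signature and does not depend on a templated convenience overload being available in every runtime. Sketch:

    // Sketch: allocation against the void** signature of hipMalloc.
    T* device_input = nullptr;
    HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&device_input),
                        input.size() * sizeof(T)));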
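In the test_utils.hpp hunk, the new is_valid_for_int_distribution trait lost its template arguments in this copy. Its stated purpose (see the cppreference link in the patch) is to restrict std::uniform_int_distribution to the integer types for which it is defined and to fall back to int or unsigned int otherwise. A reconstruction under that assumption; the exact type list in the patch may differ slightly:

    // Sketch: whitelist of types std::uniform_int_distribution supports.
    template<class T>
    struct is_valid_for_int_distribution :
        std::integral_constant<bool,
            std::is_same<short, T>::value ||
            std::is_same<unsigned short, T>::value ||
            std::is_same<int, T>::value ||
            std::is_same<unsigned int, T>::value ||
            std::is_same<long, T>::value ||
            std::is_same<unsigned long, T>::value ||
            std::is_same<long long, T>::value ||
            std::is_same<unsigned long long, T>::value>
    {};

    // Inside get_random_data, narrow types (e.g. char, int8_t) are then generated
    // through int / unsigned int and cast back to T afterwards:
    using dis_type = typename std::conditional<
        is_valid_for_int_distribution<T>::value,
        T,
        typename std::conditional<std::is_signed<T>::value, int, unsigned int>::type
    >::type;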
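The random data helpers now construct the engine directly from the seed instead of seeding a std::random_device backed std::default_random_engine after construction, so a given seed_value always reproduces the same input data. The engine_type and seed_type names below stand in for whatever aliases test_utils.hpp actually defines; a sketch of the pattern:

    // Sketch: reproducible data generation keyed only by seed_value.
    // Needs <random>, <vector>, <algorithm>.
    using engine_type = std::default_random_engine;   // assumed alias
    using seed_type   = unsigned int;                 // assumed alias

    std::vector<float> samples(size_t n, seed_type seed_value)
    {
        engine_type gen{seed_value};   // no std::random_device involved
        std::uniform_real_distribution<float> distribution(0.0f, 1.0f);
        std::vector<float> data(n);
        std::generate(data.begin(), data.end(), [&]() { return distribution(gen); });
        return data;
    }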
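Several warp reduce and scan tests select a host friendly functor when the element type is half; the std::conditional expression lost its arguments here. The likely intent, written out (assumed, not verbatim from the patch):

    // Sketch: use test_utils::half_plus on the host when T is rocprim::half,
    // since half lacks a complete set of host-side arithmetic operators.
    using binary_op_type = typename std::conditional<
        std::is_same<T, rocprim::half>::value,
        test_utils::half_plus,
        rocprim::plus<T>>::type;

The same shape appears with half_less / rocprim::less<T> and half_equal_to / rocprim::equal_to<T> in the sort tests.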
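The host reference computations now spell out their conversions: vectors are filled with (T)0 or (T)1 instead of bare integer literals, and the custom struct accumulator in ReduceSumCustomStruct is built as T value{(base_type)0, (base_type)0}. With -Wall -Wextra -Werror enabled in the CI jobs above, implicit int to half (or int to base_type) conversions are exactly the kind of thing that can now break the build, so the casts keep the reference loops warning clean for every tested element type. Illustrative only:

    // Sketch: explicit conversions in the host-side reference loop.
    std::vector<T> expected(output.size(), (T)0);
    binary_op_type binary_op;
    for(size_t i = 0; i < expected.size(); i++)
    {
        T value = (T)0;                               // was: T value = 0;
        for(size_t j = 0; j < logical_warp_size; j++)
        {
            value = binary_op(value, input[i * logical_warp_size + j]);
        }
        expected[i] = value;
    }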
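Every warp test recomputes its block size from the logical warp size, and the non power of two branch changes from max((ws/lws) * lws, 1) to max(ws/lws, 1) * lws. The two agree whenever the logical warp size fits into the hardware warp size; they differ when it does not, where the old expression collapses to a single thread while the new one still sizes the block for one whole logical warp (presumably so that BlockSize / LogicalWarpSize stays at least 1 inside the kernels). A small check of the arithmetic:

    // Sketch: the two roundings for a logical warp larger than the HW warp.
    constexpr unsigned ws  = 32;  // hardware warp size
    constexpr unsigned lws = 37;  // non-power-of-two logical warp size
    constexpr unsigned old_form = ((ws / lws) * lws > 1) ? (ws / lws) * lws : 1; // == 1
    constexpr unsigned new_form = ((ws / lws) > 1 ? (ws / lws) : 1) * lws;       // == 37
    static_assert(old_form == 1 && new_form == 37, "rounding order matters");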
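The key value sort kernel in test_warp_sort.cpp also lost its template arguments. rocprim::warp_sort is parameterised on the key type, the logical warp size and, optionally, the value type, and its sort() overload taking a key and a value sorts the keys across the logical warp while carrying the values along. A hedged reconstruction of the kernel; the template parameter names and order are assumptions matching the kernel's KeyType / ValueType / LogicalWarpSize parameters, not text copied from the patch:

    // Sketch: per-lane key/value warp sort.
    template<class KeyType, class ValueType, unsigned int LogicalWarpSize>
    __global__ __launch_bounds__(ROCPRIM_DEFAULT_MAX_BLOCK_SIZE)
    void test_hip_sort_key_value_kernel(KeyType* d_output_key, ValueType* d_output_value)
    {
        unsigned int i = threadIdx.x + (blockIdx.x * blockDim.x);
        KeyType key = d_output_key[i];
        ValueType value = d_output_value[i];

        rocprim::warp_sort<KeyType, LogicalWarpSize, ValueType> wsort;
        wsort.sort(key, value);   // keys sorted across the logical warp, values follow

        d_output_key[i] = key;
        d_output_value[i] = value;
    }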