diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index c0caebcea..76984837e 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -81,6 +81,25 @@ variables:
   paths:
     - $CMAKE_LATEST_PATH
 
+.deps-cpu:cmake-latest:
+  stage: build:cmake_latest
+  before_script:
+    - $SUDO_CMD apt update -qq
+    - $SUDO_CMD apt install -y -qq apt-transport-https software-properties-common
+    - $SUDO_CMD add-apt-repository ppa:ubuntu-toolchain-r/test
+    # | Used in the script | Build tools | Fetch from https:// | rocminfo calls lsmod
+    - $SUDO_CMD apt install -y -qq wget tar xz-utils bzip2 libnuma-dev libunwind-dev git build-essential pkg-config ninja-build ca-certificates kmod g++-9
+    # Fetch CMake only if the cache has not been restored
+    - if [ ! -d $CMAKE_LATEST_PATH ]; then mkdir -p $CMAKE_LATEST_PATH; wget --no-check-certificate --quiet -O - $CMAKE_LATEST_URL | tar --strip-components=1 -xz -C $CMAKE_LATEST_PATH;
+    - fi;
+    - export PATH=$CMAKE_LATEST_PATH/bin:$PATH
+    # Debug printing of environment for context when errors occur
+    - hipconfig
+  cache:
+    key: $CMAKE_LATEST_VERSION
+    paths:
+      - $CMAKE_LATEST_PATH
+
 build:cmake-latest:
   extends:
     - .deps:cmake-latest
@@ -92,6 +111,7 @@ build:cmake-latest:
     - cmake
       -G Ninja
       -D CMAKE_CXX_COMPILER=hipcc
+      -D CMAKE_CXX_FLAGS="-Wall -Wextra -Werror"
       -D CMAKE_BUILD_TYPE=Release
       -D BUILD_TEST=ON
       -D BUILD_EXAMPLE=ON
@@ -112,6 +132,31 @@ build:cmake-latest:
       - $BUILD_LATEST_DIR/CTestTestfile.cmake
     expire_in: 2 weeks
 
+# TODO: Enable the hip-cpu CI step
+#build-cpu:cmake-latest:
+#  extends:
+#    - .deps-cpu:cmake-latest
+#  tags:
+#    - s9300
+#    - rocm
+#  script:
+#    - mkdir -p $BUILD_LATEST_DIR
+#    - cd $BUILD_LATEST_DIR
+#    - cmake
+#      -G Ninja
+#      -D CMAKE_CXX_COMPILER=g++-9
+#      -D CMAKE_CXX_FLAGS="-Wall -Wextra"
+#      -D CMAKE_BUILD_TYPE=Release
+#      -D BUILD_TEST=ON
+#      -D BUILD_EXAMPLE=OFF
+#      -D BUILD_BENCHMARK=OFF
+#      -D USE_HIP_CPU=ON
+#      -S $CI_PROJECT_DIR
+#      -B $BUILD_LATEST_DIR
+#    - cmake
+#      --build $BUILD_LATEST_DIR
+#      --parallel 3
+
 build:cmake-minimum:
   extends:
     - .deps:cmake-minimum
@@ -126,6 +171,7 @@ build:cmake-minimum:
     - cmake
       -G Ninja
       -D CMAKE_CXX_COMPILER=hipcc
+      -D CMAKE_CXX_FLAGS="-Wall -Wextra -Werror"
      -D CMAKE_BUILD_TYPE=Release
      -D BUILD_TEST=ON
      -D BUILD_EXAMPLE=ON
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2b7197f82..e83746b71 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,10 @@
 
 Full documentation for rocPRIM is available at [https://codedocs.xyz/ROCmSoftwarePlatform/rocPRIM/](https://codedocs.xyz/ROCmSoftwarePlatform/rocPRIM/)
 
+## [Unreleased rocPRIM-Next]
+### Added
+- Experimental [HIP-CPU](https://github.com/ROCm-Developer-Tools/HIP-CPU) support; builds with GCC/Clang/MSVC on Windows and Linux. It is a work in progress; many algorithms are still known to fail.
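[Editorial aside, not part of the patch] The CHANGELOG entry above only names the new HIP-CPU back-end, while the commented-out build-cpu:cmake-latest job shows the intended configure line (g++-9 with -D USE_HIP_CPU=ON). As a hedged sketch of what host-side execution could look like against such a build, the snippet below calls the device-level rocprim::reduce from plain C++ compiled by a host compiler; the two-phase temporary-storage query and the explicit void** casts mirror the benchmark changes later in this diff, but the exact reduce overload and header layout should be checked against the installed library.

// Hypothetical usage sketch for a HIP-CPU build of rocPRIM (USE_HIP_CPU=ON); not part of this patch.
#include <hip/hip_runtime.h>   // provided by the HIP-CPU runtime in this configuration
#include <rocprim/rocprim.hpp>

#include <cstddef>
#include <iostream>
#include <numeric>
#include <vector>

int main()
{
    const size_t size = 1024;
    std::vector<int> input(size);
    std::iota(input.begin(), input.end(), 1);

    int* d_input = nullptr;
    int* d_output = nullptr;
    // Explicit void** casts, as introduced throughout the benchmarks in this patch.
    hipMalloc(reinterpret_cast<void**>(&d_input), size * sizeof(int));
    hipMalloc(reinterpret_cast<void**>(&d_output), sizeof(int));
    hipMemcpy(d_input, input.data(), size * sizeof(int), hipMemcpyHostToDevice);

    // Two-phase call: query the temporary storage size first, then run the reduction.
    void* d_temp_storage = nullptr;
    size_t temp_storage_bytes = 0;
    rocprim::reduce(d_temp_storage, temp_storage_bytes, d_input, d_output, size, rocprim::plus<int>());
    hipMalloc(&d_temp_storage, temp_storage_bytes);
    rocprim::reduce(d_temp_storage, temp_storage_bytes, d_input, d_output, size, rocprim::plus<int>());

    int result = 0;
    hipMemcpy(&result, d_output, sizeof(int), hipMemcpyDeviceToHost);
    std::cout << "sum = " << result << '\n';   // expected: 1024 * 1025 / 2 = 524800

    hipFree(d_temp_storage);
    hipFree(d_output);
    hipFree(d_input);
    return 0;
}

Under a HIP-CPU build these calls dispatch to the host runtime, which is why the commented-out CI job configures with g++-9 rather than hipcc.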
+ ## [Unreleased rocPRIM-2.10.11 for ROCm 4.4.0] ### Added - Code coverage tools build option diff --git a/CMakeLists.txt b/CMakeLists.txt index f0c3dae44..ca6490c0c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -31,6 +31,14 @@ list( APPEND CMAKE_PREFIX_PATH /opt/rocm/llvm /opt/rocm ) # rocPRIM project project(rocprim LANGUAGES CXX) +# Build options +option(BUILD_TEST "Build tests (requires googletest)" OFF) +option(BUILD_BENCHMARK "Build benchmarks" OFF) +option(BUILD_EXAMPLE "Build examples" OFF) +option(USE_HIP_CPU "Prefer HIP-CPU runtime instead of HW acceleration" OFF) +# Disables building tests, benchmarks, examples +option(ONLY_INSTALL "Only install" OFF) + # CMake modules list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake @@ -46,36 +54,35 @@ endif() set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE CACHE BOOL "Add paths to linker search and installed rpath") -# Get dependencies -include(cmake/Dependencies.cmake) - -# Set the AMDGPU_TARGETS with backward compatiblity -# Use target ID syntax if supported for AMDGPU_TARGETS -if(COMMAND rocm_check_target_ids) - rocm_check_target_ids(DEFAULT_AMDGPU_TARGETS - TARGETS "gfx803;gfx900:xnack-;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack-;gfx90a:xnack+;gfx1030" - ) -else() - # Detect compiler support for target ID - # This section is deprecated. Please use rocm_check_target_ids for future use. - if( CMAKE_CXX_COMPILER MATCHES ".*/hipcc$" ) - execute_process(COMMAND ${CMAKE_CXX_COMPILER} "--help" - OUTPUT_VARIABLE CXX_OUTPUT - OUTPUT_STRIP_TRAILING_WHITESPACE - ERROR_STRIP_TRAILING_WHITESPACE) - string(REGEX MATCH ".mcode\-object\-version" TARGET_ID_SUPPORT ${CXX_OUTPUT}) - endif() - if(TARGET_ID_SUPPORT) - set(DEFAULT_AMDGPU_TARGETS "gfx803;gfx900:xnack-;gfx906:xnack-;gfx908:xnack-") +if(NOT USE_HIP_CPU) + # Set the AMDGPU_TARGETS with backward compatiblity + # Use target ID syntax if supported for AMDGPU_TARGETS + if(COMMAND rocm_check_target_ids) + rocm_check_target_ids(DEFAULT_AMDGPU_TARGETS + TARGETS "gfx803;gfx900:xnack-;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack-;gfx90a:xnack+;gfx1030" + ) else() - set(DEFAULT_AMDGPU_TARGETS "gfx803;gfx900;gfx906;gfx908") + # Detect compiler support for target ID + # This section is deprecated. Please use rocm_check_target_ids for future use. 
+ if( CMAKE_CXX_COMPILER MATCHES ".*/hipcc$" ) + execute_process(COMMAND ${CMAKE_CXX_COMPILER} "--help" + OUTPUT_VARIABLE CXX_OUTPUT + OUTPUT_STRIP_TRAILING_WHITESPACE + ERROR_STRIP_TRAILING_WHITESPACE) + string(REGEX MATCH ".mcode\-object\-version" TARGET_ID_SUPPORT ${CXX_OUTPUT}) + endif() + if(TARGET_ID_SUPPORT) + set(DEFAULT_AMDGPU_TARGETS "gfx803;gfx900:xnack-;gfx906:xnack-;gfx908:xnack-;gfx1030") + else() + set(DEFAULT_AMDGPU_TARGETS "gfx803;gfx900;gfx906;gfx908;gfx1030") + endif() endif() -endif() -set(AMDGPU_TARGETS "${DEFAULT_AMDGPU_TARGETS}" CACHE STRING "List of specific machine types for library to target") -set(AMDGPU_TEST_TARGETS "" CACHE STRING "List of specific device types to test for") # Leave empty for default system device + set(AMDGPU_TARGETS "${DEFAULT_AMDGPU_TARGETS}" CACHE STRING "List of specific machine types for library to target") + set(AMDGPU_TEST_TARGETS "" CACHE STRING "List of specific device types to test for") # Leave empty for default system device -# Verify that hcc compiler is used on ROCM platform -include(cmake/VerifyCompiler.cmake) + # Verify that hcc compiler is used on ROCM platform + include(cmake/VerifyCompiler.cmake) +endif() # Build options # Disable -Werror @@ -98,11 +105,8 @@ set(CMAKE_CXX_STANDARD 14) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS OFF) -if(DISABLE_WERROR) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra") -else() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Werror") -endif() +# Get dependencies +include(cmake/Dependencies.cmake) # Setup VERSION set(VERSION_STRING "2.10.9") diff --git a/README.md b/README.md index 8409a0894..3e098f0c3 100644 --- a/README.md +++ b/README.md @@ -49,6 +49,11 @@ cd rocPRIM; mkdir build; cd build # before 'cmake' or setting cmake option 'CMAKE_CXX_COMPILER' to path to the compiler. # Using HIP-clang: [CXX=hipcc] cmake -DBUILD_BENCHMARK=ON ../. +# +# ! EXPERIMENTAL ! +# Alternatively one may build using the experimental (and highly incomplete) HIP-CPU back-end for host-side +# execution using any C++17 conforming compiler (supported by HIP-CPU). AMDGPU_* options are unavailable in this case. 
+# USE_HIP_CPU - OFF by default # Build make -j4 diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index 95b60c539..9f289745c 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -24,18 +24,40 @@ option(BENCHMARK_CONFIG_TUNING "Benchmark device-level functions using various c function(add_rocprim_benchmark BENCHMARK_SOURCE) get_filename_component(BENCHMARK_TARGET ${BENCHMARK_SOURCE} NAME_WE) + add_executable(${BENCHMARK_TARGET} ${BENCHMARK_SOURCE}) + target_link_libraries(${BENCHMARK_TARGET} PRIVATE - rocprim_hip + rocprim benchmark::benchmark ) - foreach(amdgpu_target ${AMDGPU_TARGETS}) + if(NOT USE_HIP_CPU) + target_link_libraries(${BENCHMARK_TARGET} + PRIVATE + rocprim_hip + ) + else() target_link_libraries(${BENCHMARK_TARGET} PRIVATE - --amdgpu-target=${amdgpu_target} + Threads::Threads + hip_cpu_rt::hip_cpu_rt ) - endforeach() + if(STL_DEPENDS_ON_TBB) + target_link_libraries(${BENCHMARK_TARGET} + PRIVATE + TBB::tbb + ) + endif() + endif() + + target_compile_options(${BENCHMARK_TARGET} + PRIVATE + $<$: + /bigobj # number of sections exceeded object file format limit: compile with /bigobj + > + ) + set_target_properties(${BENCHMARK_TARGET} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/benchmark" diff --git a/benchmark/benchmark_block_discontinuity.cpp b/benchmark/benchmark_block_discontinuity.cpp index 627123b21..fc7b4ea22 100644 --- a/benchmark/benchmark_block_discontinuity.cpp +++ b/benchmark/benchmark_block_discontinuity.cpp @@ -82,13 +82,13 @@ struct flag_heads __device__ static void run(const T * d_input, T * d_output) { - const unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; + const unsigned int lid = threadIdx.x; + const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; rp::block_load_direct_striped(lid, d_input + block_offset, input); - #pragma nounroll + ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { rp::block_discontinuity bdiscontinuity; @@ -125,13 +125,13 @@ struct flag_tails __device__ static void run(const T * d_input, T * d_output) { - const unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; + const unsigned int lid = threadIdx.x; + const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; rp::block_load_direct_striped(lid, d_input + block_offset, input); - #pragma nounroll + ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { rp::block_discontinuity bdiscontinuity; @@ -168,13 +168,13 @@ struct flag_heads_and_tails __device__ static void run(const T * d_input, T * d_output) { - const unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; + const unsigned int lid = threadIdx.x; + const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; rp::block_load_direct_striped(lid, d_input + block_offset, input); - #pragma nounroll + ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { rp::block_discontinuity bdiscontinuity; @@ -217,8 +217,8 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) std::vector input = get_random_data(size, T(0), T(10)); T * d_input; T * d_output; - HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); - HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * 
sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), size * sizeof(T))); HIP_CHECK( hipMemcpy( d_input, input.data(), diff --git a/benchmark/benchmark_block_exchange.cpp b/benchmark/benchmark_block_exchange.cpp index b8c30684c..2cdc964f1 100644 --- a/benchmark/benchmark_block_exchange.cpp +++ b/benchmark/benchmark_block_exchange.cpp @@ -80,13 +80,13 @@ struct blocked_to_striped __device__ static void run(const T * d_input, const unsigned int *, T * d_output) { - const unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; + const unsigned int lid = threadIdx.x; + const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; rp::block_load_direct_striped(lid, d_input + block_offset, input); - #pragma nounroll + ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { rp::block_exchange exchange; @@ -109,13 +109,13 @@ struct striped_to_blocked __device__ static void run(const T * d_input, const unsigned int *, T * d_output) { - const unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; + const unsigned int lid = threadIdx.x; + const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; rp::block_load_direct_striped(lid, d_input + block_offset, input); - #pragma nounroll + ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { rp::block_exchange exchange; @@ -138,13 +138,13 @@ struct blocked_to_warp_striped __device__ static void run(const T * d_input, const unsigned int *, T * d_output) { - const unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; + const unsigned int lid = threadIdx.x; + const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; rp::block_load_direct_striped(lid, d_input + block_offset, input); - #pragma nounroll + ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { rp::block_exchange exchange; @@ -167,13 +167,13 @@ struct warp_striped_to_blocked __device__ static void run(const T * d_input, const unsigned int *, T * d_output) { - const unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; + const unsigned int lid = threadIdx.x; + const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; rp::block_load_direct_striped(lid, d_input + block_offset, input); - #pragma nounroll + ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { rp::block_exchange exchange; @@ -196,15 +196,15 @@ struct scatter_to_blocked __device__ static void run(const T * d_input, const unsigned int * d_ranks, T * d_output) { - const unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; + const unsigned int lid = threadIdx.x; + const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; unsigned int ranks[ItemsPerThread]; rp::block_load_direct_striped(lid, d_input + block_offset, input); rp::block_load_direct_striped(lid, d_ranks + block_offset, ranks); - #pragma nounroll + ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { rp::block_exchange exchange; @@ -227,15 +227,15 @@ struct scatter_to_striped __device__ static void run(const T * d_input, const unsigned int * d_ranks, T * d_output) { - const 
unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; + const unsigned int lid = threadIdx.x; + const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; unsigned int ranks[ItemsPerThread]; rp::block_load_direct_striped(lid, d_input + block_offset, input); rp::block_load_direct_striped(lid, d_ranks + block_offset, ranks); - #pragma nounroll + ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { rp::block_exchange exchange; @@ -277,9 +277,9 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) T * d_input; unsigned int * d_ranks; T * d_output; - HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); - HIP_CHECK(hipMalloc(&d_ranks, size * sizeof(unsigned int))); - HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_ranks), size * sizeof(unsigned int))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), size * sizeof(T))); HIP_CHECK( hipMemcpy( d_input, input.data(), diff --git a/benchmark/benchmark_block_histogram.cpp b/benchmark/benchmark_block_histogram.cpp index 46bcfe5d5..8bc49aaf6 100644 --- a/benchmark/benchmark_block_histogram.cpp +++ b/benchmark/benchmark_block_histogram.cpp @@ -83,8 +83,9 @@ struct histogram __device__ static void run(const T* input, T* output) { - const unsigned int index = ((hipBlockIdx_x * BlockSize) + hipThreadIdx_x) * ItemsPerThread; - unsigned int global_offset = hipBlockIdx_x * BinSize; + // TODO: Move global_offset into final loop + const unsigned int index = ((blockIdx.x * BlockSize) + threadIdx.x) * ItemsPerThread; + unsigned int global_offset = blockIdx.x * BinSize; T values[ItemsPerThread]; for(unsigned int k = 0; k < ItemsPerThread; k++) @@ -96,18 +97,18 @@ struct histogram __shared__ T histogram[BinSize]; __shared__ typename bhistogram_t::storage_type storage; - #pragma nounroll + ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { bhistogram_t().histogram(values, histogram, storage); } - #pragma unroll + ROCPRIM_UNROLL for (unsigned int offset = 0; offset < BinSize; offset += BlockSize) { - if(offset + hipThreadIdx_x < BinSize) + if(offset + threadIdx.x < BinSize) { - output[global_offset + hipThreadIdx_x] = histogram[offset + hipThreadIdx_x]; + output[global_offset + threadIdx.x] = histogram[offset + threadIdx.x]; global_offset += BlockSize; } } @@ -132,8 +133,8 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) std::vector input(size, 0.0f); T * d_input; T * d_output; - HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); - HIP_CHECK(hipMalloc(&d_output, bin_size * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), bin_size * sizeof(T))); HIP_CHECK( hipMemcpy( d_input, input.data(), diff --git a/benchmark/benchmark_block_radix_sort.cpp b/benchmark/benchmark_block_radix_sort.cpp index 09707df88..992a3f2b5 100644 --- a/benchmark/benchmark_block_radix_sort.cpp +++ b/benchmark/benchmark_block_radix_sort.cpp @@ -71,13 +71,13 @@ __global__ __launch_bounds__(BlockSize) void sort_keys_kernel(const T * input, T * output) { - const unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; + const unsigned int lid = threadIdx.x; + const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T keys[ItemsPerThread]; 
rp::block_load_direct_striped(lid, input + block_offset, keys); - #pragma nounroll + ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { rp::block_radix_sort sort; @@ -97,8 +97,8 @@ __global__ __launch_bounds__(BlockSize) void sort_pairs_kernel(const T * input, T * output) { - const unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; + const unsigned int lid = threadIdx.x; + const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T keys[ItemsPerThread]; T values[ItemsPerThread]; @@ -108,7 +108,7 @@ void sort_pairs_kernel(const T * input, T * output) values[i] = keys[i] + T(1); } - #pragma nounroll + ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { rp::block_radix_sort sort; @@ -148,8 +148,8 @@ void run_benchmark(benchmark::State& state, benchmark_kinds benchmark_kind, hipS } T * d_input; T * d_output; - HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); - HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), size * sizeof(T))); HIP_CHECK( hipMemcpy( d_input, input.data(), diff --git a/benchmark/benchmark_block_reduce.cpp b/benchmark/benchmark_block_reduce.cpp index 3ea932a62..1bc7d414f 100644 --- a/benchmark/benchmark_block_reduce.cpp +++ b/benchmark/benchmark_block_reduce.cpp @@ -81,7 +81,7 @@ struct reduce __device__ static void run(const T* input, T* output) { - const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + const unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; T values[ItemsPerThread]; T reduced_value; @@ -93,16 +93,16 @@ struct reduce using breduce_t = rp::block_reduce; __shared__ typename breduce_t::storage_type storage; - #pragma nounroll + ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { breduce_t().reduce(values, reduced_value, storage); values[0] = reduced_value; } - if(hipThreadIdx_x == 0) + if(threadIdx.x == 0) { - output[hipBlockIdx_x] = reduced_value; + output[blockIdx.x] = reduced_value; } } }; @@ -123,8 +123,8 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) std::vector input(size, T(1)); T * d_input; T * d_output; - HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); - HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), size * sizeof(T))); HIP_CHECK( hipMemcpy( d_input, input.data(), diff --git a/benchmark/benchmark_block_scan.cpp b/benchmark/benchmark_block_scan.cpp index 4e150055a..ef95bc5ed 100644 --- a/benchmark/benchmark_block_scan.cpp +++ b/benchmark/benchmark_block_scan.cpp @@ -81,7 +81,7 @@ struct inclusive_scan __device__ static void run(const T* input, T* output) { - const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + const unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; T values[ItemsPerThread]; for(unsigned int k = 0; k < ItemsPerThread; k++) @@ -92,7 +92,7 @@ struct inclusive_scan using bscan_t = rp::block_scan; __shared__ typename bscan_t::storage_type storage; - #pragma nounroll + ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { bscan_t().inclusive_scan(values, values, storage); @@ -118,7 +118,7 @@ struct exclusive_scan __device__ static void run(const T* input, T* output) { - const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + const unsigned int 
i = blockIdx.x * blockDim.x + threadIdx.x; using U = typename std::remove_reference::type; T values[ItemsPerThread]; @@ -132,7 +132,7 @@ struct exclusive_scan using bscan_t = rp::block_scan; __shared__ typename bscan_t::storage_type storage; - #pragma nounroll + ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { bscan_t().exclusive_scan(values, values, init, storage); @@ -162,8 +162,8 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) std::vector input(size, T(1)); T * d_input; T * d_output; - HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); - HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), size * sizeof(T))); HIP_CHECK( hipMemcpy( d_input, input.data(), diff --git a/benchmark/benchmark_block_sort.cpp b/benchmark/benchmark_block_sort.cpp index 8b7108e1f..e56faaa25 100644 --- a/benchmark/benchmark_block_sort.cpp +++ b/benchmark/benchmark_block_sort.cpp @@ -70,11 +70,11 @@ __global__ __launch_bounds__(BlockSize) void sort_keys_kernel(const T * input, T * output) { - const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; + const unsigned int index = (blockIdx.x * BlockSize) + threadIdx.x; T key = input[index]; - #pragma nounroll + ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { rp::block_sort bsort; @@ -93,12 +93,12 @@ __global__ __launch_bounds__(BlockSize) void sort_pairs_kernel(const T * input, T * output) { - const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; + const unsigned int index = (blockIdx.x * BlockSize) + threadIdx.x; T key = input[index]; T value = key + T(1); - #pragma nounroll + ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { rp::block_sort bsort; @@ -115,7 +115,7 @@ template< > void run_benchmark(benchmark::State& state, benchmark_kinds benchmark_kind, hipStream_t stream, size_t N) { - constexpr auto block_size = BlockSize; + static constexpr auto block_size = BlockSize; const auto size = block_size * ((N + block_size - 1)/block_size); std::vector input; @@ -133,8 +133,8 @@ void run_benchmark(benchmark::State& state, benchmark_kinds benchmark_kind, hipS } T * d_input; T * d_output; - HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); - HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), size * sizeof(T))); HIP_CHECK( hipMemcpy( d_input, input.data(), @@ -151,16 +151,16 @@ void run_benchmark(benchmark::State& state, benchmark_kinds benchmark_kind, hipS if(benchmark_kind == benchmark_kinds::sort_keys) { hipLaunchKernelGGL( - HIP_KERNEL_NAME(sort_keys_kernel), - dim3(size/block_size), dim3(BlockSize), 0, stream, + HIP_KERNEL_NAME(sort_keys_kernel), + dim3(size/block_size), dim3(block_size), 0, stream, d_input, d_output ); } else if(benchmark_kind == benchmark_kinds::sort_pairs) { hipLaunchKernelGGL( - HIP_KERNEL_NAME(sort_pairs_kernel), - dim3(size/block_size), dim3(BlockSize), 0, stream, + HIP_KERNEL_NAME(sort_pairs_kernel), + dim3(size/block_size), dim3(block_size), 0, stream, d_input, d_output ); } diff --git a/benchmark/benchmark_device_binary_search.cpp b/benchmark/benchmark_device_binary_search.cpp index 0cce76b18..16291f317 100644 --- a/benchmark/benchmark_device_binary_search.cpp +++ b/benchmark/benchmark_device_binary_search.cpp @@ -82,9 +82,9 @@ void run_lower_bound_benchmark(benchmark::State& 
state, hipStream_t stream, haystack_type * d_haystack; needle_type * d_needles; output_type * d_output; - HIP_CHECK(hipMalloc(&d_haystack, haystack_size * sizeof(haystack_type))); - HIP_CHECK(hipMalloc(&d_needles, needles_size * sizeof(needle_type))); - HIP_CHECK(hipMalloc(&d_output, needles_size * sizeof(output_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_haystack), haystack_size * sizeof(haystack_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_needles), needles_size * sizeof(needle_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), needles_size * sizeof(output_type))); HIP_CHECK( hipMemcpy( d_haystack, haystack.data(), diff --git a/benchmark/benchmark_device_histogram.cpp b/benchmark/benchmark_device_histogram.cpp index be2bba4d8..fa50c53a2 100644 --- a/benchmark/benchmark_device_histogram.cpp +++ b/benchmark/benchmark_device_histogram.cpp @@ -62,7 +62,7 @@ std::vector generate(size_t size, int entropy_reduction, int lower_level, int { if(entropy_reduction >= 5) { - return std::vector(size, (lower_level + upper_level) / 2); + return std::vector(size, (T)((lower_level + upper_level) / 2)); } const size_t max_random_size = 1024 * 1024; @@ -125,8 +125,8 @@ void run_even_benchmark(benchmark::State& state, T * d_input; counter_type * d_histogram; - HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); - HIP_CHECK(hipMalloc(&d_histogram, size * sizeof(counter_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_histogram), size * sizeof(counter_type))); HIP_CHECK( hipMemcpy( d_input, input.data(), @@ -221,10 +221,10 @@ void run_multi_even_benchmark(benchmark::State& state, T * d_input; counter_type * d_histogram[ActiveChannels]; - HIP_CHECK(hipMalloc(&d_input, size * Channels * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * Channels * sizeof(T))); for(unsigned int channel = 0; channel < ActiveChannels; channel++) { - HIP_CHECK(hipMalloc(&d_histogram[channel], bins * sizeof(counter_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_histogram[channel]), bins * sizeof(counter_type))); } HIP_CHECK( hipMemcpy( @@ -312,9 +312,9 @@ void run_range_benchmark(benchmark::State& state, size_t bins, hipStream_t strea T * d_input; T * d_levels; counter_type * d_histogram; - HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); - HIP_CHECK(hipMalloc(&d_levels, (bins + 1) * sizeof(T))); - HIP_CHECK(hipMalloc(&d_histogram, size * sizeof(counter_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_levels), (bins + 1) * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_histogram), size * sizeof(counter_type))); HIP_CHECK( hipMemcpy( d_input, input.data(), diff --git a/benchmark/benchmark_device_memory.cpp b/benchmark/benchmark_device_memory.cpp index 74edd49fe..2e0edae6d 100644 --- a/benchmark/benchmark_device_memory.cpp +++ b/benchmark/benchmark_device_memory.cpp @@ -46,7 +46,6 @@ enum memory_operation_method { - memcpy, block_primitives_transpose, striped, vectorized, @@ -96,11 +95,11 @@ struct operation (void) shared_storage; (void) shared_storage_size; (void) global_mem_output; - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = 0; i < ItemsPerThread; i++) { input[i] = input[i] + 666; - #pragma unroll + ROCPRIM_UNROLL for(unsigned int j = 0; j < repeats; j++) { input[i] = input[i] * (input[j % ItemsPerThread]); @@ -153,9 +152,9 @@ struct operation (void) shared_storage; (void) shared_storage_size; (void) input; - unsigned int index = 
hipThreadIdx_x * ItemsPerThread + - hipBlockIdx_x * hipBlockDim_x * ItemsPerThread; - #pragma unroll + unsigned int index = threadIdx.x * ItemsPerThread + + blockIdx.x * blockDim.x * ItemsPerThread; + ROCPRIM_UNROLL for(unsigned int i = 0; i < ItemsPerThread; i++) { atomicAdd(&global_mem_output[index + i], T(666)); @@ -175,9 +174,9 @@ struct operation (void) shared_storage; (void) shared_storage_size; (void) input; - unsigned int index = (hipThreadIdx_x % warpSize) * ItemsPerThread + - hipBlockIdx_x * hipBlockDim_x * ItemsPerThread; - #pragma unroll + unsigned int index = (threadIdx.x % warpSize) * ItemsPerThread + + blockIdx.x * blockDim.x * ItemsPerThread; + ROCPRIM_UNROLL for(unsigned int i = 0; i < ItemsPerThread; i++) { atomicAdd(&global_mem_output[index + i], T(666)); @@ -197,8 +196,8 @@ struct operation (void) shared_storage; (void) shared_storage_size; (void) input; - unsigned int index = hipThreadIdx_x * ItemsPerThread; - #pragma unroll + unsigned int index = threadIdx.x * ItemsPerThread; + ROCPRIM_UNROLL for(unsigned int i = 0; i < ItemsPerThread; i++) { atomicAdd(&global_mem_output[index + i], T(666)); @@ -236,7 +235,7 @@ void operation_kernel(T* input, T* output, CustomOp op) typename block_store_type::storage_type store; } storage; - int offset = hipBlockIdx_x * items_per_block; + int offset = blockIdx.x * items_per_block; T items[ItemsPerThread]; load.load(input + offset, items, storage.load); @@ -260,17 +259,17 @@ __launch_bounds__(BlockSize) void operation_kernel(T* input, T* output, CustomOp op) { constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; - int offset = hipBlockIdx_x * items_per_block; + int offset = blockIdx.x * items_per_block; T items[ItemsPerThread]; rocprim::block_load_direct_blocked_vectorized - (hipThreadIdx_x, input + offset, items); + (threadIdx.x, input + offset, items); __syncthreads(); op(items, nullptr, 0, output); rocprim::block_store_direct_blocked_vectorized - (hipThreadIdx_x, output + offset, items); + (threadIdx.x, output + offset, items); } // striped method base kernel @@ -287,8 +286,8 @@ __global__ __launch_bounds__(BlockSize) void operation_kernel(T* input, T* output, CustomOp op) { - const unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; + const unsigned int lid = threadIdx.x; + const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T items[ItemsPerThread]; rocprim::block_load_direct_striped(lid, input + block_offset, items); op(items, nullptr, 0, output); @@ -325,7 +324,7 @@ void operation_kernel(T* input, T* output, CustomOp op) typename block_store_type::storage_type store; } storage; - int offset = hipBlockIdx_x * items_per_block; + int offset = blockIdx.x * items_per_block; T items[ItemsPerThread]; load.load(input + offset, items, storage.load); @@ -361,8 +360,8 @@ void run_benchmark(benchmark::State& state, } T * d_input; T * d_output; - HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); - HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), size * sizeof(T))); HIP_CHECK( hipMemcpy( d_input, input.data(), @@ -432,8 +431,8 @@ void run_benchmark_memcpy(benchmark::State& state, } T * d_input; T * d_output; - HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); - HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * sizeof(T))); + 
HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), size * sizeof(T))); // Warm-up for(size_t i = 0; i < 10; i++) { diff --git a/benchmark/benchmark_device_merge.cpp b/benchmark/benchmark_device_merge.cpp index 464c6d104..4c7109935 100644 --- a/benchmark/benchmark_device_merge.cpp +++ b/benchmark/benchmark_device_merge.cpp @@ -78,9 +78,9 @@ void run_merge_keys_benchmark(benchmark::State& state, hipStream_t stream, size_ key_type * d_keys_input1; key_type * d_keys_input2; key_type * d_keys_output; - HIP_CHECK(hipMalloc(&d_keys_input1, size1 * sizeof(key_type))); - HIP_CHECK(hipMalloc(&d_keys_input2, size2 * sizeof(key_type))); - HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_input1), size1 * sizeof(key_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_input2), size2 * sizeof(key_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_output), size * sizeof(key_type))); HIP_CHECK( hipMemcpy( d_keys_input1, keys_input1.data(), @@ -179,12 +179,12 @@ void run_merge_pairs_benchmark(benchmark::State& state, hipStream_t stream, size value_type * d_values_input1; value_type * d_values_input2; value_type * d_values_output; - HIP_CHECK(hipMalloc(&d_keys_input1, size1 * sizeof(key_type))); - HIP_CHECK(hipMalloc(&d_keys_input2, size2 * sizeof(key_type))); - HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); - HIP_CHECK(hipMalloc(&d_values_input1, size1 * sizeof(value_type))); - HIP_CHECK(hipMalloc(&d_values_input2, size2 * sizeof(value_type))); - HIP_CHECK(hipMalloc(&d_values_output, size * sizeof(value_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_input1), size1 * sizeof(key_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_input2), size2 * sizeof(key_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_output), size * sizeof(key_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_values_input1), size1 * sizeof(value_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_values_input2), size2 * sizeof(value_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_values_output), size * sizeof(value_type))); HIP_CHECK( hipMemcpy( d_keys_input1, keys_input1.data(), diff --git a/benchmark/benchmark_device_merge_sort.cpp b/benchmark/benchmark_device_merge_sort.cpp index c3ca8de70..9d4cd83b4 100644 --- a/benchmark/benchmark_device_merge_sort.cpp +++ b/benchmark/benchmark_device_merge_sort.cpp @@ -80,8 +80,8 @@ void run_sort_keys_benchmark(benchmark::State& state, hipStream_t stream, size_t key_type * d_keys_input; key_type * d_keys_output; - HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); - HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_input), size * sizeof(key_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_output), size * sizeof(key_type))); HIP_CHECK( hipMemcpy( d_keys_input, keys_input.data(), @@ -173,8 +173,8 @@ void run_sort_pairs_benchmark(benchmark::State& state, hipStream_t stream, size_ key_type * d_keys_input; key_type * d_keys_output; - HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); - HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_input), size * sizeof(key_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_output), size * sizeof(key_type))); HIP_CHECK( hipMemcpy( d_keys_input, keys_input.data(), @@ -185,8 +185,8 @@ void run_sort_pairs_benchmark(benchmark::State& state, hipStream_t stream, size_ value_type * d_values_input; value_type * 
d_values_output; - HIP_CHECK(hipMalloc(&d_values_input, size * sizeof(value_type))); - HIP_CHECK(hipMalloc(&d_values_output, size * sizeof(value_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_values_input), size * sizeof(value_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_values_output), size * sizeof(value_type))); HIP_CHECK( hipMemcpy( d_values_input, values_input.data(), diff --git a/benchmark/benchmark_device_partition.cpp b/benchmark/benchmark_device_partition.cpp index f9a55ed67..c12710533 100644 --- a/benchmark/benchmark_device_partition.cpp +++ b/benchmark/benchmark_device_partition.cpp @@ -80,10 +80,10 @@ void run_flagged_benchmark(benchmark::State& state, FlagType * d_flags; T * d_output; unsigned int * d_selected_count_output; - HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T))); - HIP_CHECK(hipMalloc(&d_flags, flags.size() * sizeof(FlagType))); - HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(T))); - HIP_CHECK(hipMalloc(&d_selected_count_output, sizeof(unsigned int))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), input.size() * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_flags), flags.size() * sizeof(FlagType))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), input.size() * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_selected_count_output), sizeof(unsigned int))); HIP_CHECK( hipMemcpy( d_input, input.data(), @@ -186,9 +186,9 @@ void run_if_benchmark(benchmark::State& state, T * d_input; T * d_output; unsigned int * d_selected_count_output; - HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T))); - HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(T))); - HIP_CHECK(hipMalloc(&d_selected_count_output, sizeof(unsigned int))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), input.size() * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), input.size() * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_selected_count_output), sizeof(unsigned int))); HIP_CHECK( hipMemcpy( d_input, input.data(), diff --git a/benchmark/benchmark_device_radix_sort.cpp b/benchmark/benchmark_device_radix_sort.cpp index 268c4d137..9d678fe59 100644 --- a/benchmark/benchmark_device_radix_sort.cpp +++ b/benchmark/benchmark_device_radix_sort.cpp @@ -80,10 +80,10 @@ void run_sort_keys_benchmark(benchmark::State& state, key_type * d_keys_input; key_type * d_keys_output; - - HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); - HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); - + + HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_input), size * sizeof(key_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_output), size * sizeof(key_type))); + HIP_CHECK( hipMemcpy( d_keys_input, keys_input->data(), @@ -167,10 +167,10 @@ void run_sort_pairs_benchmark(benchmark::State& state, key_type * d_keys_input; key_type * d_keys_output; - - HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); - HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); - + + HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_input), size * sizeof(key_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_output), size * sizeof(key_type))); + HIP_CHECK( hipMemcpy( d_keys_input, keys_input->data(), @@ -181,10 +181,10 @@ void run_sort_pairs_benchmark(benchmark::State& state, value_type * d_values_input; value_type * d_values_output; - - HIP_CHECK(hipMalloc(&d_values_input, size * sizeof(value_type))); - HIP_CHECK(hipMalloc(&d_values_output, size * sizeof(value_type))); - + + HIP_CHECK(hipMalloc(reinterpret_cast(&d_values_input), size * 
sizeof(value_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_values_output), size * sizeof(value_type))); + HIP_CHECK( hipMemcpy( d_values_input, values_input.data(), @@ -259,95 +259,251 @@ void run_sort_pairs_benchmark(benchmark::State& state, #ifdef BENCHMARK_CONFIG_TUNING -#define CREATE_SORT_KEYS_BENCHMARK1(Key, LRB, SRB, BS1, IPT1, BS2, IPT2) \ -benchmarks.push_back( \ - benchmark::RegisterBenchmark( \ - (std::string("sort_keys") + "<" #Key ", radix_sort_config<" #LRB ", " #SRB ", kernel_config<" #BS1 ", " #IPT1 ">, kernel_config<" #BS2 ", " #IPT2 "> >").c_str(), \ - [=](benchmark::State& state) { run_sort_keys_benchmark, rocprim::kernel_config > >(state, stream, size, keys_input); } \ - ) \ -); - -#define CREATE_SORT_KEYS_BENCHMARK2(Key, LRB, SRB, BS1, IPT1, BS2) \ - CREATE_SORT_KEYS_BENCHMARK1(Key, LRB, SRB, BS1, IPT1, BS2, 2) \ - CREATE_SORT_KEYS_BENCHMARK1(Key, LRB, SRB, BS1, IPT1, BS2, 3) \ - CREATE_SORT_KEYS_BENCHMARK1(Key, LRB, SRB, BS1, IPT1, BS2, 4) \ - CREATE_SORT_KEYS_BENCHMARK1(Key, LRB, SRB, BS1, IPT1, BS2, 5) \ - CREATE_SORT_KEYS_BENCHMARK1(Key, LRB, SRB, BS1, IPT1, BS2, 6) \ - CREATE_SORT_KEYS_BENCHMARK1(Key, LRB, SRB, BS1, IPT1, BS2, 7) \ - CREATE_SORT_KEYS_BENCHMARK1(Key, LRB, SRB, BS1, IPT1, BS2, 8) \ - CREATE_SORT_KEYS_BENCHMARK1(Key, LRB, SRB, BS1, IPT1, BS2, 9) \ - CREATE_SORT_KEYS_BENCHMARK1(Key, LRB, SRB, BS1, IPT1, BS2, 10) \ - CREATE_SORT_KEYS_BENCHMARK1(Key, LRB, SRB, BS1, IPT1, BS2, 11) \ - CREATE_SORT_KEYS_BENCHMARK1(Key, LRB, SRB, BS1, IPT1, BS2, 12) \ - CREATE_SORT_KEYS_BENCHMARK1(Key, LRB, SRB, BS1, IPT1, BS2, 13) \ - CREATE_SORT_KEYS_BENCHMARK1(Key, LRB, SRB, BS1, IPT1, BS2, 14) \ - CREATE_SORT_KEYS_BENCHMARK1(Key, LRB, SRB, BS1, IPT1, BS2, 15) \ - CREATE_SORT_KEYS_BENCHMARK1(Key, LRB, SRB, BS1, IPT1, BS2, 16) \ - CREATE_SORT_KEYS_BENCHMARK1(Key, LRB, SRB, BS1, IPT1, BS2, 17) \ - CREATE_SORT_KEYS_BENCHMARK1(Key, LRB, SRB, BS1, IPT1, BS2, 18) \ - CREATE_SORT_KEYS_BENCHMARK1(Key, LRB, SRB, BS1, IPT1, BS2, 19) \ - CREATE_SORT_KEYS_BENCHMARK1(Key, LRB, SRB, BS1, IPT1, BS2, 20) - -#define CREATE_SORT_KEYS_BENCHMARK3(Key, BS1, IPT1) \ - CREATE_SORT_KEYS_BENCHMARK2(Key, 4, 3, BS1, IPT1, 256) \ - CREATE_SORT_KEYS_BENCHMARK2(Key, 5, 4, BS1, IPT1, 256) \ - CREATE_SORT_KEYS_BENCHMARK2(Key, 6, 4, BS1, IPT1, 256) \ - CREATE_SORT_KEYS_BENCHMARK2(Key, 7, 6, BS1, IPT1, 256) \ - CREATE_SORT_KEYS_BENCHMARK2(Key, 8, 7, BS1, IPT1, 256) +template< + typename Key, typename Value, + unsigned int LRB, unsigned int SRB, + unsigned int BlockSize1, unsigned int ItemsPerThread1, + unsigned int BlockSize2, unsigned int ItemsPerThread2 +> +auto sort_keys_add_benchmark( + std::vector& benchmarks, + hipStream_t stream, + size_t size) + -> typename std::enable_if< + std::is_same::value, void + >::type +{ + auto keys_input = std::make_shared>(generate_keys(size)); + benchmarks.push_back( + benchmark::RegisterBenchmark( + (std::string("sort_keys") + "<" + typeid(Key).name() + "radix_sort_config<" + + std::to_string(LRB) + ", " + std::to_string(SRB) + ", kernel_config<" + + std::to_string(BlockSize1) + ", " + std::to_string(ItemsPerThread1) + ">, kernel_config<" + + std::to_string(BlockSize2) + ", " + std::to_string(ItemsPerThread2) + "> >").c_str(), + [=](benchmark::State& state) { + run_sort_keys_benchmark< + Key, + rocprim::radix_sort_config< + LRB, + SRB, + rocprim::kernel_config, + rocprim::kernel_config + > + >(state, stream, size, keys_input); + } + ) + ); +} -#define CREATE_SORT_KEYS_BENCHMARK(Key) \ - { \ - auto keys_input = std::make_shared>(generate_keys(size)); \ - 
CREATE_SORT_KEYS_BENCHMARK3(Key, 256, 1) \ - CREATE_SORT_KEYS_BENCHMARK3(Key, 256, 2) \ - CREATE_SORT_KEYS_BENCHMARK3(Key, 256, 4) \ - CREATE_SORT_KEYS_BENCHMARK3(Key, 256, 8) \ - } +template< + typename Key, typename Value, + unsigned int LRB, unsigned int SRB, + unsigned int BlockSize1, unsigned int ItemsPerThread1, + unsigned int BlockSize2, unsigned int ItemsPerThread2 +> +auto sort_keys_add_benchmark( + std::vector& benchmarks, + hipStream_t stream, + size_t size) + -> typename std::enable_if< + !std::is_same::value, void + >::type +{ + auto keys_input = std::make_shared>(generate_keys(size)); + benchmarks.push_back( + benchmark::RegisterBenchmark( + (std::string("sort_pairs") + "<" + typeid(Key).name() + "," + typeid(Value).name() + + "radix_sort_config<" + std::to_string(LRB) + ", " + std::to_string(SRB) + ", kernel_config<" + + std::to_string(BlockSize1) + ", " + std::to_string(ItemsPerThread1) + ">, kernel_config<" + + std::to_string(BlockSize2) + ", " + std::to_string(ItemsPerThread2) + "> >").c_str(), + [=](benchmark::State& state) { + run_sort_pairs_benchmark< + Key, Value, + rocprim::radix_sort_config< + LRB, + SRB, + rocprim::kernel_config, + rocprim::kernel_config + > + >(state, stream, size, keys_input); + } + ) + ); +} -#define CREATE_SORT_PAIRS_BENCHMARK1(Key, Value, LRB, SRB, BS1, IPT1, BS2, IPT2) \ -benchmarks.push_back( \ - benchmark::RegisterBenchmark( \ - (std::string("sort_pairs") + "<" #Key ", " #Value ", radix_sort_config<" #LRB ", " #SRB ", kernel_config<" #BS1 ", " #IPT1 ">, kernel_config<" #BS2 ", " #IPT2 "> >").c_str(), \ - [=](benchmark::State& state) { run_sort_pairs_benchmark, rocprim::kernel_config > >(state, stream, size, keys_input); } \ - ) \ -); - -#define CREATE_SORT_PAIRS_BENCHMARK2(Key, Value, LRB, SRB, BS1, IPT1, BS2) \ - CREATE_SORT_PAIRS_BENCHMARK1(Key, Value, LRB, SRB, BS1, IPT1, BS2, 2) \ - CREATE_SORT_PAIRS_BENCHMARK1(Key, Value, LRB, SRB, BS1, IPT1, BS2, 3) \ - CREATE_SORT_PAIRS_BENCHMARK1(Key, Value, LRB, SRB, BS1, IPT1, BS2, 4) \ - CREATE_SORT_PAIRS_BENCHMARK1(Key, Value, LRB, SRB, BS1, IPT1, BS2, 5) \ - CREATE_SORT_PAIRS_BENCHMARK1(Key, Value, LRB, SRB, BS1, IPT1, BS2, 6) \ - CREATE_SORT_PAIRS_BENCHMARK1(Key, Value, LRB, SRB, BS1, IPT1, BS2, 7) \ - CREATE_SORT_PAIRS_BENCHMARK1(Key, Value, LRB, SRB, BS1, IPT1, BS2, 8) \ - CREATE_SORT_PAIRS_BENCHMARK1(Key, Value, LRB, SRB, BS1, IPT1, BS2, 9) \ - CREATE_SORT_PAIRS_BENCHMARK1(Key, Value, LRB, SRB, BS1, IPT1, BS2, 10) \ - CREATE_SORT_PAIRS_BENCHMARK1(Key, Value, LRB, SRB, BS1, IPT1, BS2, 11) \ - CREATE_SORT_PAIRS_BENCHMARK1(Key, Value, LRB, SRB, BS1, IPT1, BS2, 12) \ - CREATE_SORT_PAIRS_BENCHMARK1(Key, Value, LRB, SRB, BS1, IPT1, BS2, 13) \ - CREATE_SORT_PAIRS_BENCHMARK1(Key, Value, LRB, SRB, BS1, IPT1, BS2, 14) \ - CREATE_SORT_PAIRS_BENCHMARK1(Key, Value, LRB, SRB, BS1, IPT1, BS2, 15) \ - CREATE_SORT_PAIRS_BENCHMARK1(Key, Value, LRB, SRB, BS1, IPT1, BS2, 16) \ - CREATE_SORT_PAIRS_BENCHMARK1(Key, Value, LRB, SRB, BS1, IPT1, BS2, 17) \ - CREATE_SORT_PAIRS_BENCHMARK1(Key, Value, LRB, SRB, BS1, IPT1, BS2, 18) \ - CREATE_SORT_PAIRS_BENCHMARK1(Key, Value, LRB, SRB, BS1, IPT1, BS2, 19) \ - CREATE_SORT_PAIRS_BENCHMARK1(Key, Value, LRB, SRB, BS1, IPT1, BS2, 20) - -#define CREATE_SORT_PAIRS_BENCHMARK3(Key, Value, BS1, IPT1) \ - CREATE_SORT_PAIRS_BENCHMARK2(Key, Value, 4, 3, BS1, IPT1, 256) \ - CREATE_SORT_PAIRS_BENCHMARK2(Key, Value, 5, 4, BS1, IPT1, 256) \ - CREATE_SORT_PAIRS_BENCHMARK2(Key, Value, 6, 4, BS1, IPT1, 256) \ - CREATE_SORT_PAIRS_BENCHMARK2(Key, Value, 7, 6, BS1, IPT1, 256) \ - 
CREATE_SORT_PAIRS_BENCHMARK2(Key, Value, 8, 7, BS1, IPT1, 256) +template< + typename Key, typename Value, + unsigned int LRB, unsigned int SRB, + unsigned int BlockSize1, unsigned int ItemsPerThread1, + unsigned int BlockSize2, unsigned int ItemsPerThread2, + unsigned int MaxItemsPerThread1 +> +auto sort_keys_benchmark_generate_ipt1_grid( + std::vector& benchmarks, + hipStream_t stream, + size_t size) + -> typename std::enable_if< ItemsPerThread1 == MaxItemsPerThread1, void>::type +{ + sort_keys_add_benchmark< + Key, Value, LRB, SRB, + BlockSize1, ItemsPerThread1, + BlockSize2, ItemsPerThread2 + >(benchmarks, stream, size); +} -#define CREATE_SORT_PAIRS_BENCHMARK(Key, Value) \ - { \ - auto keys_input = std::make_shared>(generate_keys(size)); \ - CREATE_SORT_PAIRS_BENCHMARK3(Key, Value, 256, 1) \ - CREATE_SORT_PAIRS_BENCHMARK3(Key, Value, 256, 2) \ - CREATE_SORT_PAIRS_BENCHMARK3(Key, Value, 256, 4) \ - CREATE_SORT_PAIRS_BENCHMARK3(Key, Value, 256, 8) \ - } +template< + typename Key, typename Value, + unsigned int LRB, unsigned int SRB, + unsigned int BlockSize1, unsigned int ItemsPerThread1, + unsigned int BlockSize2, unsigned int ItemsPerThread2, + unsigned int MaxItemsPerThread1 +> +auto sort_keys_benchmark_generate_ipt1_grid( + std::vector& benchmarks, + hipStream_t stream, + size_t size) + -> typename std::enable_if< ItemsPerThread1 < MaxItemsPerThread1, void>::type +{ + sort_keys_add_benchmark< + Key, Value, LRB, SRB, + BlockSize1, ItemsPerThread1, + BlockSize2, ItemsPerThread2 + >(benchmarks, stream, size); + + sort_keys_benchmark_generate_ipt1_grid< + Key, Value, LRB, SRB, + BlockSize1, ItemsPerThread1 + 1, + BlockSize2, ItemsPerThread2, + MaxItemsPerThread1 + >(benchmarks, stream, size); +} + +template< + typename Key, typename Value, + unsigned int BlockSize1, + unsigned int BlockSize2, unsigned int ItemsPerThread2, + unsigned int MaxItemsPerThread1 +> +void sort_keys_benchmark_generate_radix_grid( + std::vector& benchmarks, + hipStream_t stream, + size_t size) +{ + sort_keys_benchmark_generate_ipt1_grid< + Key, Value, 4, 3, + BlockSize1, 1, + BlockSize2, ItemsPerThread2, + MaxItemsPerThread1 + >(benchmarks, stream, size); + + sort_keys_benchmark_generate_ipt1_grid< + Key, Value, 5, 4, + BlockSize1, 1, + BlockSize2, ItemsPerThread2, + MaxItemsPerThread1 + >(benchmarks, stream, size); + + sort_keys_benchmark_generate_ipt1_grid< + Key, Value, 6, 4, + BlockSize1, 1, + BlockSize2, ItemsPerThread2, + MaxItemsPerThread1 + >(benchmarks, stream, size); + + sort_keys_benchmark_generate_ipt1_grid< + Key, Value, 7, 6, + BlockSize1, 1, + BlockSize2, ItemsPerThread2, + MaxItemsPerThread1 + >(benchmarks, stream, size); + + sort_keys_benchmark_generate_ipt1_grid< + Key, Value, 8, 7, + BlockSize1, 1, + BlockSize2, ItemsPerThread2, + MaxItemsPerThread1 + >(benchmarks, stream, size); +} + +template< + typename Key, typename Value = ::rocprim::empty_type, + unsigned int BlockSize1 = 256U, + unsigned int BlockSize2 = 256U, + unsigned int MaxItemsPerThread1 = 20U +> +void sort_keys_benchmark_generate( + std::vector& benchmarks, + hipStream_t stream, + size_t size) +{ + sort_keys_benchmark_generate_radix_grid< + Key, Value, + BlockSize1, + BlockSize2, 1, + MaxItemsPerThread1 + >(benchmarks, stream, size); + + sort_keys_benchmark_generate_radix_grid< + Key, Value, + BlockSize1, + BlockSize2, 2, + MaxItemsPerThread1 + >(benchmarks, stream, size); + + sort_keys_benchmark_generate_radix_grid< + Key, Value, + BlockSize1, + BlockSize2, 4, + MaxItemsPerThread1 + >(benchmarks, stream, size); + + 
sort_keys_benchmark_generate_radix_grid< + Key, Value, + BlockSize1, + BlockSize2, 8, + MaxItemsPerThread1 + >(benchmarks, stream, size); +} + +// Compilation may never finish, if the compiler needs to compile too many kernels, +// it is recommended to compile benchmarks only for 1-2 types when BENCHMARK_CONFIG_TUNING is used +// (all other sort_keys_benchmark_generate should be commented/removed). +void add_sort_keys_benchmarks(std::vector& benchmarks, + hipStream_t stream, + size_t size) +{ + sort_keys_benchmark_generate(benchmarks, stream, size); + sort_keys_benchmark_generate(benchmarks, stream, size); + sort_keys_benchmark_generate(benchmarks, stream, size); + //sort_keys_benchmark_generate(benchmarks, stream, size); + sort_keys_benchmark_generate(benchmarks, stream, size); + sort_keys_benchmark_generate(benchmarks, stream, size); +} + +void add_sort_pairs_benchmarks(std::vector& benchmarks, + hipStream_t stream, + size_t size) +{ + //using custom_float2 = custom_type; + using custom_double2 = custom_type; + + sort_keys_benchmark_generate(benchmarks, stream, size); + sort_keys_benchmark_generate(benchmarks, stream, size); + //sort_keys_benchmark_generate(benchmarks, stream, size); + //sort_keys_benchmark_generate(benchmarks, stream, size); + //sort_keys_benchmark_generate(benchmarks, stream, size); + sort_keys_benchmark_generate(benchmarks, stream, size); + + sort_keys_benchmark_generate(benchmarks, stream, size); + sort_keys_benchmark_generate(benchmarks, stream, size); + //sort_keys_benchmark_generate(benchmarks, stream, size); + //sort_keys_benchmark_generate(benchmarks, stream, size); + //sort_keys_benchmark_generate(benchmarks, stream, size); + sort_keys_benchmark_generate(benchmarks, stream, size); + sort_keys_benchmark_generate(benchmarks, stream, size); + sort_keys_benchmark_generate(benchmarks, stream, size); + sort_keys_benchmark_generate(benchmarks, stream, size); +} #else // BENCHMARK_CONFIG_TUNING @@ -373,48 +529,47 @@ benchmarks.push_back( \ ); \ } -#endif // BENCHMARK_CONFIG_TUNING - -// Compilation may never finish, if the compiler needs to compile too many kernels, -// it is recommended to compile benchmarks only for 1-2 types when BENCHMARK_CONFIG_TUNING is used -// (all other CREATE_*_BENCHMARK should be commented/removed). - -void add_sort_keys_benchmarks(std::vector& benchmarks, - hipStream_t stream, - size_t size) -{ - CREATE_SORT_KEYS_BENCHMARK(int) - CREATE_SORT_KEYS_BENCHMARK(long long) - CREATE_SORT_KEYS_BENCHMARK(int8_t) - CREATE_SORT_KEYS_BENCHMARK(uint8_t) - CREATE_SORT_KEYS_BENCHMARK(rocprim::half) - CREATE_SORT_KEYS_BENCHMARK(short) -} + // Compilation may never finish, if the compiler needs to compile too many kernels, + // it is recommended to compile benchmarks only for 1-2 types when BENCHMARK_CONFIG_TUNING is used + // (all other CREATE_*_BENCHMARK should be commented/removed). 
+ void add_sort_keys_benchmarks(std::vector& benchmarks, + hipStream_t stream, + size_t size) + { + CREATE_SORT_KEYS_BENCHMARK(int) + CREATE_SORT_KEYS_BENCHMARK(long long) + CREATE_SORT_KEYS_BENCHMARK(int8_t) + CREATE_SORT_KEYS_BENCHMARK(uint8_t) + CREATE_SORT_KEYS_BENCHMARK(rocprim::half) + CREATE_SORT_KEYS_BENCHMARK(short) + } -void add_sort_pairs_benchmarks(std::vector& benchmarks, - hipStream_t stream, - size_t size) -{ - using custom_float2 = custom_type; - using custom_double2 = custom_type; + void add_sort_pairs_benchmarks(std::vector& benchmarks, + hipStream_t stream, + size_t size) + { + using custom_float2 = custom_type; + using custom_double2 = custom_type; + + CREATE_SORT_PAIRS_BENCHMARK(int, float) + CREATE_SORT_PAIRS_BENCHMARK(int, double) + CREATE_SORT_PAIRS_BENCHMARK(int, float2) + CREATE_SORT_PAIRS_BENCHMARK(int, custom_float2) + CREATE_SORT_PAIRS_BENCHMARK(int, double2) + CREATE_SORT_PAIRS_BENCHMARK(int, custom_double2) + + CREATE_SORT_PAIRS_BENCHMARK(long long, float) + CREATE_SORT_PAIRS_BENCHMARK(long long, double) + CREATE_SORT_PAIRS_BENCHMARK(long long, float2) + CREATE_SORT_PAIRS_BENCHMARK(long long, custom_float2) + CREATE_SORT_PAIRS_BENCHMARK(long long, double2) + CREATE_SORT_PAIRS_BENCHMARK(long long, custom_double2) + CREATE_SORT_PAIRS_BENCHMARK(int8_t, int8_t) + CREATE_SORT_PAIRS_BENCHMARK(uint8_t, uint8_t) + CREATE_SORT_PAIRS_BENCHMARK(rocprim::half, rocprim::half) + } - CREATE_SORT_PAIRS_BENCHMARK(int, float) - CREATE_SORT_PAIRS_BENCHMARK(int, double) - CREATE_SORT_PAIRS_BENCHMARK(int, float2) - CREATE_SORT_PAIRS_BENCHMARK(int, custom_float2) - CREATE_SORT_PAIRS_BENCHMARK(int, double2) - CREATE_SORT_PAIRS_BENCHMARK(int, custom_double2) - - CREATE_SORT_PAIRS_BENCHMARK(long long, float) - CREATE_SORT_PAIRS_BENCHMARK(long long, double) - CREATE_SORT_PAIRS_BENCHMARK(long long, float2) - CREATE_SORT_PAIRS_BENCHMARK(long long, custom_float2) - CREATE_SORT_PAIRS_BENCHMARK(long long, double2) - CREATE_SORT_PAIRS_BENCHMARK(long long, custom_double2) - CREATE_SORT_PAIRS_BENCHMARK(int8_t, int8_t) - CREATE_SORT_PAIRS_BENCHMARK(uint8_t, uint8_t) - CREATE_SORT_PAIRS_BENCHMARK(rocprim::half, rocprim::half) -} +#endif // BENCHMARK_CONFIG_TUNING int main(int argc, char *argv[]) { diff --git a/benchmark/benchmark_device_reduce.cpp b/benchmark/benchmark_device_reduce.cpp index bbf9f74be..ad7c27076 100644 --- a/benchmark/benchmark_device_reduce.cpp +++ b/benchmark/benchmark_device_reduce.cpp @@ -70,8 +70,8 @@ void run_benchmark(benchmark::State& state, T * d_input; T * d_output; - HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); - HIP_CHECK(hipMalloc(&d_output, sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), sizeof(T))); HIP_CHECK( hipMemcpy( d_input, input.data(), diff --git a/benchmark/benchmark_device_reduce_by_key.cpp b/benchmark/benchmark_device_reduce_by_key.cpp index aee6be050..61a7ef062 100644 --- a/benchmark/benchmark_device_reduce_by_key.cpp +++ b/benchmark/benchmark_device_reduce_by_key.cpp @@ -86,7 +86,7 @@ void run_benchmark(benchmark::State& state, size_t max_length, hipStream_t strea std::iota(values_input.begin(), values_input.end(), 0); key_type * d_keys_input; - HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_input), size * sizeof(key_type))); HIP_CHECK( hipMemcpy( d_keys_input, keys_input.data(), @@ -96,7 +96,7 @@ void run_benchmark(benchmark::State& state, size_t max_length, hipStream_t strea ); value_type * 
d_values_input; - HIP_CHECK(hipMalloc(&d_values_input, size * sizeof(value_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_values_input), size * sizeof(value_type))); HIP_CHECK( hipMemcpy( d_values_input, values_input.data(), @@ -108,9 +108,9 @@ void run_benchmark(benchmark::State& state, size_t max_length, hipStream_t strea key_type * d_unique_output; value_type * d_aggregates_output; unsigned int * d_unique_count_output; - HIP_CHECK(hipMalloc(&d_unique_output, unique_count * sizeof(key_type))); - HIP_CHECK(hipMalloc(&d_aggregates_output, unique_count * sizeof(value_type))); - HIP_CHECK(hipMalloc(&d_unique_count_output, sizeof(unsigned int))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_unique_output), unique_count * sizeof(key_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_aggregates_output), unique_count * sizeof(value_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_unique_count_output), sizeof(unsigned int))); void * d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; diff --git a/benchmark/benchmark_device_run_length_encode.cpp b/benchmark/benchmark_device_run_length_encode.cpp index a96366c62..282e4a79b 100644 --- a/benchmark/benchmark_device_run_length_encode.cpp +++ b/benchmark/benchmark_device_run_length_encode.cpp @@ -80,7 +80,7 @@ void run_encode_benchmark(benchmark::State& state, size_t max_length, hipStream_ } key_type * d_input; - HIP_CHECK(hipMalloc(&d_input, size * sizeof(key_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * sizeof(key_type))); HIP_CHECK( hipMemcpy( d_input, input.data(), @@ -92,9 +92,9 @@ void run_encode_benchmark(benchmark::State& state, size_t max_length, hipStream_ key_type * d_unique_output; count_type * d_counts_output; count_type * d_runs_count_output; - HIP_CHECK(hipMalloc(&d_unique_output, runs_count * sizeof(key_type))); - HIP_CHECK(hipMalloc(&d_counts_output, runs_count * sizeof(count_type))); - HIP_CHECK(hipMalloc(&d_runs_count_output, sizeof(count_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_unique_output), runs_count * sizeof(key_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_counts_output), runs_count * sizeof(count_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_runs_count_output), sizeof(count_type))); void * d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; @@ -183,7 +183,7 @@ void run_non_trivial_runs_benchmark(benchmark::State& state, size_t max_length, } key_type * d_input; - HIP_CHECK(hipMalloc(&d_input, size * sizeof(key_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * sizeof(key_type))); HIP_CHECK( hipMemcpy( d_input, input.data(), @@ -195,9 +195,9 @@ void run_non_trivial_runs_benchmark(benchmark::State& state, size_t max_length, offset_type * d_offsets_output; count_type * d_counts_output; count_type * d_runs_count_output; - HIP_CHECK(hipMalloc(&d_offsets_output, runs_count * sizeof(offset_type))); - HIP_CHECK(hipMalloc(&d_counts_output, runs_count * sizeof(count_type))); - HIP_CHECK(hipMalloc(&d_runs_count_output, sizeof(count_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_offsets_output), runs_count * sizeof(offset_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_counts_output), runs_count * sizeof(count_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_runs_count_output), sizeof(count_type))); void * d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; diff --git a/benchmark/benchmark_device_scan.cpp b/benchmark/benchmark_device_scan.cpp index b474cd50e..a5086a2e0 100644 --- a/benchmark/benchmark_device_scan.cpp +++ 
b/benchmark/benchmark_device_scan.cpp @@ -117,8 +117,8 @@ void run_benchmark(benchmark::State& state, T initial_value = T(123); T * d_input; T * d_output; - HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); - HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), size * sizeof(T))); HIP_CHECK( hipMemcpy( d_input, input.data(), @@ -186,43 +186,87 @@ void run_benchmark(benchmark::State& state, #ifdef BENCHMARK_CONFIG_TUNING -#define CREATE_BENCHMARK2(EXCL, T, SCAN_OP, BSA, BS, IPT) \ -benchmark::RegisterBenchmark( \ - (std::string(EXCL ? "exclusive_scan" : "inclusive_scan") + \ - ("<" #T ", " #SCAN_OP ", scan_config<" #BS ", " #IPT ", " #BSA "> >")).c_str(), \ - run_benchmark >, size, stream, SCAN_OP() \ -), +template< + bool EXCL, + typename T, typename SCAN_OP, + rocprim::block_scan_algorithm BSA, + unsigned int BS, unsigned int IPT +> +void scan_add_benchmark( + std::vector& benchmarks, + hipStream_t stream, + size_t size) +{ + benchmarks.push_back( + benchmark::RegisterBenchmark( + (std::string(EXCL ? "exclusive_scan" : "inclusive_scan") + "<" + typeid(T).name() + + ", " + typeid(SCAN_OP).name() + ", scan_config<" + std::to_string(BS) + ", " + + std::to_string(IPT) + ", " + std::string(BSA == rocprim::block_scan_algorithm::using_warp_scan + ? "using_warp_scan" : "using_reduce_scan") + "> >").c_str(), + run_benchmark< + EXCL, T, SCAN_OP, + typename rocprim::scan_config< + BS, IPT, true, + rocprim::block_load_method::block_load_transpose, + rocprim::block_store_method::block_store_transpose, BSA + > + >, size, stream, SCAN_OP() + ) + ); +} -#define CREATE_BENCHMARK1(EXCL, T, SCAN_OP, BSA, BS) \ - CREATE_BENCHMARK2(EXCL, T, SCAN_OP, BSA, BS, 1) \ - CREATE_BENCHMARK2(EXCL, T, SCAN_OP, BSA, BS, 2) \ - CREATE_BENCHMARK2(EXCL, T, SCAN_OP, BSA, BS, 3) \ - CREATE_BENCHMARK2(EXCL, T, SCAN_OP, BSA, BS, 4) \ - CREATE_BENCHMARK2(EXCL, T, SCAN_OP, BSA, BS, 5) \ - CREATE_BENCHMARK2(EXCL, T, SCAN_OP, BSA, BS, 6) \ - CREATE_BENCHMARK2(EXCL, T, SCAN_OP, BSA, BS, 7) \ - CREATE_BENCHMARK2(EXCL, T, SCAN_OP, BSA, BS, 8) \ - CREATE_BENCHMARK2(EXCL, T, SCAN_OP, BSA, BS, 9) \ - CREATE_BENCHMARK2(EXCL, T, SCAN_OP, BSA, BS, 10) \ - CREATE_BENCHMARK2(EXCL, T, SCAN_OP, BSA, BS, 11) \ - CREATE_BENCHMARK2(EXCL, T, SCAN_OP, BSA, BS, 12) \ - CREATE_BENCHMARK2(EXCL, T, SCAN_OP, BSA, BS, 13) \ - CREATE_BENCHMARK2(EXCL, T, SCAN_OP, BSA, BS, 14) \ - CREATE_BENCHMARK2(EXCL, T, SCAN_OP, BSA, BS, 15) \ - CREATE_BENCHMARK2(EXCL, T, SCAN_OP, BSA, BS, 16) \ - CREATE_BENCHMARK2(EXCL, T, SCAN_OP, BSA, BS, 17) \ - CREATE_BENCHMARK2(EXCL, T, SCAN_OP, BSA, BS, 18) \ - CREATE_BENCHMARK2(EXCL, T, SCAN_OP, BSA, BS, 19) \ - CREATE_BENCHMARK2(EXCL, T, SCAN_OP, BSA, BS, 20) +template< + bool EXCL, + typename T, typename SCAN_OP, + rocprim::block_scan_algorithm BSA, + unsigned int BS, unsigned int IPT, + unsigned int MaxItemsPerThread +> +auto scan_benchmark_generate_ipt_grid( + std::vector& benchmarks, + hipStream_t stream, + size_t size) + -> typename std::enable_if< IPT == MaxItemsPerThread, void>::type +{ + scan_add_benchmark< + EXCL, T, SCAN_OP, + BSA, BS, IPT + >(benchmarks, stream, size); +} + +template< + bool EXCL, + typename T, typename SCAN_OP, + rocprim::block_scan_algorithm BSA, + unsigned int BS, unsigned int IPT, + unsigned int MaxItemsPerThread +> +auto scan_benchmark_generate_ipt_grid( + std::vector& benchmarks, + hipStream_t stream, + size_t size) + -> typename std::enable_if< IPT < MaxItemsPerThread, 
void>::type +{ + scan_add_benchmark< + EXCL, T, SCAN_OP, + BSA, BS, IPT + >(benchmarks, stream, size); + + scan_benchmark_generate_ipt_grid< + EXCL, T, SCAN_OP, + BSA, BS, IPT + 1, + MaxItemsPerThread + >(benchmarks, stream, size); +} constexpr rocprim::block_scan_algorithm using_warp_scan = rocprim::block_scan_algorithm::using_warp_scan; constexpr rocprim::block_scan_algorithm reduce_then_scan = rocprim::block_scan_algorithm::reduce_then_scan; -#define CREATE_BENCHMARK(EXCL, T, SCAN_OP) \ - CREATE_BENCHMARK1(EXCL, T, SCAN_OP, using_warp_scan, 64) \ - CREATE_BENCHMARK1(EXCL, T, SCAN_OP, using_warp_scan, 128) \ - CREATE_BENCHMARK1(EXCL, T, SCAN_OP, using_warp_scan, 256) \ - CREATE_BENCHMARK1(EXCL, T, SCAN_OP, reduce_then_scan, 256) +#define CREATE_BENCHMARK(EXCL, T, SCAN_OP, MIPT) \ + scan_benchmark_generate_ipt_grid(benchmarks, stream, size); \ + scan_benchmark_generate_ipt_grid(benchmarks, stream, size); \ + scan_benchmark_generate_ipt_grid(benchmarks, stream, size); \ + scan_benchmark_generate_ipt_grid(benchmarks, stream, size); #else // BENCHMARK_CONFIG_TUNING @@ -256,48 +300,77 @@ int main(int argc, char *argv[]) std::cout << "[HIP] Device name: " << devProp.name << std::endl; using custom_double2 = custom_type; - using custom_float2 = custom_type; - // Compilation may never finish, if the compiler needs to compile too many kernels, - // it is recommended to compile benchmarks only for 1-2 types when BENCHMARK_CONFIG_TUNING is used - // (all other CREATE_*_BENCHMARK should be commented/removed). +#ifndef BENCHMARK_CONFIG_TUNING + using custom_float2 = custom_type; +#endif // Add benchmarks - std::vector benchmarks = - { - CREATE_BENCHMARK(false, int, rocprim::plus) - CREATE_BENCHMARK(true, int, rocprim::plus) + #ifdef BENCHMARK_CONFIG_TUNING + // Compilation may never finish, if the compiler needs to compile too many kernels, + // it is recommended to compile benchmarks only for 1-2 types when BENCHMARK_CONFIG_TUNING is used + // (all other CREATE_BENCHMARK should be commented/removed). 
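For reference, the items-per-thread grid above is generated with two mutually exclusive std::enable_if overloads: the recursive overload registers the benchmark for the current IPT and then instantiates itself for IPT + 1, while the terminating overload stops the recursion at MaxItemsPerThread. A minimal, self-contained sketch of the same pattern follows; the names in it are illustrative only and are not part of the patch.

    #include <iostream>
    #include <type_traits>

    // Terminating case: IPT has reached MaxIpt, register the last configuration.
    template<unsigned int IPT, unsigned int MaxIpt>
    auto register_ipt_grid() -> typename std::enable_if<IPT == MaxIpt, void>::type
    {
        std::cout << "register config with items_per_thread = " << IPT << '\n';
    }

    // Recursive case: register the current IPT, then instantiate IPT + 1.
    template<unsigned int IPT, unsigned int MaxIpt>
    auto register_ipt_grid() -> typename std::enable_if<(IPT < MaxIpt), void>::type
    {
        std::cout << "register config with items_per_thread = " << IPT << '\n';
        register_ipt_grid<IPT + 1, MaxIpt>();
    }

    int main()
    {
        register_ipt_grid<1, 4>(); // expands to items_per_thread = 1, 2, 3, 4
    }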
+ + std::vector benchmarks; + + CREATE_BENCHMARK(false, int, rocprim::plus, 20) + CREATE_BENCHMARK(true, int, rocprim::plus, 20) + + CREATE_BENCHMARK(false, float, rocprim::plus, 20) + CREATE_BENCHMARK(true, float, rocprim::plus, 20) + + CREATE_BENCHMARK(false, double, rocprim::plus, 15) + CREATE_BENCHMARK(true, double, rocprim::plus, 15) + + CREATE_BENCHMARK(false, long long, rocprim::plus, 15) + CREATE_BENCHMARK(true, long long, rocprim::plus, 15) + + CREATE_BENCHMARK(false, custom_double2, rocprim::plus, 15) + CREATE_BENCHMARK(true, custom_double2, rocprim::plus, 15) + + CREATE_BENCHMARK(false, int8_t, rocprim::plus, 20) + CREATE_BENCHMARK(true, int8_t, rocprim::plus, 20) + + CREATE_BENCHMARK(false, rocprim::half, rocprim::plus, 30) + CREATE_BENCHMARK(true, rocprim::half, rocprim::plus, 30) + #else + std::vector benchmarks = + { + CREATE_BENCHMARK(false, int, rocprim::plus) + CREATE_BENCHMARK(true, int, rocprim::plus) + + CREATE_BENCHMARK(false, float, rocprim::plus) + CREATE_BENCHMARK(true, float, rocprim::plus) - CREATE_BENCHMARK(false, float, rocprim::plus) - CREATE_BENCHMARK(true, float, rocprim::plus) + CREATE_BENCHMARK(false, double, rocprim::plus) + CREATE_BENCHMARK(true, double, rocprim::plus) - CREATE_BENCHMARK(false, double, rocprim::plus) - CREATE_BENCHMARK(true, double, rocprim::plus) + CREATE_BENCHMARK(false, long long, rocprim::plus) + CREATE_BENCHMARK(true, long long, rocprim::plus) - CREATE_BENCHMARK(false, long long, rocprim::plus) - CREATE_BENCHMARK(true, long long, rocprim::plus) + CREATE_BENCHMARK(false, float2, rocprim::plus) + CREATE_BENCHMARK(true, float2, rocprim::plus) - CREATE_BENCHMARK(false, float2, rocprim::plus) - CREATE_BENCHMARK(true, float2, rocprim::plus) + CREATE_BENCHMARK(false, custom_float2, rocprim::plus) + CREATE_BENCHMARK(true, custom_float2, rocprim::plus) - CREATE_BENCHMARK(false, custom_float2, rocprim::plus) - CREATE_BENCHMARK(true, custom_float2, rocprim::plus) + CREATE_BENCHMARK(false, double2, rocprim::plus) + CREATE_BENCHMARK(true, double2, rocprim::plus) - CREATE_BENCHMARK(false, double2, rocprim::plus) - CREATE_BENCHMARK(true, double2, rocprim::plus) + CREATE_BENCHMARK(false, custom_double2, rocprim::plus) + CREATE_BENCHMARK(true, custom_double2, rocprim::plus) - CREATE_BENCHMARK(false, custom_double2, rocprim::plus) - CREATE_BENCHMARK(true, custom_double2, rocprim::plus) + CREATE_BENCHMARK(false, int8_t, rocprim::plus) + CREATE_BENCHMARK(true, int8_t, rocprim::plus) - CREATE_BENCHMARK(false, int8_t, rocprim::plus) - CREATE_BENCHMARK(true, int8_t, rocprim::plus) + CREATE_BENCHMARK(false, uint8_t, rocprim::plus) + CREATE_BENCHMARK(true, uint8_t, rocprim::plus) - CREATE_BENCHMARK(false, uint8_t, rocprim::plus) - CREATE_BENCHMARK(true, uint8_t, rocprim::plus) + CREATE_BENCHMARK(false, rocprim::half, rocprim::plus) + CREATE_BENCHMARK(true, rocprim::half, rocprim::plus) + }; + #endif - CREATE_BENCHMARK(false, rocprim::half, rocprim::plus) - CREATE_BENCHMARK(true, rocprim::half, rocprim::plus) - }; // Use manual timing for(auto& b : benchmarks) diff --git a/benchmark/benchmark_device_segmented_radix_sort.cpp b/benchmark/benchmark_device_segmented_radix_sort.cpp index 5bcd31775..9d1801563 100644 --- a/benchmark/benchmark_device_segmented_radix_sort.cpp +++ b/benchmark/benchmark_device_segmented_radix_sort.cpp @@ -101,7 +101,7 @@ void run_sort_keys_benchmark(benchmark::State& state, } offset_type * d_offsets; - HIP_CHECK(hipMalloc(&d_offsets, (segments_count + 1) * sizeof(offset_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_offsets), 
(segments_count + 1) * sizeof(offset_type))); HIP_CHECK( hipMemcpy( d_offsets, offsets.data(), @@ -112,8 +112,8 @@ void run_sort_keys_benchmark(benchmark::State& state, key_type * d_keys_input; key_type * d_keys_output; - HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); - HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_input), size * sizeof(key_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_output), size * sizeof(key_type))); HIP_CHECK( hipMemcpy( d_keys_input, keys_input.data(), @@ -232,7 +232,7 @@ void run_sort_pairs_benchmark(benchmark::State& state, std::iota(values_input.begin(), values_input.end(), 0); offset_type * d_offsets; - HIP_CHECK(hipMalloc(&d_offsets, (segments_count + 1) * sizeof(offset_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_offsets), (segments_count + 1) * sizeof(offset_type))); HIP_CHECK( hipMemcpy( d_offsets, offsets.data(), @@ -243,8 +243,8 @@ void run_sort_pairs_benchmark(benchmark::State& state, key_type * d_keys_input; key_type * d_keys_output; - HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); - HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_input), size * sizeof(key_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_output), size * sizeof(key_type))); HIP_CHECK( hipMemcpy( d_keys_input, keys_input.data(), @@ -255,8 +255,8 @@ void run_sort_pairs_benchmark(benchmark::State& state, value_type * d_values_input; value_type * d_values_output; - HIP_CHECK(hipMalloc(&d_values_input, size * sizeof(value_type))); - HIP_CHECK(hipMalloc(&d_values_output, size * sizeof(value_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_values_input), size * sizeof(value_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_values_output), size * sizeof(value_type))); HIP_CHECK( hipMemcpy( d_values_input, values_input.data(), diff --git a/benchmark/benchmark_device_segmented_reduce.cpp b/benchmark/benchmark_device_segmented_reduce.cpp index 557e18619..b3038ed7a 100644 --- a/benchmark/benchmark_device_segmented_reduce.cpp +++ b/benchmark/benchmark_device_segmented_reduce.cpp @@ -86,7 +86,7 @@ void run_benchmark(benchmark::State& state, size_t desired_segments, hipStream_t std::iota(values_input.begin(), values_input.end(), 0); offset_type * d_offsets; - HIP_CHECK(hipMalloc(&d_offsets, (segments_count + 1) * sizeof(offset_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_offsets), (segments_count + 1) * sizeof(offset_type))); HIP_CHECK( hipMemcpy( d_offsets, offsets.data(), @@ -96,7 +96,7 @@ void run_benchmark(benchmark::State& state, size_t desired_segments, hipStream_t ); value_type * d_values_input; - HIP_CHECK(hipMalloc(&d_values_input, size * sizeof(value_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_values_input), size * sizeof(value_type))); HIP_CHECK( hipMemcpy( d_values_input, values_input.data(), @@ -106,7 +106,7 @@ void run_benchmark(benchmark::State& state, size_t desired_segments, hipStream_t ); value_type * d_aggregates_output; - HIP_CHECK(hipMalloc(&d_aggregates_output, segments_count * sizeof(value_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_aggregates_output), segments_count * sizeof(value_type))); rocprim::plus reduce_op; value_type init(0); diff --git a/benchmark/benchmark_device_select.cpp b/benchmark/benchmark_device_select.cpp index 7a375909d..6a8552862 100644 --- a/benchmark/benchmark_device_select.cpp +++ b/benchmark/benchmark_device_select.cpp @@ -79,10 +79,10 @@ void 
run_flagged_benchmark(benchmark::State& state, FlagType * d_flags; T * d_output; unsigned int * d_selected_count_output; - HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T))); - HIP_CHECK(hipMalloc(&d_flags, flags.size() * sizeof(FlagType))); - HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(T))); - HIP_CHECK(hipMalloc(&d_selected_count_output, sizeof(unsigned int))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), input.size() * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_flags), flags.size() * sizeof(FlagType))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), input.size() * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_selected_count_output), sizeof(unsigned int))); HIP_CHECK( hipMemcpy( d_input, input.data(), @@ -188,9 +188,9 @@ void run_selectop_benchmark(benchmark::State& state, T * d_input; T * d_output; unsigned int * d_selected_count_output; - HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T))); - HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(T))); - HIP_CHECK(hipMalloc(&d_selected_count_output, sizeof(unsigned int))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), input.size() * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), input.size() * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_selected_count_output), sizeof(unsigned int))); HIP_CHECK( hipMemcpy( d_input, input.data(), @@ -296,9 +296,9 @@ void run_unique_benchmark(benchmark::State& state, T * d_input; T * d_output; unsigned int * d_selected_count_output; - HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T))); - HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(T))); - HIP_CHECK(hipMalloc(&d_selected_count_output, sizeof(unsigned int))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), input.size() * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), input.size() * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_selected_count_output), sizeof(unsigned int))); HIP_CHECK( hipMemcpy( d_input, input.data(), diff --git a/benchmark/benchmark_device_transform.cpp b/benchmark/benchmark_device_transform.cpp index f26384df3..88c0d5499 100644 --- a/benchmark/benchmark_device_transform.cpp +++ b/benchmark/benchmark_device_transform.cpp @@ -79,8 +79,8 @@ void run_benchmark(benchmark::State& state, T * d_input; T * d_output; - HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); - HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), size * sizeof(T))); HIP_CHECK( hipMemcpy( d_input, input.data(), diff --git a/benchmark/benchmark_utils.hpp b/benchmark/benchmark_utils.hpp index 9868859ca..57b28eb91 100644 --- a/benchmark/benchmark_utils.hpp +++ b/benchmark/benchmark_utils.hpp @@ -40,13 +40,13 @@ // Support half operators on host side ROCPRIM_HOST inline -_Float16 half_to_native(const rocprim::half& x) +rocprim::native_half half_to_native(const rocprim::half& x) { - return *reinterpret_cast(&x); + return *reinterpret_cast(&x); } ROCPRIM_HOST inline -rocprim::half native_to_half(const _Float16& x) +rocprim::half native_to_half(const rocprim::native_half& x) { return *reinterpret_cast(&x); } @@ -90,15 +90,38 @@ struct half_equal_to } }; +// std::uniform_int_distribution is undefined for anything other than listed +// https://en.cppreference.com/w/cpp/numeric/random/uniform_int_distribution +template +struct is_valid_for_int_distribution : + std::integral_constant::value || + std::is_same::value || + std::is_same::value || + 
std::is_same::value || + std::is_same::value || + std::is_same::value || + std::is_same::value || + std::is_same::value + > {}; + +using engine_type = std::default_random_engine; + // get_random_data() generates only part of sequence and replicates it, // because benchmarks usually do not need "true" random sequence. -template -inline auto get_random_data(size_t size, T min, T max, size_t max_random_size = 1024 * 1024) +template +inline auto get_random_data(size_t size, U min, V max, size_t max_random_size = 1024 * 1024) -> typename std::enable_if::value, std::vector>::type { - std::random_device rd; - std::default_random_engine gen(rd()); - std::uniform_int_distribution distribution(min, max); + engine_type gen{std::random_device{}()}; + using dis_type = typename std::conditional< + is_valid_for_int_distribution::value, + T, + typename std::conditional::value, + int, + unsigned int>::type + >::type; + std::uniform_int_distribution distribution((T)min, (T)max); std::vector data(size); std::generate( data.begin(), data.begin() + std::min(size, max_random_size), @@ -111,14 +134,14 @@ inline auto get_random_data(size_t size, T min, T max, size_t max_random_size = return data; } -template -inline auto get_random_data(size_t size, T min, T max, size_t max_random_size = 1024 * 1024) +template +inline auto get_random_data(size_t size, U min, V max, size_t max_random_size = 1024 * 1024) -> typename std::enable_if::value, std::vector>::type { - std::random_device rd; - std::default_random_engine gen(rd()); + engine_type gen{std::random_device{}()}; + // Generate floats when T is half using dis_type = typename std::conditional::value, float, T>::type; - std::uniform_real_distribution distribution(min, max); + std::uniform_real_distribution distribution((dis_type)min, (dis_type)max); std::vector data(size); std::generate( data.begin(), data.begin() + std::min(size, max_random_size), @@ -134,8 +157,7 @@ inline auto get_random_data(size_t size, T min, T max, size_t max_random_size = template inline std::vector get_random_data01(size_t size, float p, size_t max_random_size = 1024 * 1024) { - std::random_device rd; - std::default_random_engine gen(rd()); + engine_type gen{std::random_device{}()}; std::bernoulli_distribution distribution(p); std::vector data(size); std::generate( diff --git a/benchmark/benchmark_warp_reduce.cpp b/benchmark/benchmark_warp_reduce.cpp index 9753ea787..7e996c6bc 100644 --- a/benchmark/benchmark_warp_reduce.cpp +++ b/benchmark/benchmark_warp_reduce.cpp @@ -63,13 +63,13 @@ __global__ __launch_bounds__(ROCPRIM_DEFAULT_MAX_BLOCK_SIZE) void warp_reduce_kernel(const T * d_input, T * d_output) { - const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + const unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; auto value = d_input[i]; using wreduce_t = rocprim::warp_reduce; __shared__ typename wreduce_t::storage_type storage; - #pragma nounroll + ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { wreduce_t().reduce(value, value, storage); @@ -88,14 +88,14 @@ __global__ __launch_bounds__(ROCPRIM_DEFAULT_MAX_BLOCK_SIZE) void segmented_warp_reduce_kernel(const T* d_input, Flag* d_flags, T* d_output) { - const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + const unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; auto value = d_input[i]; auto flag = d_flags[i]; using wreduce_t = rocprim::warp_reduce; __shared__ typename wreduce_t::storage_type storage; - #pragma nounroll + ROCPRIM_NO_UNROLL for(unsigned int trial = 0; 
trial < Trials; trial++) { wreduce_t().head_segmented_reduce(value, value, flag, storage); @@ -167,9 +167,9 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) T * d_input; flag_type * d_flags; T * d_output; - HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); - HIP_CHECK(hipMalloc(&d_flags, size * sizeof(flag_type))); - HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_flags), size * sizeof(flag_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), size * sizeof(T))); HIP_CHECK( hipMemcpy( d_input, input.data(), diff --git a/benchmark/benchmark_warp_scan.cpp b/benchmark/benchmark_warp_scan.cpp index c36cdd1a9..97a53083e 100644 --- a/benchmark/benchmark_warp_scan.cpp +++ b/benchmark/benchmark_warp_scan.cpp @@ -59,12 +59,12 @@ __global__ __launch_bounds__(ROCPRIM_DEFAULT_MAX_BLOCK_SIZE) void warp_inclusive_scan_kernel(const T* input, T* output) { - const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + const unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; auto value = input[i]; using wscan_t = rp::warp_scan; __shared__ typename wscan_t::storage_type storage; - #pragma nounroll + ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { wscan_t().inclusive_scan(value, value, storage); @@ -78,12 +78,12 @@ __global__ __launch_bounds__(ROCPRIM_DEFAULT_MAX_BLOCK_SIZE) void warp_exclusive_scan_kernel(const T* input, T* output, const T init) { - const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + const unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; auto value = input[i]; using wscan_t = rp::warp_scan; __shared__ typename wscan_t::storage_type storage; - #pragma nounroll + ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { wscan_t().exclusive_scan(value, value, init, storage); @@ -104,11 +104,11 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t size) // Make sure size is a multiple of BlockSize size = BlockSize * ((size + BlockSize - 1)/BlockSize); // Allocate and fill memory - std::vector input(size, 1.0f); + std::vector input(size, (T)1); T * d_input; T * d_output; - HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); - HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), size * sizeof(T))); HIP_CHECK( hipMemcpy( d_input, input.data(), diff --git a/benchmark/benchmark_warp_sort.cpp b/benchmark/benchmark_warp_sort.cpp index 0c611dfeb..7c491bdf7 100644 --- a/benchmark/benchmark_warp_sort.cpp +++ b/benchmark/benchmark_warp_sort.cpp @@ -59,11 +59,11 @@ __global__ __launch_bounds__(ROCPRIM_DEFAULT_MAX_BLOCK_SIZE) void warp_sort_kernel(K* input_key) { - const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + const unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; auto key = input_key[i]; rp::warp_sort wsort; - #pragma nounroll + ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { wsort.sort(key); @@ -76,12 +76,12 @@ __global__ __launch_bounds__(ROCPRIM_DEFAULT_MAX_BLOCK_SIZE) void warp_sort_by_key_kernel(K* input_key, V* input_value) { - const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + const unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; auto key = input_key[i]; auto value = input_value[i]; rp::warp_sort wsort; - #pragma nounroll + ROCPRIM_NO_UNROLL 
for(unsigned int trial = 0; trial < Trials; trial++) { wsort.sort(key, value); @@ -121,13 +121,13 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t size) // Make sure size is a multiple of BlockSize size = BlockSize * ((size + BlockSize - 1)/BlockSize); // Allocate and fill memory - std::vector input_key = get_random_data(size, Key(0), get_max_value()); + std::vector input_key = get_random_data(size, 0, get_max_value()); std::vector input_value(size_t(1)); - if(SortByKey) input_value = get_random_data(size, Value(0), get_max_value()); + if(SortByKey) input_value = get_random_data(size, 0, get_max_value()); Key * d_input_key = nullptr; Value * d_input_value = nullptr; - HIP_CHECK(hipMalloc(&d_input_key, size * sizeof(Key))); - if(SortByKey) HIP_CHECK(hipMalloc(&d_input_value, size * sizeof(Value))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_input_key), size * sizeof(Key))); + if(SortByKey) HIP_CHECK(hipMalloc(reinterpret_cast(&d_input_value), size * sizeof(Value))); HIP_CHECK( hipMemcpy( d_input_key, input_key.data(), diff --git a/benchmark/cmdparser.hpp b/benchmark/cmdparser.hpp index a502c77be..0dfc73ca2 100644 --- a/benchmark/cmdparser.hpp +++ b/benchmark/cmdparser.hpp @@ -186,6 +186,13 @@ namespace cli { return std::stoul(elements[0]); } + static unsigned long long parse(const std::vector& elements, const unsigned long long&) { + if (elements.size() != 1) + throw std::bad_cast(); + + return std::stoull(elements[0]); + } + static long parse(const std::vector& elements, const long&) { if (elements.size() != 1) throw std::bad_cast(); diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index ee9e037fe..9a3dadfe6 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -43,50 +43,138 @@ if (NOT Git_FOUND) message(FATAL_ERROR "Please ensure Git is installed on the system") endif() +if(USE_HIP_CPU) + find_package(Threads REQUIRED) + + set(CMAKE_REQUIRED_FLAGS "-std=c++17") + include(CheckCXXSymbolExists) + check_cxx_symbol_exists(__GLIBCXX__ "cstddef" STL_IS_GLIBCXX) + set(STL_DEPENDS_ON_TBB ${STL_IS_GLIBCXX}) + if(STL_DEPENDS_ON_TBB) + if(NOT DEPENDENCIES_FORCE_DOWNLOAD) + # TBB (https://github.com/oneapi-src/oneTBB) + find_package(TBB QUIET) + endif() + + if(NOT TBB_FOUND) + message(STATUS "TBB not found or force download TBB on. 
Downloading and building TBB.") + if(CMAKE_CONFIGURATION_TYPES) + message(FATAL_ERROR "DownloadProject.cmake doesn't support multi-configuration generators.") + endif() + set(TBB_ROOT ${CMAKE_CURRENT_BINARY_DIR}/deps/tbb CACHE PATH "") + download_project( + PROJ tbb + GIT_REPOSITORY https://github.com/oneapi-src/oneTBB.git + GIT_TAG v2020.3 + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + UPDATE_DISCONNECTED TRUE # Never update automatically from the remote repository + ) + #ExternalProject_Get_Property(tbb SOURCE_DIR) + set(TBB_SOURCE_DIR "${CMAKE_BINARY_DIR}/tbb-src") + list(APPEND CMAKE_MODULE_PATH "${TBB_SOURCE_DIR}/cmake") + include(TBBBuild) + tbb_build(TBB_ROOT "${TBB_SOURCE_DIR}" CONFIG_DIR TBB_CONFIG_DIR MAKE_ARGS tbb_build_dir=${TBB_ROOT}) + endif() + find_package(TBB REQUIRED CONFIG PATHS ${TBB_CONFIG_DIR} NO_DEFAULT_PATH) + endif(STL_DEPENDS_ON_TBB) + + if(NOT DEPENDENCIES_FORCE_DOWNLOAD) + # HIP CPU Runtime (https://github.com/ROCm-Developer-Tools/HIP-CPU) + find_package(hip_cpu_rt QUIET) + endif() + + if(NOT hip_cpu_rt_FOUND) + message(STATUS "Downloading and building HIP CPU Runtime.") + set(HIP_CPU_ROOT "${CMAKE_CURRENT_BINARY_DIR}/deps/hip-cpu" CACHE PATH "") + download_project( + PROJ hip-cpu + GIT_REPOSITORY https://github.com/ROCm-Developer-Tools/HIP-CPU.git + GIT_TAG master + INSTALL_DIR "${HIP_CPU_ROOT}" + CMAKE_ARGS -Dhip_cpu_rt_BUILD_EXAMPLES=OFF -Dhip_cpu_rt_BUILD_TESTING=OFF -DCMAKE_PREFIX_PATH=${TBB_CONFIG_DIR} -DCMAKE_INSTALL_PREFIX= + LOG_DOWNLOAD TRUE + LOG_CONFIGURE TRUE + LOG_BUILD TRUE + LOG_INSTALL TRUE + BUILD_PROJECT TRUE + UPDATE_DISCONNECTED TRUE # Never update automatically from the remote repository + ) + endif() + find_package(hip_cpu_rt REQUIRED CONFIG PATHS ${HIP_CPU_ROOT}) +endif() + # Test dependencies if(BUILD_TEST) - # Google Test (https://github.com/google/googletest) - message(STATUS "Downloading and building GTest.") - set(GTEST_ROOT ${CMAKE_CURRENT_BINARY_DIR}/gtest CACHE PATH "") - download_project( - PROJ googletest - GIT_REPOSITORY https://github.com/google/googletest.git - GIT_TAG release-1.10.0 - INSTALL_DIR ${GTEST_ROOT} - CMAKE_ARGS -DBUILD_GTEST=ON -DINSTALL_GTEST=ON -Dgtest_force_shared_crt=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_INSTALL_PREFIX= - LOG_DOWNLOAD TRUE - LOG_CONFIGURE TRUE - LOG_BUILD TRUE - LOG_INSTALL TRUE - BUILD_PROJECT TRUE - ${UPDATE_DISCONNECTED_IF_AVAILABLE} - ) - find_package(GTest REQUIRED) + # NOTE: Google Test has created a mess with legacy FindGTest.cmake and newer GTestConfig.cmake + # + # FindGTest.cmake defines: GTest::GTest, GTest::Main, GTEST_FOUND + # + # GTestConfig.cmake defines: GTest::gtest, GTest::gtest_main, GTest::gmock, GTest::gmock_main + # + # NOTE2: Finding GTest in MODULE mode, one cannot invoke find_package in CONFIG mode, because targets + # will be duplicately defined. + if(NOT DEPENDENCIES_FORCE_DOWNLOAD) + # Google Test (https://github.com/google/googletest) + find_package(GTest QUIET) + endif() + + if(NOT TARGET GTest::GTest AND NOT TARGET GTest::gtest) + message(STATUS "GTest not found or force download GTest on. 
Downloading and building GTest.") + if(CMAKE_CONFIGURATION_TYPES) + message(FATAL_ERROR "DownloadProject.cmake doesn't support multi-configuration generators.") + endif() + set(GTEST_ROOT ${CMAKE_CURRENT_BINARY_DIR}/deps/gtest CACHE PATH "") + download_project( + PROJ googletest + GIT_REPOSITORY https://github.com/google/googletest.git + GIT_TAG release-1.10.0 + INSTALL_DIR ${GTEST_ROOT} + CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS} -DBUILD_GTEST=ON -DINSTALL_GTEST=ON -Dgtest_force_shared_crt=ON -DCMAKE_INSTALL_PREFIX= + LOG_DOWNLOAD TRUE + LOG_CONFIGURE TRUE + LOG_BUILD TRUE + LOG_INSTALL TRUE + BUILD_PROJECT TRUE + UPDATE_DISCONNECTED TRUE # Never update automatically from the remote repository + ) + find_package(GTest CONFIG REQUIRED PATHS ${GTEST_ROOT}) + endif() endif() # Benchmark dependencies if(BUILD_BENCHMARK) - # Google Benchmark (https://github.com/google/benchmark.git) - message(STATUS "Downloading and building Google Benchmark.") - if(CMAKE_CXX_COMPILER MATCHES ".*/hipcc$") - # hip-clang cannot compile googlebenchmark for some reason - set(COMPILER_OVERRIDE "-DCMAKE_CXX_COMPILER=g++") + if(NOT DEPENDENCIES_FORCE_DOWNLOAD) + # Google Benchmark (https://github.com/google/benchmark.git) + find_package(benchmark QUIET) + endif() + + if(NOT benchmark_FOUND) + message(STATUS "Google Benchmark not found or force download Google Benchmark on. Downloading and building Google Benchmark.") + if(CMAKE_CONFIGURATION_TYPES) + message(FATAL_ERROR "DownloadProject.cmake doesn't support multi-configuration generators.") + endif() + set(GOOGLEBENCHMARK_ROOT ${CMAKE_CURRENT_BINARY_DIR}/deps/googlebenchmark CACHE PATH "") + if(CMAKE_CXX_COMPILER MATCHES ".*/hipcc$") + # hip-clang cannot compile googlebenchmark for some reason + set(COMPILER_OVERRIDE "-DCMAKE_CXX_COMPILER=g++") + endif() + + download_project( + PROJ googlebenchmark + GIT_REPOSITORY https://github.com/google/benchmark.git + GIT_TAG v1.4.0 + INSTALL_DIR ${GOOGLEBENCHMARK_ROOT} + CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS} -DBENCHMARK_ENABLE_TESTING=OFF -DCMAKE_INSTALL_PREFIX= ${COMPILER_OVERRIDE} + LOG_DOWNLOAD TRUE + LOG_CONFIGURE TRUE + LOG_BUILD TRUE + LOG_INSTALL TRUE + BUILD_PROJECT TRUE + ${UPDATE_DISCONNECTED_IF_AVAILABLE} + ) endif() - # Download, build and install googlebenchmark library - set(GOOGLEBENCHMARK_ROOT ${CMAKE_CURRENT_BINARY_DIR}/googlebenchmark CACHE PATH "") - download_project( - PROJ googlebenchmark - GIT_REPOSITORY https://github.com/google/benchmark.git - GIT_TAG v1.4.0 - INSTALL_DIR ${GOOGLEBENCHMARK_ROOT} - CMAKE_ARGS -DCMAKE_BUILD_TYPE=RELEASE -DBENCHMARK_ENABLE_TESTING=OFF -DBUILD_SHARED_LIBS=ON -DCMAKE_INSTALL_PREFIX= ${COMPILER_OVERRIDE} - LOG_DOWNLOAD TRUE - LOG_CONFIGURE TRUE - LOG_BUILD TRUE - LOG_INSTALL TRUE - BUILD_PROJECT TRUE - ${UPDATE_DISCONNECTED_IF_AVAILABLE} - ) find_package(benchmark REQUIRED CONFIG PATHS ${GOOGLEBENCHMARK_ROOT}) endif() diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake index 19844cbb7..9d7e9accf 100644 --- a/cmake/Summary.cmake +++ b/cmake/Summary.cmake @@ -25,14 +25,18 @@ function(print_configuration_summary) message(STATUS "******** Summary ********") message(STATUS "General:") message(STATUS " System : ${CMAKE_SYSTEM_NAME}") - message(STATUS " HIP ROOT : ${HIP_ROOT_DIR}") + if(NOT USE_HIP_CPU) + message(STATUS " HIP ROOT : ${HIP_ROOT_DIR}") + endif() message(STATUS " C++ compiler : ${CMAKE_CXX_COMPILER}") message(STATUS " C++ compiler version : 
${CMAKE_CXX_COMPILER_VERSION}") string(STRIP "${CMAKE_CXX_FLAGS}" CMAKE_CXX_FLAGS_STRIP) message(STATUS " CXX flags : ${CMAKE_CXX_FLAGS_STRIP}") message(STATUS " Build type : ${CMAKE_BUILD_TYPE}") message(STATUS " Install prefix : ${CMAKE_INSTALL_PREFIX}") - message(STATUS " Device targets : ${AMDGPU_TARGETS}") + if(NOT USE_HIP_CPU) + message(STATUS " Device targets : ${AMDGPU_TARGETS}") + endif() message(STATUS "") message(STATUS " DISABLE_WERROR : ${DISABLE_WERROR}") message(STATUS " ONLY_INSTALL : ${ONLY_INSTALL}") @@ -40,4 +44,5 @@ function(print_configuration_summary) message(STATUS " BUILD_BENCHMARK : ${BUILD_BENCHMARK}") message(STATUS " BUILD_EXAMPLE : ${BUILD_EXAMPLE}") message(STATUS " BUILD_ADDRESS_SANITIZER : ${BUILD_ADDRESS_SANITIZER}") + message(STATUS " USE_HIP_CPU : ${USE_HIP_CPU}") endfunction() diff --git a/example/example_temporary_storage.cpp b/example/example_temporary_storage.cpp index 3cf61a70a..9c523dbec 100644 --- a/example/example_temporary_storage.cpp +++ b/example/example_temporary_storage.cpp @@ -37,7 +37,7 @@ __launch_bounds__(BlockSize) void example_shared_memory(const T *input, T *output) { // Indexing for this block - unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; + unsigned int index = (blockIdx.x * BlockSize) + threadIdx.x; // Allocating storage in shared memory for the block using block_scan_type = rocprim::block_scan; @@ -137,7 +137,7 @@ void example_union_storage_types(const T *input, T *output) } storage; constexpr int items_per_block = BlockSize * ItemsPerThread; - int block_offset = (hipBlockIdx_x * items_per_block); + int block_offset = (blockIdx.x * items_per_block); // Input/output array for block scan primitive T values[ItemsPerThread]; @@ -226,7 +226,7 @@ __launch_bounds__(BlockSize) void example_dynamic_shared_memory(const T *input, T *output) { // Indexing for this block - unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; + unsigned int index = (blockIdx.x * BlockSize) + threadIdx.x; // Initialize primitives using block_scan_type = rocprim::block_scan; @@ -310,7 +310,7 @@ void example_global_memory_storage( typename rocprim::block_scan::storage_type *global_storage) { // Indexing for this block - unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; + unsigned int index = (blockIdx.x * BlockSize) + threadIdx.x; // specialize block_scan for type T and block of 256 threads using block_scan_type = rocprim::block_scan; // Variables required for performing a scan @@ -322,7 +322,7 @@ void example_global_memory_storage( block_scan_type() .inclusive_scan( input_value, output_value, - global_storage[hipBlockIdx_x], + global_storage[blockIdx.x], rocprim::plus() ); diff --git a/rocprim/include/rocprim/block/block_histogram.hpp b/rocprim/include/rocprim/block/block_histogram.hpp index 3d7df08bb..4684967a4 100644 --- a/rocprim/include/rocprim/block/block_histogram.hpp +++ b/rocprim/include/rocprim/block/block_histogram.hpp @@ -161,7 +161,7 @@ class block_histogram { const auto flat_tid = ::rocprim::flat_block_thread_id(); - #pragma unroll + ROCPRIM_UNROLL for(unsigned int offset = 0; offset < Bins; offset += BlockSize) { const unsigned int offset_tid = offset + flat_tid; diff --git a/rocprim/include/rocprim/block/block_load.hpp b/rocprim/include/rocprim/block/block_load.hpp index 856b05892..d560a9ee8 100644 --- a/rocprim/include/rocprim/block/block_load.hpp +++ b/rocprim/include/rocprim/block/block_load.hpp @@ -110,7 +110,7 @@ enum class block_load_method /// \code{.cpp} /// __global__ void 
example_kernel(int * input, ...) /// { -/// const int offset = hipBlockIdx_x * 128 * 8; +/// const int offset = blockIdx.x * 128 * 8; /// int items[8]; /// rocprim::block_load blockload; /// blockload.load(input + offset, items); @@ -403,10 +403,10 @@ class block_load(); - block_load_direct_blocked_vectorized(flat_id, block_input, items); + block_load_direct_blocked_vectorized(flat_id, block_input, _items); } template diff --git a/rocprim/include/rocprim/block/block_load_func.hpp b/rocprim/include/rocprim/block/block_load_func.hpp index bf87c16b8..2a50d073f 100644 --- a/rocprim/include/rocprim/block/block_load_func.hpp +++ b/rocprim/include/rocprim/block/block_load_func.hpp @@ -61,7 +61,7 @@ void block_load_direct_blocked(unsigned int flat_id, { unsigned int offset = flat_id * ItemsPerThread; InputIterator thread_iter = block_input + offset; - #pragma unroll + ROCPRIM_UNROLL for (unsigned int item = 0; item < ItemsPerThread; item++) { items[item] = thread_iter[item]; @@ -98,7 +98,7 @@ void block_load_direct_blocked(unsigned int flat_id, { unsigned int offset = flat_id * ItemsPerThread; InputIterator thread_iter = block_input + offset; - #pragma unroll + ROCPRIM_UNROLL for (unsigned int item = 0; item < ItemsPerThread; item++) { if (item + offset < valid) @@ -141,11 +141,12 @@ void block_load_direct_blocked(unsigned int flat_id, unsigned int valid, Default out_of_bounds) { - #pragma unroll + ROCPRIM_UNROLL for (unsigned int item = 0; item < ItemsPerThread; item++) { - items[item] = out_of_bounds; + items[item] = static_cast(out_of_bounds); } + // TODO: Consider using std::fill for HIP-CPU, as uses memset() where appropriate block_load_direct_blocked(flat_id, block_input, items, valid); } @@ -181,10 +182,10 @@ template< unsigned int ItemsPerThread > ROCPRIM_DEVICE inline -typename std::enable_if()>::type +auto block_load_direct_blocked_vectorized(unsigned int flat_id, T* block_input, - U (&items)[ItemsPerThread]) + U (&items)[ItemsPerThread]) -> typename std::enable_if::value>::type { typedef typename detail::match_vector_type::type vector_type; constexpr unsigned int vectors_per_thread = (sizeof(T) * ItemsPerThread) / sizeof(vector_type); @@ -193,13 +194,13 @@ block_load_direct_blocked_vectorized(unsigned int flat_id, const vector_type* vector_ptr = reinterpret_cast(block_input) + (flat_id * vectors_per_thread); - #pragma unroll + ROCPRIM_UNROLL for (unsigned int item = 0; item < vectors_per_thread; item++) { vector_items[item] = *(vector_ptr + item); } - #pragma unroll + ROCPRIM_UNROLL for (unsigned int item = 0; item < ItemsPerThread; item++) { items[item] = *(reinterpret_cast(vector_items) + item); @@ -212,10 +213,10 @@ template< unsigned int ItemsPerThread > ROCPRIM_DEVICE inline -typename std::enable_if()>::type +auto block_load_direct_blocked_vectorized(unsigned int flat_id, T* block_input, - U (&items)[ItemsPerThread]) + U (&items)[ItemsPerThread]) -> typename std::enable_if::value>::type { block_load_direct_blocked(flat_id, block_input, items); } @@ -249,7 +250,7 @@ void block_load_direct_striped(unsigned int flat_id, T (&items)[ItemsPerThread]) { InputIterator thread_iter = block_input + flat_id; - #pragma unroll + ROCPRIM_UNROLL for (unsigned int item = 0; item < ItemsPerThread; item++) { items[item] = thread_iter[item * BlockSize]; @@ -287,7 +288,7 @@ void block_load_direct_striped(unsigned int flat_id, unsigned int valid) { InputIterator thread_iter = block_input + flat_id; - #pragma unroll + ROCPRIM_UNROLL for (unsigned int item = 0; item < ItemsPerThread; item++) { unsigned 
int offset = item * BlockSize; @@ -333,7 +334,7 @@ void block_load_direct_striped(unsigned int flat_id, unsigned int valid, Default out_of_bounds) { - #pragma unroll + ROCPRIM_UNROLL for (unsigned int item = 0; item < ItemsPerThread; item++) { items[item] = out_of_bounds; @@ -385,7 +386,7 @@ void block_load_direct_warp_striped(unsigned int flat_id, unsigned int warp_offset = warp_id * WarpSize * ItemsPerThread; InputIterator thread_iter = block_input + thread_id + warp_offset; - #pragma unroll + ROCPRIM_UNROLL for (unsigned int item = 0; item < ItemsPerThread; item++) { items[item] = thread_iter[item * WarpSize]; @@ -437,7 +438,7 @@ void block_load_direct_warp_striped(unsigned int flat_id, unsigned int warp_offset = warp_id * WarpSize * ItemsPerThread; InputIterator thread_iter = block_input + thread_id + warp_offset; - #pragma unroll + ROCPRIM_UNROLL for (unsigned int item = 0; item < ItemsPerThread; item++) { unsigned int offset = item * WarpSize; @@ -493,7 +494,7 @@ void block_load_direct_warp_striped(unsigned int flat_id, static_assert(detail::is_power_of_two(WarpSize) && WarpSize <= device_warp_size(), "WarpSize must be a power of two and equal or less" "than the size of hardware warp."); - #pragma unroll + ROCPRIM_UNROLL for (unsigned int item = 0; item < ItemsPerThread; item++) { items[item] = out_of_bounds; diff --git a/rocprim/include/rocprim/block/block_radix_sort.hpp b/rocprim/include/rocprim/block/block_radix_sort.hpp index 9b04860e3..40c2ffeba 100644 --- a/rocprim/include/rocprim/block/block_radix_sort.hpp +++ b/rocprim/include/rocprim/block/block_radix_sort.hpp @@ -109,6 +109,14 @@ class block_bit_plus_scan warp_scan_prefix_type().inclusive_scan(prefix, prefix, ::rocprim::plus()); storage_.warp_prefixes[flat_id] = prefix; } +#ifdef __HIP_CPU_RT__ + else + { + // HIP-CPU doesn't implement lockstep behavior, so the divergent branch needs to invoke the same number of sync ops. + empty_type empty; + ::rocprim::detail::warp_scan_crosslane().inclusive_scan(empty, empty, empty_binary_op{}); + } +#endif ::rocprim::syncthreads(); // Perform exclusive warp scan of bit values @@ -207,7 +215,7 @@ class block_radix_sort typename bit_keys_exchange_type::storage_type bit_keys_exchange; typename values_exchange_type::storage_type values_exchange; }; - typename bit_block_scan::storage_type bit_block_scan; + typename block_radix_sort::bit_block_scan::storage_type bit_block_scan; }; public: @@ -893,6 +901,11 @@ class block_radix_sort } unsigned int ranks[ItemsPerThread]; +#ifdef __HIP_CPU_RT__ + // TODO: Check if really necessary + // Initialize contents, as non-hipcc compilers don't unconditionally zero out allocated memory + std::memset(ranks, 0, ItemsPerThread * sizeof(decltype(ranks[0]))); +#endif unsigned int count; bit_block_scan().exclusive_scan(bits, ranks, count, storage_.bit_block_scan); diff --git a/rocprim/include/rocprim/block/block_reduce.hpp b/rocprim/include/rocprim/block/block_reduce.hpp index df9ef6fb9..dc6d3cc67 100644 --- a/rocprim/include/rocprim/block/block_reduce.hpp +++ b/rocprim/include/rocprim/block/block_reduce.hpp @@ -176,7 +176,7 @@ class block_reduce /// each provides one \p float value. /// /// \code{.cpp} - /// __global__ void example_kernel(...) // hipBlockDim_x = 256 + /// __global__ void example_kernel(...) // blockDim.x = 256 /// { /// // specialize block_reduce for float and block of 256 threads /// using block_reduce_f = rocprim::block_reduce<float, 256>; @@ -257,7 +257,7 @@ class block_reduce /// each provides two \p long value. 
/// /// \code{.cpp} - /// __global__ void example_kernel(...) // hipBlockDim_x = 128 + /// __global__ void example_kernel(...) // blockDim.x = 128 /// { /// // specialize block_reduce for long and block of 128 threads /// using block_reduce_f = rocprim::block_reduce; @@ -345,7 +345,7 @@ class block_reduce /// each provides one \p float value. /// /// \code{.cpp} - /// __global__ void example_kernel(...) // hipBlockDim_x = 256 + /// __global__ void example_kernel(...) // blockDim.x = 256 /// { /// // specialize block_reduce for float and block of 256 threads /// using block_reduce_f = rocprim::block_reduce; diff --git a/rocprim/include/rocprim/block/block_scan.hpp b/rocprim/include/rocprim/block/block_scan.hpp index b30a3bdc6..ac11c45f1 100644 --- a/rocprim/include/rocprim/block/block_scan.hpp +++ b/rocprim/include/rocprim/block/block_scan.hpp @@ -171,7 +171,7 @@ class block_scan /// each provides one \p float value. /// /// \code{.cpp} - /// __global__ void example_kernel(...) // hipBlockDim_x = 256 + /// __global__ void example_kernel(...) // blockDim.x = 256 /// { /// // specialize block_scan for float and block of 256 threads /// using block_scan_f = rocprim::block_scan; @@ -252,7 +252,7 @@ class block_scan /// each provides one \p float value. /// /// \code{.cpp} - /// __global__ void example_kernel(...) // hipBlockDim_x = 256 + /// __global__ void example_kernel(...) // blockDim.x = 256 /// { /// // specialize block_scan for float and block of 256 threads /// using block_scan_f = rocprim::block_scan; @@ -363,7 +363,7 @@ class block_scan /// } /// }; /// - /// __global__ void example_kernel(...) // hipBlockDim_x = 256 + /// __global__ void example_kernel(...) // blockDim.x = 256 /// { /// // specialize block_scan for int and block of 256 threads /// using block_scan_f = rocprim::block_scan; @@ -429,7 +429,7 @@ class block_scan /// each provides two \p long value. /// /// \code{.cpp} - /// __global__ void example_kernel(...) // hipBlockDim_x = 128 + /// __global__ void example_kernel(...) // blockDim.x = 128 /// { /// // specialize block_scan for long and block of 128 threads /// using block_scan_f = rocprim::block_scan; @@ -532,7 +532,7 @@ class block_scan /// each provides two \p long value. /// /// \code{.cpp} - /// __global__ void example_kernel(...) // hipBlockDim_x = 128 + /// __global__ void example_kernel(...) // blockDim.x = 128 /// { /// // specialize block_scan for long and block of 128 threads /// using block_scan_f = rocprim::block_scan; @@ -664,7 +664,7 @@ class block_scan /// } /// }; /// - /// __global__ void example_kernel(...) // hipBlockDim_x = 128 + /// __global__ void example_kernel(...) // blockDim.x = 128 /// { /// // specialize block_scan for int and block of 128 threads /// using block_scan_f = rocprim::block_scan; @@ -739,7 +739,7 @@ class block_scan /// each provides one \p float value. /// /// \code{.cpp} - /// __global__ void example_kernel(...) // hipBlockDim_x = 256 + /// __global__ void example_kernel(...) // blockDim.x = 256 /// { /// // specialize block_scan for float and block of 256 threads /// using block_scan_f = rocprim::block_scan; @@ -828,7 +828,7 @@ class block_scan /// each provides one \p float value. /// /// \code{.cpp} - /// __global__ void example_kernel(...) // hipBlockDim_x = 256 + /// __global__ void example_kernel(...) 
// blockDim.x = 256 /// { /// // specialize block_scan for float and block of 256 threads /// using block_scan_f = rocprim::block_scan; @@ -945,7 +945,7 @@ class block_scan /// } /// }; /// - /// __global__ void example_kernel(...) // hipBlockDim_x = 256 + /// __global__ void example_kernel(...) // blockDim.x = 256 /// { /// // specialize block_scan for int and block of 256 threads /// using block_scan_f = rocprim::block_scan; @@ -1013,7 +1013,7 @@ class block_scan /// each provides two \p long value. /// /// \code{.cpp} - /// __global__ void example_kernel(...) // hipBlockDim_x = 128 + /// __global__ void example_kernel(...) // blockDim.x = 128 /// { /// // specialize block_scan for long and block of 128 threads /// using block_scan_f = rocprim::block_scan; @@ -1124,7 +1124,7 @@ class block_scan /// each provides two \p long value. /// /// \code{.cpp} - /// __global__ void example_kernel(...) // hipBlockDim_x = 128 + /// __global__ void example_kernel(...) // blockDim.x = 128 /// { /// // specialize block_scan for long and block of 128 threads /// using block_scan_f = rocprim::block_scan; @@ -1263,7 +1263,7 @@ class block_scan /// } /// }; /// - /// __global__ void example_kernel(...) // hipBlockDim_x = 128 + /// __global__ void example_kernel(...) // blockDim.x = 128 /// { /// // specialize block_scan for int and block of 128 threads /// using block_scan_f = rocprim::block_scan; diff --git a/rocprim/include/rocprim/block/block_shuffle.hpp b/rocprim/include/rocprim/block/block_shuffle.hpp index 979a691f9..b9122c165 100644 --- a/rocprim/include/rocprim/block/block_shuffle.hpp +++ b/rocprim/include/rocprim/block/block_shuffle.hpp @@ -309,7 +309,7 @@ class block_shuffle ::rocprim::syncthreads(); - #pragma unroll + ROCPRIM_UNROLL for (unsigned int i = ItemsPerThread - 1; i > 0; --i) { prev[i] = input[i - 1]; @@ -424,7 +424,7 @@ class block_shuffle ::rocprim::syncthreads(); - #pragma unroll + ROCPRIM_UNROLL for (unsigned int i = 0; i < (ItemsPerThread - 1); ++i) { next[i] = input[i + 1]; diff --git a/rocprim/include/rocprim/block/block_store.hpp b/rocprim/include/rocprim/block/block_store.hpp index 980171dfd..a449a6392 100644 --- a/rocprim/include/rocprim/block/block_store.hpp +++ b/rocprim/include/rocprim/block/block_store.hpp @@ -110,7 +110,7 @@ enum class block_store_method /// \code{.cpp} /// __global__ void kernel(int * output) /// { -/// const int offset = hipBlockIdx_x * 128 * 8; +/// const int offset = blockIdx.x * 128 * 8; /// int items[8]; /// rocprim::block_store blockstore; /// blockstore.store(output + offset, items); @@ -300,10 +300,10 @@ class block_store(); - block_store_direct_blocked_vectorized(flat_id, block_output, items); + block_store_direct_blocked_vectorized(flat_id, block_output, _items); } template diff --git a/rocprim/include/rocprim/block/block_store_func.hpp b/rocprim/include/rocprim/block/block_store_func.hpp index 30eece97b..ff26aaabf 100644 --- a/rocprim/include/rocprim/block/block_store_func.hpp +++ b/rocprim/include/rocprim/block/block_store_func.hpp @@ -65,7 +65,7 @@ void block_store_direct_blocked(unsigned int flat_id, unsigned int offset = flat_id * ItemsPerThread; OutputIterator thread_iter = block_output + offset; - #pragma unroll + ROCPRIM_UNROLL for (unsigned int item = 0; item < ItemsPerThread; item++) { thread_iter[item] = items[item]; @@ -106,7 +106,7 @@ void block_store_direct_blocked(unsigned int flat_id, unsigned int offset = flat_id * ItemsPerThread; OutputIterator thread_iter = block_output + offset; - #pragma unroll + ROCPRIM_UNROLL for 
(unsigned int item = 0; item < ItemsPerThread; item++) { if (item + offset < valid) @@ -147,10 +147,10 @@ template< unsigned int ItemsPerThread > ROCPRIM_DEVICE inline -typename std::enable_if()>::type +auto block_store_direct_blocked_vectorized(unsigned int flat_id, T* block_output, - U (&items)[ItemsPerThread]) + U (&items)[ItemsPerThread]) -> typename std::enable_if::value>::type { static_assert(std::is_convertible::value, "The type U must be such that it can be implicitly converted to T."); @@ -162,7 +162,7 @@ block_store_direct_blocked_vectorized(unsigned int flat_id, vector_type raw_vector_items[vectors_per_thread]; T *raw_items = reinterpret_cast(raw_vector_items); - #pragma unroll + ROCPRIM_UNROLL for (unsigned int item = 0; item < ItemsPerThread; item++) { raw_items[item] = items[item]; @@ -177,10 +177,10 @@ template< unsigned int ItemsPerThread > ROCPRIM_DEVICE inline -typename std::enable_if()>::type +auto block_store_direct_blocked_vectorized(unsigned int flat_id, T* block_output, - U (&items)[ItemsPerThread]) + U (&items)[ItemsPerThread]) -> typename std::enable_if::value>::type { block_store_direct_blocked(flat_id, block_output, items); } @@ -218,7 +218,7 @@ void block_store_direct_striped(unsigned int flat_id, "can be dereferenced and assigned a value of type T."); OutputIterator thread_iter = block_output + flat_id; - #pragma unroll + ROCPRIM_UNROLL for (unsigned int item = 0; item < ItemsPerThread; item++) { thread_iter[item * BlockSize] = items[item]; @@ -260,7 +260,7 @@ void block_store_direct_striped(unsigned int flat_id, "can be dereferenced and assigned a value of type T."); OutputIterator thread_iter = block_output + flat_id; - #pragma unroll + ROCPRIM_UNROLL for (unsigned int item = 0; item < ItemsPerThread; item++) { unsigned int offset = item * BlockSize; @@ -318,7 +318,7 @@ void block_store_direct_warp_striped(unsigned int flat_id, unsigned int warp_offset = warp_id * WarpSize * ItemsPerThread; OutputIterator thread_iter = block_output + thread_id + warp_offset; - #pragma unroll + ROCPRIM_UNROLL for (unsigned int item = 0; item < ItemsPerThread; item++) { thread_iter[item * WarpSize] = items[item]; @@ -374,7 +374,7 @@ void block_store_direct_warp_striped(unsigned int flat_id, unsigned int warp_offset = warp_id * WarpSize * ItemsPerThread; OutputIterator thread_iter = block_output + thread_id + warp_offset; - #pragma unroll + ROCPRIM_UNROLL for (unsigned int item = 0; item < ItemsPerThread; item++) { unsigned int offset = item * WarpSize; diff --git a/rocprim/include/rocprim/block/detail/block_histogram_atomic.hpp b/rocprim/include/rocprim/block/detail/block_histogram_atomic.hpp index b0d8a4c76..3a76def6f 100644 --- a/rocprim/include/rocprim/block/detail/block_histogram_atomic.hpp +++ b/rocprim/include/rocprim/block/detail/block_histogram_atomic.hpp @@ -63,7 +63,7 @@ class block_histogram_atomic std::is_same::value || std::is_same::value, "Counter must be type that is supported by atomics (float, int, unsigned int, unsigned long long)" ); - #pragma unroll + ROCPRIM_UNROLL for (unsigned int i = 0; i < ItemsPerThread; ++i) { ::rocprim::detail::atomic_add(&hist[static_cast(input[i])], Counter(1)); diff --git a/rocprim/include/rocprim/block/detail/block_histogram_sort.hpp b/rocprim/include/rocprim/block/detail/block_histogram_sort.hpp index 56fae48d0..313b2c99a 100644 --- a/rocprim/include/rocprim/block/detail/block_histogram_sort.hpp +++ b/rocprim/include/rocprim/block/detail/block_histogram_sort.hpp @@ -86,10 +86,11 @@ class block_histogram_sort Counter hist[Bins], 
storage_type& storage) { - static_assert( - std::is_convertible<unsigned int, Counter>::value, - "unsigned int must be convertible to Counter" - ); + // TODO: Re-check; MSVC rejects this static assertion even though the code compiles fine for all tested types. The predicate is likely too strict. + //static_assert( + // std::is_convertible<unsigned int, Counter>::value, + // "unsigned int must be convertible to Counter" + //); constexpr auto tile_size = BlockSize * ItemsPerThread; const auto flat_tid = ::rocprim::flat_block_thread_id(); unsigned int head_flags[ItemsPerThread]; @@ -99,7 +100,7 @@ class block_histogram_sort radix_sort().sort(input, storage_.sort); ::rocprim::syncthreads(); // Fix race condition that appeared on Vega10 hardware; storage LDS is reused below. - #pragma unroll + ROCPRIM_UNROLL for(unsigned int offset = 0; offset < Bins; offset += BlockSize) { const unsigned int offset_tid = offset + flat_tid; @@ -121,7 +122,7 @@ class block_histogram_sort } ::rocprim::syncthreads(); - #pragma unroll + ROCPRIM_UNROLL for(unsigned int offset = 0; offset < Bins; offset += BlockSize) { const unsigned int offset_tid = offset + flat_tid; diff --git a/rocprim/include/rocprim/block/detail/block_reduce_raking_reduce.hpp b/rocprim/include/rocprim/block/detail/block_reduce_raking_reduce.hpp index 82176694e..505649c70 100644 --- a/rocprim/include/rocprim/block/detail/block_reduce_raking_reduce.hpp +++ b/rocprim/include/rocprim/block/detail/block_reduce_raking_reduce.hpp @@ -117,7 +117,7 @@ class block_reduce_raking_reduce { // Reduce thread items T thread_input = input[0]; - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = 1; i < ItemsPerThread; i++) { thread_input = reduce_op(thread_input, input[i]); diff --git a/rocprim/include/rocprim/block/detail/block_reduce_warp_reduce.hpp b/rocprim/include/rocprim/block/detail/block_reduce_warp_reduce.hpp index c020c1b3e..d8485a855 100644 --- a/rocprim/include/rocprim/block/detail/block_reduce_warp_reduce.hpp +++ b/rocprim/include/rocprim/block/detail/block_reduce_warp_reduce.hpp @@ -106,7 +106,7 @@ class block_reduce_warp_reduce { // Reduce thread items T thread_input = input[0]; - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = 1; i < ItemsPerThread; i++) { thread_input = reduce_op(thread_input, input[i]); diff --git a/rocprim/include/rocprim/block/detail/block_scan_reduce_then_scan.hpp b/rocprim/include/rocprim/block/detail/block_scan_reduce_then_scan.hpp index fbbe29c59..b8e2e17d5 100644 --- a/rocprim/include/rocprim/block/detail/block_scan_reduce_then_scan.hpp +++ b/rocprim/include/rocprim/block/detail/block_scan_reduce_then_scan.hpp @@ -145,7 +145,7 @@ class block_scan_reduce_then_scan { // Reduce thread items T thread_input = input[0]; - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = 1; i < ItemsPerThread; i++) { thread_input = scan_op(thread_input, input[i]); @@ -164,7 +164,7 @@ class block_scan_reduce_then_scan output[0] = input[0]; if(flat_tid != 0) output[0] = scan_op(thread_input, input[0]); // Final thread-local scan - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = 1; i < ItemsPerThread; i++) { output[i] = scan_op(output[i-1], input[i]); @@ -221,7 +221,7 @@ class block_scan_reduce_then_scan storage_type_& storage_ = storage.get(); // Reduce thread items T thread_input = input[0]; - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = 1; i < ItemsPerThread; i++) { thread_input = scan_op(thread_input, input[i]); @@ -249,7 +249,7 @@ class block_scan_reduce_then_scan // Include block prefix output[0] = scan_op(block_prefix, output[0]); // Final thread-local scan - #pragma unroll + 
ROCPRIM_UNROLL for(unsigned int i = 1; i < ItemsPerThread; i++) { output[i] = scan_op(output[i-1], input[i]); @@ -344,7 +344,7 @@ class block_scan_reduce_then_scan { // Reduce thread items T thread_input = input[0]; - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = 1; i < ItemsPerThread; i++) { thread_input = scan_op(thread_input, input[i]); @@ -368,7 +368,7 @@ class block_scan_reduce_then_scan exclusive = thread_input; } output[0] = exclusive; - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = 1; i < ItemsPerThread; i++) { exclusive = scan_op(exclusive, prev); @@ -430,7 +430,7 @@ class block_scan_reduce_then_scan storage_type_& storage_ = storage.get(); // Reduce thread items T thread_input = input[0]; - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = 1; i < ItemsPerThread; i++) { thread_input = scan_op(thread_input, input[i]); @@ -460,7 +460,7 @@ class block_scan_reduce_then_scan exclusive = scan_op(block_prefix, thread_input); } output[0] = exclusive; - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = 1; i < ItemsPerThread; i++) { exclusive = scan_op(exclusive, prev); @@ -507,7 +507,7 @@ class block_scan_reduce_then_scan const unsigned int idx_end = idx_start + thread_reduction_size_; T thread_reduction = storage_.threads[idx_start]; - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = idx_start + 1; i < idx_end; i++) { thread_reduction = scan_op( @@ -527,7 +527,7 @@ class block_scan_reduce_then_scan } storage_.threads[idx_start] = thread_reduction; - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = idx_start + 1; i < idx_end; i++) { thread_reduction = scan_op( diff --git a/rocprim/include/rocprim/block/detail/block_scan_warp_scan.hpp b/rocprim/include/rocprim/block/detail/block_scan_warp_scan.hpp index 84b8a6f2a..34762ee01 100644 --- a/rocprim/include/rocprim/block/detail/block_scan_warp_scan.hpp +++ b/rocprim/include/rocprim/block/detail/block_scan_warp_scan.hpp @@ -157,7 +157,7 @@ class block_scan_warp_scan { // Reduce thread items T thread_input = input[0]; - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = 1; i < ItemsPerThread; i++) { thread_input = scan_op(thread_input, input[i]); @@ -179,7 +179,7 @@ class block_scan_warp_scan output[0] = scan_op(thread_input, input[0]); } // Final thread-local scan - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = 1; i < ItemsPerThread; i++) { output[i] = scan_op(output[i-1], input[i]); @@ -236,7 +236,7 @@ class block_scan_warp_scan storage_type_& storage_ = storage.get(); // Reduce thread items T thread_input = input[0]; - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = 1; i < ItemsPerThread; i++) { thread_input = scan_op(thread_input, input[i]); @@ -267,7 +267,7 @@ class block_scan_warp_scan // Include block prefix output[0] = scan_op(block_prefix, output[0]); // Final thread-local scan - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = 1; i < ItemsPerThread; i++) { output[i] = scan_op(output[i-1], input[i]); @@ -366,7 +366,7 @@ class block_scan_warp_scan { // Reduce thread items T thread_input = input[0]; - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = 1; i < ItemsPerThread; i++) { thread_input = scan_op(thread_input, input[i]); @@ -391,7 +391,7 @@ class block_scan_warp_scan } output[0] = exclusive; - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = 1; i < ItemsPerThread; i++) { exclusive = scan_op(exclusive, prev); @@ -453,7 +453,7 @@ class block_scan_warp_scan storage_type_& storage_ = storage.get(); // Reduce thread items T thread_input = input[0]; - #pragma unroll + ROCPRIM_UNROLL 
for(unsigned int i = 1; i < ItemsPerThread; i++) { thread_input = scan_op(thread_input, input[i]); @@ -484,7 +484,7 @@ class block_scan_warp_scan } output[0] = exclusive; - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = 1; i < ItemsPerThread; i++) { exclusive = scan_op(exclusive, prev); diff --git a/rocprim/include/rocprim/block/detail/block_sort_bitonic.hpp b/rocprim/include/rocprim/block/detail/block_sort_bitonic.hpp index 16060a79d..84180e548 100644 --- a/rocprim/include/rocprim/block/detail/block_sort_bitonic.hpp +++ b/rocprim/include/rocprim/block/detail/block_sort_bitonic.hpp @@ -240,11 +240,11 @@ class block_sort_bitonic }; wsort.sort(kv..., compare_function2); - #pragma unroll + ROCPRIM_UNROLL for(unsigned int length = ::rocprim::device_warp_size(); length < Size; length *= 2) { bool dir = (flat_tid & (length * 2)) != 0; - #pragma unroll + ROCPRIM_UNROLL for(unsigned int k = length; k > 0; k /= 2) { copy_to_shared(kv..., flat_tid, storage); @@ -301,7 +301,7 @@ class block_sort_bitonic unsigned int odd_id = (is_even) ? ::rocprim::max(flat_tid, 1u) - 1 : ::rocprim::min(flat_tid + 1, Size - 1); unsigned int even_id = (is_even) ? ::rocprim::min(flat_tid + 1, Size - 1) : ::rocprim::max(flat_tid, 1u) - 1; - #pragma unroll + ROCPRIM_UNROLL for(unsigned int length = 0; length < Size; length++) { unsigned int next_id = (length % 2) == 0 ? even_id : odd_id; diff --git a/rocprim/include/rocprim/config.hpp b/rocprim/include/rocprim/config.hpp index 0a9f6197e..caa108a31 100644 --- a/rocprim/include/rocprim/config.hpp +++ b/rocprim/include/rocprim/config.hpp @@ -93,4 +93,12 @@ #define ROCPRIM_WARP_SIZE_64 64u #define ROCPRIM_MAX_WARP_SIZE ROCPRIM_WARP_SIZE_64 +#if (defined(_MSC_VER) && !defined(__clang__)) || defined(__GNUC__) +#define ROCPRIM_UNROLL +#define ROCPRIM_NO_UNROLL +#else +#define ROCPRIM_UNROLL _Pragma("unroll") +#define ROCPRIM_NO_UNROLL _Pragma("nounroll") +#endif + #endif // ROCPRIM_CONFIG_HPP_ diff --git a/rocprim/include/rocprim/detail/various.hpp b/rocprim/include/rocprim/detail/various.hpp index 613f57ea4..d98f653af 100644 --- a/rocprim/include/rocprim/detail/various.hpp +++ b/rocprim/include/rocprim/detail/various.hpp @@ -127,12 +127,7 @@ struct match_vector_type // Checks if Items is odd and ensures that size of T is smaller than vector_type. template -ROCPRIM_HOST_DEVICE -constexpr bool is_vectorizable() -{ - return (Items % 2 == 0) && - (sizeof(T) < sizeof(typename match_vector_type::type)); -} +struct is_vectorizable : std::integral_constant::type))> {}; // Returns the number of LDS (local data share) banks. 
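For context on the many "#pragma unroll" to ROCPRIM_UNROLL replacements throughout this patch: the macro added to rocprim/config.hpp above expands either to _Pragma("unroll") or to nothing, so the unroll hint is kept where the pragma is understood and silently dropped for MSVC and GCC builds (including the HIP-CPU path), instead of tripping unknown-pragma warnings under -Wall -Wextra -Werror. A minimal, self-contained sketch of how a caller sees it; the function below is illustrative only and not part of rocPRIM:

// Stand-in for the definitions in rocprim/config.hpp so this sketch compiles on its own.
#ifndef ROCPRIM_UNROLL
    #if (defined(_MSC_VER) && !defined(__clang__)) || defined(__GNUC__)
        #define ROCPRIM_UNROLL
    #else
        #define ROCPRIM_UNROLL _Pragma("unroll")
    #endif
#endif

// Where the _Pragma branch is active the loop carries the unroll hint;
// elsewhere the macro vanishes and the code still compiles unchanged.
float sum4(const float (&values)[4])
{
    float acc = values[0];
    ROCPRIM_UNROLL
    for(unsigned int i = 1; i < 4; ++i)
    {
        acc += values[i];
    }
    return acc;
}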
ROCPRIM_HOST_DEVICE @@ -168,7 +163,13 @@ ROCPRIM_DEVICE inline auto store_volatile(T * output, T value) -> typename std::enable_if::value>::type { + // TODO: check GCC + // error: binding reference of type ‘const half_float::half&’ to ‘volatile half_float::half’ discards qualifiers +#if !(defined(__HIP_CPU_RT__ ) && defined(__GNUC__)) *const_cast(output) = value; +#else + *output = value; +#endif } template @@ -182,7 +183,7 @@ auto store_volatile(T * output, T value) auto input_ptr = reinterpret_cast(&value); auto output_ptr = reinterpret_cast(output); - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = 0; i < n; i++) { output_ptr[i] = input_ptr[i]; @@ -194,8 +195,14 @@ ROCPRIM_DEVICE inline auto load_volatile(T * input) -> typename std::enable_if::value, T>::type { + // TODO: check GCC + // error: binding reference of type ‘const half_float::half&’ to ‘volatile half_float::half’ discards qualifiers +#if !(defined(__HIP_CPU_RT__ ) && defined(__GNUC__)) T retval = *const_cast(input); return retval; +#else + return *input; +#endif } template @@ -210,7 +217,7 @@ auto load_volatile(T * input) auto output_ptr = reinterpret_cast(&retval); auto input_ptr = reinterpret_cast(input); - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = 0; i < n; i++) { output_ptr[i] = input_ptr[i]; diff --git a/rocprim/include/rocprim/device/detail/device_merge.hpp b/rocprim/include/rocprim/device/detail/device_merge.hpp index 29673ce58..5d3276cf0 100644 --- a/rocprim/include/rocprim/device/detail/device_merge.hpp +++ b/rocprim/include/rocprim/device/detail/device_merge.hpp @@ -161,7 +161,7 @@ void load(unsigned int flat_id, const size_t input1_size, const size_t input2_size) { - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = 0; i < ItemsPerThread; ++i) { unsigned int index = BlockSize * i + flat_id; @@ -193,7 +193,7 @@ void serial_merge(KeyType * keys_shared, KeyType a = keys_shared[range.begin1]; KeyType b = keys_shared[range.begin2]; - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = 0; i < ItemsPerThread; ++i) { bool compare = (range.begin2 >= range.end2) || @@ -298,7 +298,7 @@ merge_values(unsigned int flat_id, if(count >= ItemsPerThread * BlockSize) { - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = 0; i < ItemsPerThread; ++i) { values[i] = (index[i] < input1_size) ? 
values_input1[index[i]] : @@ -307,7 +307,7 @@ merge_values(unsigned int flat_id, } else { - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = 0; i < ItemsPerThread; ++i) { if(flat_id * ItemsPerThread + i < count) diff --git a/rocprim/include/rocprim/device/detail/device_partition.hpp b/rocprim/include/rocprim/device/detail/device_partition.hpp index 582fe70a8..cd1039362 100644 --- a/rocprim/include/rocprim/device/detail/device_partition.hpp +++ b/rocprim/include/rocprim/device/detail/device_partition.hpp @@ -182,7 +182,7 @@ auto partition_block_load_flags(InputIterator /* block_predecessor */, if(is_last_block) // last block { const auto offset = block_thread_id * ItemsPerThread; - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = 0; i < ItemsPerThread; i++) { if((offset + i) < valid_in_last_block) @@ -197,7 +197,7 @@ auto partition_block_load_flags(InputIterator /* block_predecessor */, } else { - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = 0; i < ItemsPerThread; i++) { is_selected[i] = predicate(values[i]); @@ -314,7 +314,7 @@ auto partition_block_load_flags(InputIterator block_predecessor, if(is_last_block) { const auto offset = block_thread_id * ItemsPerThread; - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = 0; i < ItemsPerThread; i++) { if((offset + i) >= valid_in_last_block) @@ -354,7 +354,7 @@ auto partition_scatter(ValueType (&values)[ItemsPerThread], // Scatter selected/rejected values to shared memory auto scatter_storage = storage.get(); - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = 0; i < ItemsPerThread; i++) { unsigned int item_index = (flat_block_thread_id * ItemsPerThread) + i; @@ -366,7 +366,7 @@ auto partition_scatter(ValueType (&values)[ItemsPerThread], } ::rocprim::syncthreads(); // sync threads to reuse shared memory - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = 0; i < ItemsPerThread; i++) { unsigned int item_index = (i * BlockSize) + flat_block_thread_id; @@ -421,7 +421,7 @@ auto partition_scatter(ValueType (&values)[ItemsPerThread], { // Scatter selected values to shared memory auto scatter_storage = storage.get(); - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = 0; i < ItemsPerThread; i++) { unsigned int scatter_index = output_indices[i] - selected_prefix; @@ -440,7 +440,7 @@ auto partition_scatter(ValueType (&values)[ItemsPerThread], } else { - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = 0; i < ItemsPerThread; i++) { if(!is_last_block || output_indices[i] < (selected_prefix + selected_in_block)) @@ -578,7 +578,7 @@ void partition_kernel_impl(InputIterator input, ); // Convert true/false is_selected flags to 0s and 1s - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = 0; i < items_per_thread; i++) { output_indices[i] = is_selected[i] ? 
1 : 0; diff --git a/rocprim/include/rocprim/device/detail/device_reduce.hpp b/rocprim/include/rocprim/device/detail/device_reduce.hpp index a234aa943..294ed357c 100644 --- a/rocprim/include/rocprim/device/detail/device_reduce.hpp +++ b/rocprim/include/rocprim/device/detail/device_reduce.hpp @@ -116,7 +116,7 @@ void block_reduce_kernel_impl(InputIterator input, ); output_value = values[0]; - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = 1; i < items_per_thread; i++) { unsigned int offset = i * block_size; diff --git a/rocprim/include/rocprim/device/detail/device_segmented_radix_sort.hpp b/rocprim/include/rocprim/device/detail/device_segmented_radix_sort.hpp index 38d64cdde..dc75b9933 100644 --- a/rocprim/include/rocprim/device/detail/device_segmented_radix_sort.hpp +++ b/rocprim/include/rocprim/device/detail/device_segmented_radix_sort.hpp @@ -68,8 +68,8 @@ class segmented_radix_sort_helper union storage_type { - typename count_helper_type::storage_type count_helper; - typename sort_and_scatter_helper::storage_type sort_and_scatter_helper; + typename segmented_radix_sort_helper::count_helper_type::storage_type count_helper; + typename segmented_radix_sort_helper::sort_and_scatter_helper::storage_type sort_and_scatter_helper; }; template< @@ -517,9 +517,9 @@ void segmented_sort(KeysInputIterator keys_input, ROCPRIM_SHARED_MEMORY union { - typename single_block_helper::storage_type single_block_helper; - typename long_radix_helper_type::storage_type long_radix_helper; - typename short_radix_helper_type::storage_type short_radix_helper; + typename rocprim::detail::segmented_radix_sort_single_block_helper::storage_type single_block_helper; + typename rocprim::detail::segmented_radix_sort_helper::storage_type long_radix_helper; + typename rocprim::detail::segmented_radix_sort_helper::storage_type short_radix_helper; } storage; const unsigned int segment_id = ::rocprim::detail::block_id<0>(); diff --git a/rocprim/include/rocprim/device/detail/device_transform.hpp b/rocprim/include/rocprim/device/detail/device_transform.hpp index 593ff885e..74aefc16b 100644 --- a/rocprim/include/rocprim/device/detail/device_transform.hpp +++ b/rocprim/include/rocprim/device/detail/device_transform.hpp @@ -109,7 +109,7 @@ void transform_kernel_impl(InputIterator input, valid_in_last_block ); - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = 0; i < ItemsPerThread; i++) { if(BlockSize * i + flat_id < valid_in_last_block) @@ -133,7 +133,7 @@ void transform_kernel_impl(InputIterator input, input_values ); - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = 0; i < ItemsPerThread; i++) { output_values[i] = transform_op(input_values[i]); diff --git a/rocprim/include/rocprim/device/detail/lookback_scan_state.hpp b/rocprim/include/rocprim/device/detail/lookback_scan_state.hpp index 65709d045..947e5a3ec 100644 --- a/rocprim/include/rocprim/device/detail/lookback_scan_state.hpp +++ b/rocprim/include/rocprim/device/detail/lookback_scan_state.hpp @@ -122,7 +122,11 @@ struct lookback_scan_state prefix_type prefix; prefix.flag = PREFIX_EMPTY; prefix_underlying_type p; +#ifndef __HIP_CPU_RT__ __builtin_memcpy(&p, &prefix, sizeof(prefix_type)); +#else + std::memcpy(&p, &prefix, sizeof(prefix_type)); +#endif prefixes[padding + block_id] = p; } if(block_id < padding) @@ -130,7 +134,11 @@ struct lookback_scan_state prefix_type prefix; prefix.flag = PREFIX_INVALID; prefix_underlying_type p; +#ifndef __HIP_CPU_RT__ __builtin_memcpy(&p, &prefix, sizeof(prefix_type)); +#else + std::memcpy(&p, &prefix, sizeof(prefix_type)); 
+#endif prefixes[block_id] = p; } } @@ -155,23 +163,35 @@ struct lookback_scan_state prefix_type prefix; - const uint SLEEP_MAX = 32; - uint times_through = 1; + const unsigned int SLEEP_MAX = 32; + unsigned int times_through = 1; prefix_underlying_type p = ::rocprim::detail::atomic_add(&prefixes[padding + block_id], 0); +#ifndef __HIP_CPU_RT__ __builtin_memcpy(&prefix, &p, sizeof(prefix_type)); +#else + std::memcpy(&prefix, &p, sizeof(prefix_type)); +#endif while(prefix.flag == PREFIX_EMPTY) { if (UseSleep) { - for (uint j = 0; j < times_through; j++) + for (unsigned int j = 0; j < times_through; j++) +#ifndef __HIP_CPU_RT__ __builtin_amdgcn_s_sleep(1); +#else + std::this_thread::sleep_for(std::chrono::microseconds{1}); +#endif if (times_through < SLEEP_MAX) times_through++; } // atomic_add(..., 0) is used to load values atomically prefix_underlying_type p = ::rocprim::detail::atomic_add(&prefixes[padding + block_id], 0); +#ifndef __HIP_CPU_RT__ __builtin_memcpy(&prefix, &p, sizeof(prefix_type)); +#else + std::memcpy(&prefix, &p, sizeof(prefix_type)); +#endif } // return @@ -187,7 +207,11 @@ struct lookback_scan_state prefix_type prefix = { flag, value }; prefix_underlying_type p; +#ifndef __HIP_CPU_RT__ __builtin_memcpy(&p, &prefix, sizeof(prefix_type)); +#else + std::memcpy(&p, &prefix, sizeof(prefix_type)); +#endif ::rocprim::detail::atomic_exch(&prefixes[padding + block_id], p); } @@ -273,8 +297,8 @@ struct lookback_scan_state { constexpr unsigned int padding = ::rocprim::device_warp_size(); - const uint SLEEP_MAX = 32; - uint times_through = 1; + const unsigned int SLEEP_MAX = 32; + unsigned int times_through = 1; flag = load_volatile(&prefixes_flags[padding + block_id]); ::rocprim::detail::memory_fence_device(); @@ -282,8 +306,12 @@ struct lookback_scan_state { if (UseSleep) { - for (uint j = 0; j < times_through; j++) + for (unsigned int j = 0; j < times_through; j++) +#ifndef __HIP_CPU_RT__ __builtin_amdgcn_s_sleep(1); +#else + std::this_thread::sleep_for(std::chrono::microseconds{1}); +#endif if (times_through < SLEEP_MAX) times_through++; } diff --git a/rocprim/include/rocprim/device/device_histogram.hpp b/rocprim/include/rocprim/device/device_histogram.hpp index 8cf9a6281..26c94d531 100644 --- a/rocprim/include/rocprim/device/device_histogram.hpp +++ b/rocprim/include/rocprim/device/device_histogram.hpp @@ -110,16 +110,16 @@ void histogram_global_kernel(SampleIterator samples, #define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \ { \ - auto error = hipPeekAtLastError(); \ - if(error != hipSuccess) return error; \ + auto _error = hipPeekAtLastError(); \ + if(_error != hipSuccess) return _error; \ if(debug_synchronous) \ { \ std::cout << name << "(" << size << ")"; \ - auto error = hipStreamSynchronize(stream); \ - if(error != hipSuccess) return error; \ - auto end = std::chrono::high_resolution_clock::now(); \ - auto d = std::chrono::duration_cast>(end - start); \ - std::cout << " " << d.count() * 1000 << " ms" << '\n'; \ + auto __error = hipStreamSynchronize(stream); \ + if(__error != hipSuccess) return __error; \ + auto _end = std::chrono::high_resolution_clock::now(); \ + auto _d = std::chrono::duration_cast>(_end - start); \ + std::cout << " " << _d.count() * 1000 << " ms" << '\n'; \ } \ } @@ -151,9 +151,9 @@ hipError_t histogram_impl(void * temporary_storage, default_histogram_config >; - constexpr unsigned int block_size = config::histogram::block_size; - constexpr unsigned int items_per_thread = config::histogram::items_per_thread; - constexpr unsigned 
int items_per_block = block_size * items_per_thread; + static constexpr unsigned int block_size = config::histogram::block_size; + static constexpr unsigned int items_per_thread = config::histogram::items_per_thread; + static constexpr unsigned int items_per_block = block_size * items_per_thread; if(row_stride_bytes % sizeof(sample_type) != 0) { diff --git a/rocprim/include/rocprim/device/device_merge.hpp b/rocprim/include/rocprim/device/device_merge.hpp index 46ecfbc35..16a118950 100644 --- a/rocprim/include/rocprim/device/device_merge.hpp +++ b/rocprim/include/rocprim/device/device_merge.hpp @@ -94,16 +94,16 @@ void merge_kernel(IndexIterator index, #define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \ { \ - auto error = hipPeekAtLastError(); \ - if(error != hipSuccess) return error; \ + auto _error = hipPeekAtLastError(); \ + if(_error != hipSuccess) return _error; \ if(debug_synchronous) \ { \ std::cout << name << "(" << size << ")"; \ - auto error = hipStreamSynchronize(stream); \ - if(error != hipSuccess) return error; \ - auto end = std::chrono::high_resolution_clock::now(); \ - auto d = std::chrono::duration_cast>(end - start); \ - std::cout << " " << d.count() * 1000 << " ms" << '\n'; \ + auto __error = hipStreamSynchronize(stream); \ + if(__error != hipSuccess) return __error; \ + auto _end = std::chrono::high_resolution_clock::now(); \ + auto _d = std::chrono::duration_cast>(_end - start); \ + std::cout << " " << _d.count() * 1000 << " ms" << '\n'; \ } \ } @@ -142,10 +142,10 @@ hipError_t merge_impl(void * temporary_storage, detail::default_merge_config >; - constexpr unsigned int block_size = config::block_size; - constexpr unsigned int half_block = block_size / 2; - constexpr unsigned int items_per_thread = config::items_per_thread; - constexpr auto items_per_block = block_size * items_per_thread; + static constexpr unsigned int block_size = config::block_size; + static constexpr unsigned int half_block = block_size / 2; + static constexpr unsigned int items_per_thread = config::items_per_thread; + static constexpr auto items_per_block = block_size * items_per_thread; const unsigned int partitions = ((input1_size + input2_size) + items_per_block - 1) / items_per_block; const size_t partition_bytes = (partitions + 1) * sizeof(unsigned int); diff --git a/rocprim/include/rocprim/device/device_merge_sort.hpp b/rocprim/include/rocprim/device/device_merge_sort.hpp index 843c98d59..076b43aea 100644 --- a/rocprim/include/rocprim/device/device_merge_sort.hpp +++ b/rocprim/include/rocprim/device/device_merge_sort.hpp @@ -98,16 +98,16 @@ void block_merge_kernel(KeysInputIterator keys_input, #define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \ { \ - auto error = hipPeekAtLastError(); \ - if(error != hipSuccess) return error; \ + auto _error = hipPeekAtLastError(); \ + if(_error != hipSuccess) return _error; \ if(debug_synchronous) \ { \ std::cout << name << "(" << size << ")"; \ - auto error = hipStreamSynchronize(stream); \ - if(error != hipSuccess) return error; \ - auto end = std::chrono::high_resolution_clock::now(); \ - auto d = std::chrono::duration_cast>(end - start); \ - std::cout << " " << d.count() * 1000 << " ms" << '\n'; \ + auto __error = hipStreamSynchronize(stream); \ + if(__error != hipSuccess) return __error; \ + auto _end = std::chrono::high_resolution_clock::now(); \ + auto _d = std::chrono::duration_cast>(_end - start); \ + std::cout << " " << _d.count() * 1000 << " ms" << '\n'; \ } \ } @@ -142,7 +142,7 @@ hipError_t 
merge_sort_impl(void * temporary_storage, >; // Block size - constexpr unsigned int block_size = config::block_size; + static constexpr unsigned int block_size = config::block_size; const size_t keys_bytes = ::rocprim::detail::align_size(size * sizeof(key_type)); const size_t values_bytes = diff --git a/rocprim/include/rocprim/device/device_partition.hpp b/rocprim/include/rocprim/device/device_partition.hpp index 986021723..96839c0b2 100644 --- a/rocprim/include/rocprim/device/device_partition.hpp +++ b/rocprim/include/rocprim/device/device_partition.hpp @@ -96,16 +96,16 @@ void init_offset_scan_state_kernel(OffsetLookBackScanState offset_scan_state, #define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \ { \ - auto error = hipPeekAtLastError(); \ - if(error != hipSuccess) return error; \ + auto _error = hipPeekAtLastError(); \ + if(_error != hipSuccess) return _error; \ if(debug_synchronous) \ { \ std::cout << name << "(" << size << ")"; \ - auto error = hipStreamSynchronize(stream); \ - if(error != hipSuccess) return error; \ - auto end = std::chrono::high_resolution_clock::now(); \ - auto d = std::chrono::duration_cast>(end - start); \ - std::cout << " " << d.count() * 1000 << " ms" << '\n'; \ + auto __error = hipStreamSynchronize(stream); \ + if(__error != hipSuccess) return __error; \ + auto _end = std::chrono::high_resolution_clock::now(); \ + auto _d = std::chrono::duration_cast>(_end - start); \ + std::cout << " " << _d.count() * 1000 << " ms" << '\n'; \ } \ } @@ -149,9 +149,9 @@ hipError_t partition_impl(void * temporary_storage, using ordered_block_id_type = detail::ordered_block_id; - constexpr unsigned int block_size = config::block_size; - constexpr unsigned int items_per_thread = config::items_per_thread; - constexpr auto items_per_block = block_size * items_per_thread; + static constexpr unsigned int block_size = config::block_size; + static constexpr unsigned int items_per_thread = config::items_per_thread; + static constexpr auto items_per_block = block_size * items_per_thread; const unsigned int number_of_blocks = std::max(1u, static_cast((size + items_per_block - 1)/items_per_block)); @@ -232,7 +232,7 @@ hipError_t partition_impl(void * temporary_storage, HIP_KERNEL_NAME(partition_kernel< SelectMethod, OnlySelected, config, InputIterator, FlagIterator, OutputIterator, SelectedCountOutputIterator, - UnaryPredicate, decltype(inequality_op), offset_scan_state_with_sleep_type + UnaryPredicate, InequalityOp, offset_scan_state_with_sleep_type >), dim3(grid_size), dim3(block_size), 0, stream, input, flags, output, selected_count_output, size, predicate, @@ -244,7 +244,7 @@ hipError_t partition_impl(void * temporary_storage, HIP_KERNEL_NAME(partition_kernel< SelectMethod, OnlySelected, config, InputIterator, FlagIterator, OutputIterator, SelectedCountOutputIterator, - UnaryPredicate, decltype(inequality_op), offset_scan_state_type + UnaryPredicate, InequalityOp, offset_scan_state_type >), dim3(grid_size), dim3(block_size), 0, stream, input, flags, output, selected_count_output, size, predicate, diff --git a/rocprim/include/rocprim/device/device_radix_sort.hpp b/rocprim/include/rocprim/device/device_radix_sort.hpp index 5419ac0b1..d3f19b274 100644 --- a/rocprim/include/rocprim/device/device_radix_sort.hpp +++ b/rocprim/include/rocprim/device/device_radix_sort.hpp @@ -127,16 +127,16 @@ void sort_and_scatter_kernel(KeysInputIterator keys_input, #define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \ { \ - auto error = hipPeekAtLastError(); \ - 
if(error != hipSuccess) return error; \ + auto _error = hipPeekAtLastError(); \ + if(_error != hipSuccess) return _error; \ if(debug_synchronous) \ { \ std::cout << name << "(" << size << ")"; \ - auto error = hipStreamSynchronize(stream); \ - if(error != hipSuccess) return error; \ - auto end = std::chrono::high_resolution_clock::now(); \ - auto d = std::chrono::duration_cast>(end - start); \ - std::cout << " " << d.count() * 1000 << " ms" << '\n'; \ + auto __error = hipStreamSynchronize(stream); \ + if(__error != hipSuccess) return __error; \ + auto _end = std::chrono::high_resolution_clock::now(); \ + auto _d = std::chrono::duration_cast>(_end - start); \ + std::cout << " " << _d.count() * 1000 << " ms" << '\n'; \ } \ } diff --git a/rocprim/include/rocprim/device/device_radix_sort_config.hpp b/rocprim/include/rocprim/device/device_radix_sort_config.hpp index 1fe6940cb..84eb68651 100644 --- a/rocprim/include/rocprim/device/device_radix_sort_config.hpp +++ b/rocprim/include/rocprim/device/device_radix_sort_config.hpp @@ -155,6 +155,51 @@ struct radix_sort_config_900 select_type_case, kernel_config<256, 15> > > > { }; + +template +struct radix_sort_config_908 +{ + static constexpr unsigned int item_scale = + ::rocprim::detail::ceiling_div(::rocprim::max(sizeof(Key), sizeof(Value)), sizeof(int)); + + using scan = kernel_config<256, 2>; + + using type = select_type< + select_type_case< + (sizeof(Key) == 1 && sizeof(Value) <= 8), + radix_sort_config<4, 4, scan, kernel_config<256, 10> > + >, + select_type_case< + (sizeof(Key) == 2 && sizeof(Value) <= 8), + radix_sort_config<6, 5, scan, kernel_config<256, 10> > + >, + select_type_case< + (sizeof(Key) == 4 && sizeof(Value) <= 8), + radix_sort_config<7, 6, kernel_config<256, 4>, kernel_config<256, 15> > + >, + select_type_case< + (sizeof(Key) == 8 && sizeof(Value) <= 8), + radix_sort_config<7, 6, kernel_config<256, 4>, kernel_config<256, 14> > + >, + radix_sort_config< + 6, 4, scan, + kernel_config< + limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_64>::value, + ::rocprim::max(1u, 15u / item_scale) + > + > + >; +}; + +template +struct radix_sort_config_908 + : select_type< + select_type_case, kernel_config<256, 10> > >, + select_type_case, kernel_config<256, 10> > >, + select_type_case, kernel_config<256, 17> > >, + select_type_case, kernel_config<256, 15> > > + > { }; + // TODO: We need to update these parameters template struct radix_sort_config_90a @@ -251,6 +296,7 @@ struct default_radix_sort_config TargetArch, select_arch_case<803, radix_sort_config_803 >, select_arch_case<900, radix_sort_config_900 >, + select_arch_case<908, radix_sort_config_908 >, select_arch_case >, select_arch_case<1030, radix_sort_config_1030 >, radix_sort_config_900 diff --git a/rocprim/include/rocprim/device/device_reduce.hpp b/rocprim/include/rocprim/device/device_reduce.hpp index 44fc9a6bf..9503d5585 100644 --- a/rocprim/include/rocprim/device/device_reduce.hpp +++ b/rocprim/include/rocprim/device/device_reduce.hpp @@ -65,25 +65,25 @@ void block_reduce_kernel(InputIterator input, if(debug_synchronous) \ { \ std::cout << name << "(" << size << ")"; \ - auto error = hipStreamSynchronize(stream); \ - if(error != hipSuccess) return error; \ - auto end = std::chrono::high_resolution_clock::now(); \ - auto d = std::chrono::duration_cast>(end - start); \ - std::cout << " " << d.count() * 1000 << " ms" << '\n'; \ + auto _error = hipStreamSynchronize(stream); \ + if(_error != hipSuccess) return _error; \ + auto _end = std::chrono::high_resolution_clock::now(); \ + 
auto _d = std::chrono::duration_cast>(_end - start); \ + std::cout << " " << _d.count() * 1000 << " ms" << '\n'; \ } #define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \ { \ - auto error = hipPeekAtLastError(); \ - if(error != hipSuccess) return error; \ + auto _error = hipPeekAtLastError(); \ + if(_error != hipSuccess) return _error; \ if(debug_synchronous) \ { \ std::cout << name << "(" << size << ")"; \ - auto error = hipStreamSynchronize(stream); \ - if(error != hipSuccess) return error; \ - auto end = std::chrono::high_resolution_clock::now(); \ - auto d = std::chrono::duration_cast>(end - start); \ - std::cout << " " << d.count() * 1000 << " ms" << '\n'; \ + auto __error = hipStreamSynchronize(stream); \ + if(__error != hipSuccess) return __error; \ + auto _end = std::chrono::high_resolution_clock::now(); \ + auto _d = std::chrono::duration_cast>(_end - start); \ + std::cout << " " << _d.count() * 1000 << " ms" << '\n'; \ } \ } diff --git a/rocprim/include/rocprim/device/device_reduce_by_key.hpp b/rocprim/include/rocprim/device/device_reduce_by_key.hpp index 1c0f732c0..9b0fb2e80 100644 --- a/rocprim/include/rocprim/device/device_reduce_by_key.hpp +++ b/rocprim/include/rocprim/device/device_reduce_by_key.hpp @@ -137,16 +137,16 @@ void scan_and_scatter_carry_outs_kernel(const carry_out * carry_outs, #define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \ { \ - auto error = hipPeekAtLastError(); \ - if(error != hipSuccess) return error; \ + auto _error = hipPeekAtLastError(); \ + if(_error != hipSuccess) return _error; \ if(debug_synchronous) \ { \ std::cout << name << "(" << size << ")"; \ - auto error = hipStreamSynchronize(stream); \ - if(error != hipSuccess) return error; \ - auto end = std::chrono::high_resolution_clock::now(); \ - auto d = std::chrono::duration_cast>(end - start); \ - std::cout << " " << d.count() * 1000 << " ms" << '\n'; \ + auto __error = hipStreamSynchronize(stream); \ + if(__error != hipSuccess) return __error; \ + auto _end = std::chrono::high_resolution_clock::now(); \ + auto _d = std::chrono::duration_cast>(_end - start); \ + std::cout << " " << _d.count() * 1000 << " ms" << '\n'; \ } \ } diff --git a/rocprim/include/rocprim/device/device_scan.hpp b/rocprim/include/rocprim/device/device_scan.hpp index d2abcc852..363ec436e 100644 --- a/rocprim/include/rocprim/device/device_scan.hpp +++ b/rocprim/include/rocprim/device/device_scan.hpp @@ -160,16 +160,16 @@ void init_lookback_scan_state_kernel(LookBackScanState lookback_scan_state, #define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \ { \ - auto error = hipPeekAtLastError(); \ - if(error != hipSuccess) return error; \ + auto _error = hipPeekAtLastError(); \ + if(_error != hipSuccess) return _error; \ if(debug_synchronous) \ { \ std::cout << name << "(" << size << ")"; \ - auto error = hipStreamSynchronize(stream); \ - if(error != hipSuccess) return error; \ - auto end = std::chrono::high_resolution_clock::now(); \ - auto d = std::chrono::duration_cast>(end - start); \ - std::cout << " " << d.count() * 1000 << " ms" << '\n'; \ + auto __error = hipStreamSynchronize(stream); \ + if(__error != hipSuccess) return __error; \ + auto _end = std::chrono::high_resolution_clock::now(); \ + auto _d = std::chrono::duration_cast>(_end - start); \ + std::cout << " " << _d.count() * 1000 << " ms" << '\n'; \ } \ } diff --git a/rocprim/include/rocprim/device/device_scan_config.hpp b/rocprim/include/rocprim/device/device_scan_config.hpp index 737399dd1..4d7603935 100644 --- 
a/rocprim/include/rocprim/device/device_scan_config.hpp +++ b/rocprim/include/rocprim/device/device_scan_config.hpp @@ -62,11 +62,11 @@ struct scan_config /// \brief Whether to use lookback scan or reduce-then-scan algorithm. static constexpr bool use_lookback = UseLookback; /// \brief Method for loading input values. - static constexpr block_load_method block_load_method = BlockLoadMethod; + static constexpr ::rocprim::block_load_method block_load_method = BlockLoadMethod; /// \brief Method for storing values. - static constexpr block_store_method block_store_method = BlockStoreMethod; + static constexpr ::rocprim::block_store_method block_store_method = BlockStoreMethod; /// \brief Algorithm for block scan. - static constexpr block_scan_algorithm block_scan_method = BlockScanMethod; + static constexpr ::rocprim::block_scan_algorithm block_scan_method = BlockScanMethod; }; namespace detail @@ -130,7 +130,7 @@ struct scan_config_1030 using type = scan_config< limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_32>::value, - ::rocprim::max(1u, 16u / item_scale), + ::rocprim::max(1u, 15u / item_scale), ROCPRIM_DETAIL_USE_LOOKBACK_SCAN, ::rocprim::block_load_method::block_load_transpose, ::rocprim::block_store_method::block_store_transpose, diff --git a/rocprim/include/rocprim/device/device_segmented_radix_sort.hpp b/rocprim/include/rocprim/device/device_segmented_radix_sort.hpp index e5855b538..2a93832d0 100644 --- a/rocprim/include/rocprim/device/device_segmented_radix_sort.hpp +++ b/rocprim/include/rocprim/device/device_segmented_radix_sort.hpp @@ -83,16 +83,16 @@ void segmented_sort_kernel(KeysInputIterator keys_input, #define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \ { \ - auto error = hipPeekAtLastError(); \ - if(error != hipSuccess) return error; \ + auto _error = hipPeekAtLastError(); \ + if(_error != hipSuccess) return _error; \ if(debug_synchronous) \ { \ std::cout << name << "(" << size << ")"; \ - auto error = hipStreamSynchronize(stream); \ - if(error != hipSuccess) return error; \ - auto end = std::chrono::high_resolution_clock::now(); \ - auto d = std::chrono::duration_cast>(end - start); \ - std::cout << " " << d.count() * 1000 << " ms" << '\n'; \ + auto __error = hipStreamSynchronize(stream); \ + if(__error != hipSuccess) return __error; \ + auto _end = std::chrono::high_resolution_clock::now(); \ + auto _d = std::chrono::duration_cast>(_end - start); \ + std::cout << " " << _d.count() * 1000 << " ms" << '\n'; \ } \ } diff --git a/rocprim/include/rocprim/device/device_segmented_reduce.hpp b/rocprim/include/rocprim/device/device_segmented_reduce.hpp index 29261ea2f..55955381d 100644 --- a/rocprim/include/rocprim/device/device_segmented_reduce.hpp +++ b/rocprim/include/rocprim/device/device_segmented_reduce.hpp @@ -67,16 +67,16 @@ void segmented_reduce_kernel(InputIterator input, #define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \ { \ - auto error = hipPeekAtLastError(); \ - if(error != hipSuccess) return error; \ + auto _error = hipPeekAtLastError(); \ + if(_error != hipSuccess) return _error; \ if(debug_synchronous) \ { \ std::cout << name << "(" << size << ")"; \ - auto error = hipStreamSynchronize(stream); \ - if(error != hipSuccess) return error; \ - auto end = std::chrono::high_resolution_clock::now(); \ - auto d = std::chrono::duration_cast>(end - start); \ - std::cout << " " << d.count() * 1000 << " ms" << '\n'; \ + auto __error = hipStreamSynchronize(stream); \ + if(__error != hipSuccess) return __error; \ + auto _end = 
std::chrono::high_resolution_clock::now(); \ + auto _d = std::chrono::duration_cast>(_end - start); \ + std::cout << " " << _d.count() * 1000 << " ms" << '\n'; \ } \ } diff --git a/rocprim/include/rocprim/device/device_segmented_scan.hpp b/rocprim/include/rocprim/device/device_segmented_scan.hpp index d1f1765ae..bcc175a25 100644 --- a/rocprim/include/rocprim/device/device_segmented_scan.hpp +++ b/rocprim/include/rocprim/device/device_segmented_scan.hpp @@ -73,16 +73,16 @@ void segmented_scan_kernel(InputIterator input, #define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \ { \ - auto error = hipPeekAtLastError(); \ - if(error != hipSuccess) return error; \ + auto _error = hipPeekAtLastError(); \ + if(_error != hipSuccess) return _error; \ if(debug_synchronous) \ { \ std::cout << name << "(" << size << ")"; \ - auto error = hipStreamSynchronize(stream); \ - if(error != hipSuccess) return error; \ - auto end = std::chrono::high_resolution_clock::now(); \ - auto d = std::chrono::duration_cast>(end - start); \ - std::cout << " " << d.count() * 1000 << " ms" << '\n'; \ + auto __error = hipStreamSynchronize(stream); \ + if(__error != hipSuccess) return __error; \ + auto _end = std::chrono::high_resolution_clock::now(); \ + auto _d = std::chrono::duration_cast>(_end - start); \ + std::cout << " " << _d.count() * 1000 << " ms" << '\n'; \ } \ } diff --git a/rocprim/include/rocprim/device/device_transform.hpp b/rocprim/include/rocprim/device/device_transform.hpp index 2c381032f..994ae5c06 100644 --- a/rocprim/include/rocprim/device/device_transform.hpp +++ b/rocprim/include/rocprim/device/device_transform.hpp @@ -63,16 +63,16 @@ void transform_kernel(InputIterator input, #define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \ { \ - auto error = hipPeekAtLastError(); \ - if(error != hipSuccess) return error; \ + auto _error = hipPeekAtLastError(); \ + if(_error != hipSuccess) return _error; \ if(debug_synchronous) \ { \ std::cout << name << "(" << size << ")"; \ - auto error = hipStreamSynchronize(stream); \ - if(error != hipSuccess) return error; \ - auto end = std::chrono::high_resolution_clock::now(); \ - auto d = std::chrono::duration_cast>(end - start); \ - std::cout << " " << d.count() * 1000 << " ms" << '\n'; \ + _error = hipStreamSynchronize(stream); \ + if(_error != hipSuccess) return _error; \ + auto _end = std::chrono::high_resolution_clock::now(); \ + auto _d = std::chrono::duration_cast>(_end - start); \ + std::cout << " " << _d.count() * 1000 << " ms" << '\n'; \ } \ } @@ -158,9 +158,9 @@ hipError_t transform(InputIterator input, detail::default_transform_config >; - constexpr unsigned int block_size = config::block_size; - constexpr unsigned int items_per_thread = config::items_per_thread; - constexpr auto items_per_block = block_size * items_per_thread; + static constexpr unsigned int block_size = config::block_size; + static constexpr unsigned int items_per_thread = config::items_per_thread; + static constexpr auto items_per_block = block_size * items_per_thread; // Start point for time measurements std::chrono::high_resolution_clock::time_point start; diff --git a/rocprim/include/rocprim/functional.hpp b/rocprim/include/rocprim/functional.hpp index fcd41a998..5d7fde783 100644 --- a/rocprim/include/rocprim/functional.hpp +++ b/rocprim/include/rocprim/functional.hpp @@ -33,9 +33,9 @@ BEGIN_ROCPRIM_NAMESPACE #define ROCPRIM_PRINT_ERROR_ONCE(message) \ { \ - unsigned int idx = hipThreadIdx_x + (hipBlockIdx_x * hipBlockDim_x); \ - idx += hipThreadIdx_y + 
(hipBlockIdx_y * hipBlockDim_y); \ - idx += hipThreadIdx_z + (hipBlockIdx_z * hipBlockDim_z); \ + unsigned int idx = threadIdx.x + (blockIdx.x * blockDim.x); \ + idx += threadIdx.y + (blockIdx.y * blockDim.y); \ + idx += threadIdx.z + (blockIdx.z * blockDim.z); \ if (idx == 0) \ printf("%s\n", #message); \ } diff --git a/rocprim/include/rocprim/intrinsics/thread.hpp b/rocprim/include/rocprim/intrinsics/thread.hpp index 2ff0e9ef7..155e6eb98 100644 --- a/rocprim/include/rocprim/intrinsics/thread.hpp +++ b/rocprim/include/rocprim/intrinsics/thread.hpp @@ -76,7 +76,7 @@ constexpr unsigned int device_warp_size() ROCPRIM_DEVICE inline unsigned int flat_block_size() { - return hipBlockDim_z * hipBlockDim_y * hipBlockDim_x; + return blockDim.z * blockDim.y * blockDim.x; } /// \brief Returns flat size of a multidimensional tile (block). @@ -92,16 +92,21 @@ unsigned int flat_tile_size() ROCPRIM_DEVICE inline unsigned int lane_id() { +#ifndef __HIP_CPU_RT__ return ::__lane_id(); +#else + using namespace hip::detail; + return id(Fiber::this_fiber()) % warpSize; +#endif } /// \brief Returns flat (linear, 1D) thread identifier in a multidimensional block (tile). ROCPRIM_DEVICE inline unsigned int flat_block_thread_id() { - return (hipThreadIdx_z * hipBlockDim_y * hipBlockDim_x) - + (hipThreadIdx_y * hipBlockDim_x) - + hipThreadIdx_x; + return (threadIdx.z * blockDim.y * blockDim.x) + + (threadIdx.y * blockDim.x) + + threadIdx.x; } /// \brief Returns flat (linear, 1D) thread identifier in a multidimensional block (tile). Use template parameters to optimize 1D or 2D kernels. @@ -110,7 +115,7 @@ ROCPRIM_DEVICE inline auto flat_block_thread_id() -> typename std::enable_if<(BlockSizeY == 1 && BlockSizeZ == 1), unsigned int>::type { - return hipThreadIdx_x; + return threadIdx.x; } template @@ -118,7 +123,7 @@ ROCPRIM_DEVICE inline auto flat_block_thread_id() -> typename std::enable_if<(BlockSizeY > 1 && BlockSizeZ == 1), unsigned int>::type { - return hipThreadIdx_x + (hipThreadIdx_y * hipBlockDim_x); + return threadIdx.x + (threadIdx.y * blockDim.x); } template @@ -126,8 +131,8 @@ ROCPRIM_DEVICE inline auto flat_block_thread_id() -> typename std::enable_if<(BlockSizeY > 1 && BlockSizeZ > 1), unsigned int>::type { - return hipThreadIdx_x + (hipThreadIdx_y * hipBlockDim_x) + - (hipThreadIdx_z * hipBlockDim_y * hipBlockDim_x); + return threadIdx.x + (threadIdx.y * blockDim.x) + + (threadIdx.z * blockDim.y * blockDim.x); } /// \brief Returns flat (linear, 1D) thread identifier in a multidimensional tile (block). 
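The hipThreadIdx_*/hipBlockDim_*/hipGridDim_* spellings replaced in this file are hipcc-specific, whereas the standard threadIdx/blockIdx/blockDim/gridDim objects are also provided by the HIP-CPU runtime, which is what these replacements rely on; the computed values are identical. A minimal sketch of the same flat-index arithmetic with the new spellings; the kernel name and output buffer are illustrative only:

#include <hip/hip_runtime.h>

// Computes the same flat block/thread indices as rocprim::flat_block_id() and
// rocprim::flat_block_thread_id(), using the portable threadIdx/blockIdx spellings.
__global__ void flat_id_example(unsigned int* out)
{
    const unsigned int block  = blockIdx.x + (blockIdx.y * gridDim.x)
                              + (blockIdx.z * gridDim.y * gridDim.x);
    const unsigned int thread = threadIdx.x + (threadIdx.y * blockDim.x)
                              + (threadIdx.z * blockDim.y * blockDim.x);
    const unsigned int block_size = blockDim.x * blockDim.y * blockDim.z;
    out[block * block_size + thread] = thread;
}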
@@ -162,9 +167,9 @@ unsigned int warp_id() ROCPRIM_DEVICE inline unsigned int flat_block_id() { - return (hipBlockIdx_z * hipGridDim_y * hipGridDim_x) - + (hipBlockIdx_y * hipGridDim_x) - + hipBlockIdx_x; + return (blockIdx.z * gridDim.y * gridDim.x) + + (blockIdx.y * gridDim.x) + + blockIdx.x; } template @@ -172,7 +177,7 @@ ROCPRIM_DEVICE inline auto flat_block_id() -> typename std::enable_if<(BlockSizeY == 1 && BlockSizeZ == 1), unsigned int>::type { - return hipBlockIdx_x; + return blockIdx.x; } template @@ -180,7 +185,7 @@ ROCPRIM_DEVICE inline auto flat_block_id() -> typename std::enable_if<(BlockSizeY > 1 && BlockSizeZ == 1), unsigned int>::type { - return hipBlockIdx_x + (hipBlockIdx_y * hipGridDim_x); + return blockIdx.x + (blockIdx.y * gridDim.x); } template @@ -188,8 +193,8 @@ ROCPRIM_DEVICE inline auto flat_block_id() -> typename std::enable_if<(BlockSizeY > 1 && BlockSizeZ > 1), unsigned int>::type { - return hipBlockIdx_x + (hipBlockIdx_y * hipGridDim_x) + - (hipBlockIdx_z * hipGridDim_y * hipGridDim_x); + return blockIdx.x + (blockIdx.y * gridDim.x) + + (blockIdx.z * gridDim.y * gridDim.x); } // Sync @@ -243,7 +248,7 @@ namespace detail return 0; } - #define ROCPRIM_DETAIL_CONCAT(A, B) A ## B + #define ROCPRIM_DETAIL_CONCAT(A, B) A B #define ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNC(name, prefix, dim, suffix) \ template<> \ ROCPRIM_DEVICE inline \ @@ -256,10 +261,10 @@ namespace detail ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNC(name, prefix, 1, y) \ ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNC(name, prefix, 2, z) - ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNCS(block_thread_id, hipThreadIdx_) - ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNCS(block_id, hipBlockIdx_) - ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNCS(block_size, hipBlockDim_) - ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNCS(grid_size, hipGridDim_) + ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNCS(block_thread_id, threadIdx.) + ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNCS(block_id, blockIdx.) + ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNCS(block_size, blockDim.) + ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNCS(grid_size, gridDim.) 
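The switch of ROCPRIM_DETAIL_CONCAT from "A ## B" to "A B" goes hand in hand with the prefix change above: pasting the trailing '.' of "threadIdx." onto a dimension suffix does not form a valid preprocessing token, while plain juxtaposition simply leaves an ordinary member access behind. A small illustration; the two macro names are illustrative variants of the single ROCPRIM_DETAIL_CONCAT in the hunk:

#define ROCPRIM_DETAIL_CONCAT_PASTE(A, B) A ## B
#define ROCPRIM_DETAIL_CONCAT_JUXTAPOSE(A, B) A B

// ROCPRIM_DETAIL_CONCAT_PASTE(hipThreadIdx_, x)  -> hipThreadIdx_x   (single identifier, valid)
// ROCPRIM_DETAIL_CONCAT_PASTE(threadIdx., x)     -> ill-formed: '.' ## 'x' is not a preprocessing token
// ROCPRIM_DETAIL_CONCAT_JUXTAPOSE(threadIdx., x) -> threadIdx . x    (ordinary member access, valid)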
#undef ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNCS #undef ROCPRIM_DETAIL_DEFINE_HIP_API_ID_FUNC diff --git a/rocprim/include/rocprim/intrinsics/warp.hpp b/rocprim/include/rocprim/intrinsics/warp.hpp index 4814bb227..67872821b 100644 --- a/rocprim/include/rocprim/intrinsics/warp.hpp +++ b/rocprim/include/rocprim/intrinsics/warp.hpp @@ -47,20 +47,27 @@ ROCPRIM_DEVICE inline unsigned int masked_bit_count(lane_mask_type x, unsigned int add = 0) { int c; - #if __AMDGCN_WAVEFRONT_SIZE == 32 - #ifdef __HIP__ - c = ::__builtin_amdgcn_mbcnt_lo(x, add); + #ifndef __HIP_CPU_RT__ + #if __AMDGCN_WAVEFRONT_SIZE == 32 + #ifdef __HIP__ + c = ::__builtin_amdgcn_mbcnt_lo(x, add); + #else + c = ::__mbcnt_lo(x, add); + #endif #else - c = ::__mbcnt_lo(x, add); + #ifdef __HIP__ + c = ::__builtin_amdgcn_mbcnt_lo(static_cast(x), add); + c = ::__builtin_amdgcn_mbcnt_hi(static_cast(x >> 32), c); + #else + c = ::__mbcnt_lo(static_cast(x), add); + c = ::__mbcnt_hi(static_cast(x >> 32), c); + #endif #endif #else - #ifdef __HIP__ - c = ::__builtin_amdgcn_mbcnt_lo(static_cast(x), add); - c = ::__builtin_amdgcn_mbcnt_hi(static_cast(x >> 32), c); - #else - c = ::__mbcnt_lo(static_cast(x), add); - c = ::__mbcnt_hi(static_cast(x >> 32), c); - #endif + using namespace hip::detail; + const auto tidx{id(Fiber::this_fiber()) % warpSize}; + std::bitset bits{x >> (warpSize - tidx)}; + c = static_cast(bits.count()) + add; #endif return c; } @@ -71,13 +78,37 @@ namespace detail ROCPRIM_DEVICE inline int warp_any(int predicate) { +#ifndef __HIP_CPU_RT__ return ::__any(predicate); +#else + using namespace hip::detail; + const auto tidx{id(Fiber::this_fiber()) % warpSize}; + auto& lds{Tile::scratchpad, 1>()[0]}; + + lds[tidx] = static_cast(predicate); + + barrier(Tile::this_tile()); + + return lds.any(); +#endif } ROCPRIM_DEVICE inline int warp_all(int predicate) { +#ifndef __HIP_CPU_RT__ return ::__all(predicate); +#else + using namespace hip::detail; + const auto tidx{id(Fiber::this_fiber()) % warpSize}; + auto& lds{Tile::scratchpad, 1>()[0]}; + + lds[tidx] = static_cast(predicate); + + barrier(Tile::this_tile()); + + return lds.all(); +#endif } } // end detail namespace @@ -96,7 +127,7 @@ unsigned int MatchAny(unsigned int label) unsigned int retval; // Extract masks of common threads for each bit - #pragma unroll + ROCPRIM_UNROLL for (int BIT = 0; BIT < LABEL_BITS; ++BIT) { unsigned long long mask; diff --git a/rocprim/include/rocprim/intrinsics/warp_shuffle.hpp b/rocprim/include/rocprim/intrinsics/warp_shuffle.hpp index 5c8734297..1caac2900 100644 --- a/rocprim/include/rocprim/intrinsics/warp_shuffle.hpp +++ b/rocprim/include/rocprim/intrinsics/warp_shuffle.hpp @@ -34,6 +34,26 @@ BEGIN_ROCPRIM_NAMESPACE namespace detail { +#ifdef __HIP_CPU_RT__ +// Taken from the notes of https://en.cppreference.com/w/cpp/numeric/bit_cast +// +// TODO: consider adding macro checks relaying to std::bit_cast when compiled +// using C++20. 
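Regarding the TODO just above about relaying to std::bit_cast: under C++20 the library feature-test macro __cpp_lib_bit_cast can gate such a forwarder, keeping the memcpy emulation that follows as the pre-C++20 fallback. A sketch of one possible shape, not part of the patch:

#if __has_include(<version>)
    #include <version>              // makes library feature-test macros visible
#endif
#if defined(__cpp_lib_bit_cast)
    #include <bit>
    template<class To, class From>
    constexpr To bit_cast(const From& src) noexcept
    {
        return std::bit_cast<To>(src);  // compiler-supported, usable in constant expressions
    }
#else
    // ... fall back to the std::memcpy-based emulation defined below ...
#endif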
+template +typename std::enable_if_t< + sizeof(To) == sizeof(From) && + std::is_trivially_copyable_v && + std::is_trivially_copyable_v, + To> +// constexpr support needs compiler magic +bit_cast(const From& src) noexcept +{ + To dst; + std::memcpy(&dst, &src, sizeof(To)); + return dst; +} +#endif + template ROCPRIM_DEVICE inline typename std::enable_if::value && (sizeof(T) % sizeof(int) == 0), T>::type @@ -42,15 +62,23 @@ warp_shuffle_op(const T& input, ShuffleOp&& op) constexpr int words_no = (sizeof(T) + sizeof(int) - 1) / sizeof(int); struct V { int words[words_no]; }; +#ifdef __HIP_CPU_RT__ + V a = bit_cast(input); +#else V a = __builtin_bit_cast(V, input); +#endif - #pragma unroll + ROCPRIM_UNROLL for(int i = 0; i < words_no; i++) { a.words[i] = op(a.words[i]); } +#ifdef __HIP_CPU_RT__ + return bit_cast(a); +#else return __builtin_bit_cast(T, a); +#endif } template @@ -61,17 +89,26 @@ warp_shuffle_op(const T& input, ShuffleOp&& op) constexpr int words_no = (sizeof(T) + sizeof(int) - 1) / sizeof(int); T output; - #pragma unroll + ROCPRIM_UNROLL for(int i = 0; i < words_no; i++) { const size_t s = std::min(sizeof(int), sizeof(T) - i * sizeof(int)); int word; +#ifdef __HIP_CPU_RT__ + std::memcpy(&word, reinterpret_cast(&input) + i * sizeof(int), s); +#else __builtin_memcpy(&word, reinterpret_cast(&input) + i * sizeof(int), s); +#endif word = op(word); +#ifdef __HIP_CPU_RT__ + std::memcpy(reinterpret_cast(&output) + i * sizeof(int), &word, s); +#else __builtin_memcpy(reinterpret_cast(&output) + i * sizeof(int), &word, s); +#endif } return output; + } template @@ -82,7 +119,17 @@ T warp_move_dpp(const T& input) input, [=](int v) -> int { + // TODO: clean-up, this function activates based ROCPRIM_DETAIL_USE_DPP, however inclusion and + // parsing of the template happens unconditionally. The condition causing compilation to + // fail is ordinary host-compilers looking at the headers. Non-hipcc compilers don't define + // __builtin_amdgcn_update_dpp, hence fail to parse the template altogether. (Except MSVC + // because even using /permissive- they somehow still do delayed parsing of the body of + // function templates, even though they pinky-swear they don't.) 
+#if !defined(__HIP_CPU_RT__) return ::__builtin_amdgcn_update_dpp(0, v, dpp_ctrl, row_mask, bank_mask, bound_ctrl); +#else + return v; +#endif } ); } diff --git a/rocprim/include/rocprim/iterator.hpp b/rocprim/include/rocprim/iterator.hpp index 224f25ea0..41e359ae2 100644 --- a/rocprim/include/rocprim/iterator.hpp +++ b/rocprim/include/rocprim/iterator.hpp @@ -28,7 +28,9 @@ #include "iterator/constant_iterator.hpp" #include "iterator/counting_iterator.hpp" #include "iterator/discard_iterator.hpp" +#ifndef __HIP_CPU_RT__ #include "iterator/texture_cache_iterator.hpp" +#endif #include "iterator/transform_iterator.hpp" #include "iterator/zip_iterator.hpp" diff --git a/rocprim/include/rocprim/iterator/texture_cache_iterator.hpp b/rocprim/include/rocprim/iterator/texture_cache_iterator.hpp index 8e80b7adf..d01612dc7 100644 --- a/rocprim/include/rocprim/iterator/texture_cache_iterator.hpp +++ b/rocprim/include/rocprim/iterator/texture_cache_iterator.hpp @@ -208,7 +208,7 @@ class texture_cache_iterator #else texture_type words[multiple]; - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = 0; i < multiple; i++) { tex1Dfetch( diff --git a/rocprim/include/rocprim/thread/thread_load.hpp b/rocprim/include/rocprim/thread/thread_load.hpp index 46476ea9f..35994f999 100644 --- a/rocprim/include/rocprim/thread/thread_load.hpp +++ b/rocprim/include/rocprim/thread/thread_load.hpp @@ -87,6 +87,8 @@ ROCPRIM_DEVICE __forceinline__ T AsmThreadLoad(void * ptr) ROCPRIM_ASM_THREAD_LOAD(cache_modifier, llvm_cache_modifier, uint64_t, uint64_t, flat_load_dwordx2, v, wait_cmd); \ ROCPRIM_ASM_THREAD_LOAD(cache_modifier, llvm_cache_modifier, double, uint64_t, flat_load_dwordx2, v, wait_cmd); +// [HIP-CPU] MSVC: erroneous inline assembly specification (Triggers error C2059: syntax error: 'volatile') +#ifndef __HIP_CPU_RT__ ROCPRIM_ASM_THREAD_LOAD_GROUP(load_ca, "glc", ""); ROCPRIM_ASM_THREAD_LOAD_GROUP(load_cg, "glc slc", ""); ROCPRIM_ASM_THREAD_LOAD_GROUP(load_cv, "glc", "vmcnt"); @@ -95,6 +97,7 @@ ROCPRIM_ASM_THREAD_LOAD_GROUP(load_volatile, "glc", "vmcnt"); // TODO find correct modifiers to match these ROCPRIM_ASM_THREAD_LOAD_GROUP(load_ldg, "", ""); ROCPRIM_ASM_THREAD_LOAD_GROUP(load_cs, "", ""); +#endif // __HIP_CPU_RT__ #endif @@ -129,7 +132,13 @@ template < ROCPRIM_DEVICE inline T thread_load(T* ptr) { +#ifndef __HIP_CPU_RT__ return detail::AsmThreadLoad(ptr); +#else + T retval; + std::memcpy(&retval, ptr, sizeof(T)); + return retval; +#endif } END_ROCPRIM_NAMESPACE diff --git a/rocprim/include/rocprim/thread/thread_reduce.hpp b/rocprim/include/rocprim/thread/thread_reduce.hpp index 783722f9b..3ce9fdda0 100644 --- a/rocprim/include/rocprim/thread/thread_reduce.hpp +++ b/rocprim/include/rocprim/thread/thread_reduce.hpp @@ -60,7 +60,7 @@ ROCPRIM_DEVICE inline T thread_reduce( else retval = prefix; - #pragma unroll + ROCPRIM_UNROLL for (int i = 0 + NoPrefix; i < LENGTH; ++i) retval = reduction_op(retval, input[i]); diff --git a/rocprim/include/rocprim/thread/thread_scan.hpp b/rocprim/include/rocprim/thread/thread_scan.hpp index 98b0ea98e..8b52f9302 100644 --- a/rocprim/include/rocprim/thread/thread_scan.hpp +++ b/rocprim/include/rocprim/thread/thread_scan.hpp @@ -74,7 +74,7 @@ struct Int2Type ScanOp scan_op, ///< [in] Binary scan operator Int2Type /*length*/) { - #pragma unroll + ROCPRIM_UNROLL for (int i = 0; i < LENGTH; ++i) { inclusive = scan_op(exclusive, input[i]); @@ -166,7 +166,7 @@ struct Int2Type ScanOp scan_op, ///< [in] Binary scan operator Int2Type /*length*/) { - #pragma unroll + ROCPRIM_UNROLL for
(int i = 0; i < LENGTH; ++i) { inclusive = scan_op(inclusive, input[i]); diff --git a/rocprim/include/rocprim/thread/thread_store.hpp b/rocprim/include/rocprim/thread/thread_store.hpp index bce00e963..236ed825a 100644 --- a/rocprim/include/rocprim/thread/thread_store.hpp +++ b/rocprim/include/rocprim/thread/thread_store.hpp @@ -88,6 +88,8 @@ ROCPRIM_DEVICE __forceinline__ void AsmThreadStore(void * ptr, T val) ROCPRIM_ASM_THREAD_STORE(cache_modifier, llvm_cache_modifier, uint64_t, uint64_t, flat_store_dwordx2, v, wait_cmd); \ ROCPRIM_ASM_THREAD_STORE(cache_modifier, llvm_cache_modifier, double, uint64_t, flat_store_dwordx2, v, wait_cmd); +// [HIP-CPU] MSVC: erroneous inline assembly specification (Triggers error C2059: syntax error: 'volatile') +#ifndef __HIP_CPU_RT__ ROCPRIM_ASM_THREAD_STORE_GROUP(store_wb, "glc", ""); ROCPRIM_ASM_THREAD_STORE_GROUP(store_cg, "glc slc", ""); ROCPRIM_ASM_THREAD_STORE_GROUP(store_wt, "glc", "vmcnt"); @@ -95,6 +97,7 @@ ROCPRIM_ASM_THREAD_STORE_GROUP(store_volatile, "glc", "vmcnt"); // TODO find correct modifiers to match these ROCPRIM_ASM_THREAD_STORE_GROUP(store_cs, "", ""); +#endif // __HIP_CPU_RT__ #endif @@ -131,7 +134,11 @@ ROCPRIM_DEVICE inline void thread_store( T *ptr, T val) { +#ifndef __HIP_CPU_RT__ detail::AsmThreadStore(ptr, val); +#else + std::memcpy(ptr, &val, sizeof(T)); +#endif } END_ROCPRIM_NAMESPACE diff --git a/rocprim/include/rocprim/types.hpp b/rocprim/include/rocprim/types.hpp index c849a9089..59ff680d2 100644 --- a/rocprim/include/rocprim/types.hpp +++ b/rocprim/include/rocprim/types.hpp @@ -38,35 +38,62 @@ BEGIN_ROCPRIM_NAMESPACE namespace detail { - // Define vector types that will be used by rocPRIM internally. // We don't use HIP vector types because they don't generate correct // load/store operations, see https://github.com/RadeonOpenCompute/ROCm/issues/341 +#ifndef _MSC_VER #define DEFINE_VECTOR_TYPE(name, base) \ \ -struct name##2 \ +struct alignas(sizeof(base) * 2) name##2 \ { \ typedef base vector_value_type __attribute__((ext_vector_type(2))); \ union { \ vector_value_type data; \ struct { base x, y; }; \ }; \ -} __attribute__((aligned(sizeof(base) * 2))); \ +}; \ \ -struct name##4 \ +struct alignas(sizeof(base) * 4) name##4 \ { \ typedef base vector_value_type __attribute__((ext_vector_type(4))); \ union { \ vector_value_type data; \ struct { base x, y, w, z; }; \ }; \ -} __attribute__((aligned(sizeof(base) * 4))); +}; +#else +#define DEFINE_VECTOR_TYPE(name, base) \ +\ +struct alignas(sizeof(base) * 2) name##2 \ +{ \ + typedef base vector_value_type; \ + union { \ + vector_value_type data; \ + struct { base x, y; }; \ + }; \ +}; \ +\ +struct alignas(sizeof(base) * 4) name##4 \ +{ \ + typedef base vector_value_type; \ + union { \ + vector_value_type data; \ + struct { base x, y, w, z; }; \ + }; \ +}; +#endif +#ifdef _MSC_VER +#pragma warning( push ) +#pragma warning( disable : 4201 ) // nonstandard extension used: nameless struct/union +#endif DEFINE_VECTOR_TYPE(char, char); DEFINE_VECTOR_TYPE(short, short); DEFINE_VECTOR_TYPE(int, int); DEFINE_VECTOR_TYPE(longlong, long long); - +#ifdef _MSC_VER +#pragma warning( pop ) +#endif // Takes a scalar type T and matches to a vector type based on NumElements. template struct make_vector_type @@ -104,21 +131,36 @@ DEFINE_MAKE_VECTOR_TYPE(longlong, long long); /// \brief Empty type used as a placeholder, usually used to flag that given /// template parameter should not be used.
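The hunk below tightens empty_type to a plain empty struct and introduces empty_binary_op, a do-nothing functor the HIP-CPU code paths can substitute where a binary operator is syntactically required but never invoked with meaningful data. A minimal usage sketch, assuming rocprim/types.hpp has been included; the function name is illustrative only:

#include <type_traits>

// rocprim::empty_type carries no state; rocprim::empty_binary_op combines two of
// them into yet another empty_type, so it can stand in for an unused reduction op.
void empty_binary_op_sketch()
{
    rocprim::empty_type a{}, b{};
    rocprim::empty_binary_op combine;
    rocprim::empty_type c = combine(a, b);
    (void)c;
    static_assert(std::is_empty<rocprim::empty_type>::value, "placeholder type has no members");
}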
-struct empty_type -{ +struct empty_type {}; +/// \brief Binary operator that takes two instances of empty_type, usually used +/// as nop replacement for the HIP-CPU back-end +struct empty_binary_op +{ + constexpr empty_type operator()(const empty_type&, const empty_type&) const { return empty_type{}; } }; /// \brief Half-precision floating point type using half = ::__half; // The lane_mask_type only exist at device side +#ifndef __AMDGCN_WAVEFRONT_SIZE +// When not compiling with hipcc, we're compiling with HIP-CPU +// TODO: introduce a ROCPRIM-specific macro to query this +#define __AMDGCN_WAVEFRONT_SIZE 64 +#endif #if __AMDGCN_WAVEFRONT_SIZE == 32 using lane_mask_type = unsigned int; #elif __AMDGCN_WAVEFRONT_SIZE == 64 using lane_mask_type = unsigned long long int; #endif +#ifdef __HIP_CPU_RT__ +using native_half = half; +#else +using native_half = _Float16; +#endif + END_ROCPRIM_NAMESPACE /// @} diff --git a/rocprim/include/rocprim/warp/detail/warp_reduce_shared_mem.hpp b/rocprim/include/rocprim/warp/detail/warp_reduce_shared_mem.hpp index d82247179..43bcf036e 100644 --- a/rocprim/include/rocprim/warp/detail/warp_reduce_shared_mem.hpp +++ b/rocprim/include/rocprim/warp/detail/warp_reduce_shared_mem.hpp @@ -60,7 +60,7 @@ class warp_reduce_shared_mem output = input; store_volatile(&storage_.values[lid], output); - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = ceiling >> 1; i > 0; i >>= 1) { if (lid + i < WarpSize && lid < i) @@ -85,7 +85,7 @@ class warp_reduce_shared_mem output = input; store_volatile(&storage_.values[lid], output); - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = ceiling >> 1; i > 0; i >>= 1) { if((lid + i) < WarpSize && lid < i && (lid + i) < valid_items) @@ -128,7 +128,7 @@ class warp_reduce_shared_mem auto last = last_in_warp_segment(flag); output = input; - #pragma unroll + ROCPRIM_UNROLL for(unsigned int i = 1; i < ceiling; i *= 2) { store_volatile(&storage_.values[lid], output); diff --git a/rocprim/include/rocprim/warp/detail/warp_reduce_shuffle.hpp b/rocprim/include/rocprim/warp/detail/warp_reduce_shuffle.hpp index e010c414f..6fcb42392 100644 --- a/rocprim/include/rocprim/warp/detail/warp_reduce_shuffle.hpp +++ b/rocprim/include/rocprim/warp/detail/warp_reduce_shuffle.hpp @@ -54,7 +54,7 @@ class warp_reduce_shuffle output = input; T value; - #pragma unroll + ROCPRIM_UNROLL for(unsigned int offset = 1; offset < WarpSize; offset *= 2) { value = warp_shuffle_down(output, offset, WarpSize); @@ -78,7 +78,7 @@ class warp_reduce_shuffle output = input; T value; - #pragma unroll + ROCPRIM_UNROLL for(unsigned int offset = 1; offset < WarpSize; offset *= 2) { value = warp_shuffle_down(output, offset, WarpSize); diff --git a/rocprim/include/rocprim/warp/detail/warp_scan_shuffle.hpp b/rocprim/include/rocprim/warp/detail/warp_scan_shuffle.hpp index d52e0893b..dc1a9b929 100644 --- a/rocprim/include/rocprim/warp/detail/warp_scan_shuffle.hpp +++ b/rocprim/include/rocprim/warp/detail/warp_scan_shuffle.hpp @@ -53,7 +53,7 @@ class warp_scan_shuffle T value; const unsigned int id = detail::logical_lane_id(); - #pragma unroll + ROCPRIM_UNROLL for(unsigned int offset = 1; offset < WarpSize; offset *= 2) { value = warp_shuffle_up(output, offset, WarpSize); diff --git a/rocprim/include/rocprim/warp/detail/warp_segment_bounds.hpp b/rocprim/include/rocprim/warp/detail/warp_segment_bounds.hpp index 5c07c2fe3..ada1a820b 100644 --- a/rocprim/include/rocprim/warp/detail/warp_segment_bounds.hpp +++ b/rocprim/include/rocprim/warp/detail/warp_segment_bounds.hpp @@ -53,21 +53,24 @@ 
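In the warp_segment_bounds.hpp hunk that follows, the device __ffs/__ffsll call (one-based index of the lowest set bit, hence the minus one) gains host fallbacks for HIP-CPU, and the MSVC branch is explicitly marked "TODO: verify correctness". For reference, a count-trailing-zeros formulation that matches __ffsll(x) - 1 for the non-zero masks this function sees (the caller always sets the last-lane bit first); names and structure are illustrative, not a drop-in patch:

#include <cstdint>
#ifdef _MSC_VER
#include <intrin.h>
#endif

// Index of the least significant set bit; x must be non-zero.
inline unsigned int lowest_set_bit_index(std::uint64_t x)
{
#ifdef _MSC_VER
    unsigned long index = 0;
    _BitScanForward64(&index, x);          // scans from the least significant bit
    return static_cast<unsigned int>(index);
#else
    return static_cast<unsigned int>(__builtin_ctzll(x));  // GCC/Clang count-trailing-zeros
#endif
}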
auto last_in_warp_segment(Flag flag) // Make sure last item in logical warp is marked as a tail warp_flags |= lane_mask_type(1) << (WarpSize - 1U); // Calculate logical lane id of the last valid value in the segment +#ifndef __HIP_CPU_RT__ #if __AMDGCN_WAVEFRONT_SIZE == 32 return ::__ffs(warp_flags) - 1; #else return ::__ffsll(warp_flags) - 1; #endif -} - -// Returns logical warp id of the last thread in thread's segment -template -ROCPRIM_DEVICE inline -auto last_in_warp_segment(Flag) - -> typename std::enable_if<(WarpSize > __AMDGCN_WAVEFRONT_SIZE), unsigned int>::type -{ - ROCPRIM_PRINT_ERROR_ONCE("Specified warp size exceeds current hardware supported warp size . Aborting warp sort."); - return 0; +#else +#ifdef _MSC_VER + // _BitScanForward64 yields the zero-based index of the least significant set bit, matching __ffsll(warp_flags) - 1 on the device path + unsigned long tmp = 0; + _BitScanForward64(&tmp, warp_flags); + return tmp; +#elif defined(__GNUC__) + return __builtin_ctzll(warp_flags); +#else + static_assert(false, "No bit-scan intrinsic is available for this host compiler"); +#endif +#endif } } // end namespace detail diff --git a/rocprim/include/rocprim/warp/warp_reduce.hpp b/rocprim/include/rocprim/warp/warp_reduce.hpp index 1f33f6ae5..3bfd05485 100644 --- a/rocprim/include/rocprim/warp/warp_reduce.hpp +++ b/rocprim/include/rocprim/warp/warp_reduce.hpp @@ -94,7 +94,7 @@ struct select_warp_reduce_impl /// // allocate storage in shared memory /// __shared__ warp_reduce_int::storage_type temp[4]; /// -/// int logical_warp_id = hipThreadIdx_x/16; +/// int logical_warp_id = threadIdx.x/16; /// int value = ...; /// // execute reduce /// warp_reduce_int().reduce( @@ -163,7 +163,7 @@ class warp_reduce /// // allocate storage in shared memory /// __shared__ warp_reduce_int::storage_type temp[4]; /// - /// int logical_warp_id = hipThreadIdx_x/16; + /// int logical_warp_id = threadIdx.x/16; /// int value = ...; /// // execute reduction /// warp_reduce_int().reduce( @@ -234,7 +234,7 @@ class warp_reduce /// // allocate storage in shared memory /// __shared__ warp_reduce_int::storage_type temp[4]; /// - /// int logical_warp_id = hipThreadIdx_x/16; + /// int logical_warp_id = threadIdx.x/16; /// int value = ...; /// int valid_items = 4; /// // execute reduction diff --git a/rocprim/include/rocprim/warp/warp_scan.hpp b/rocprim/include/rocprim/warp/warp_scan.hpp index 22fb28eb2..684b324c9 100644 --- a/rocprim/include/rocprim/warp/warp_scan.hpp +++ b/rocprim/include/rocprim/warp/warp_scan.hpp @@ -92,7 +92,7 @@ struct select_warp_scan_impl /// // allocate storage in shared memory /// __shared__ warp_scan_int::storage_type temp[4]; /// -/// int logical_warp_id = hipThreadIdx_x/16; +/// int logical_warp_id = threadIdx.x/16; /// int value = ...; /// // execute inclusive scan /// warp_scan_int().inclusive_scan( @@ -153,14 +153,14 @@ class warp_scan /// Hardware warp size is 64. Block (tile) size is 256. /// /// \code{.cpp} - /// __global__ void example_kernel(...) // hipBlockDim_x = 256 + /// __global__ void example_kernel(...) // blockDim.x = 256 /// { /// // specialize warp_scan for float and logical warp of 32 threads /// using warp_scan_float = rocprim::warp_scan<float, 32>; /// // allocate storage in shared memory /// __shared__ warp_scan_float::storage_type temp[8]; // 256/32 = 8 /// - /// int logical_warp_id = hipThreadIdx_x/32; + /// int logical_warp_id = threadIdx.x/32; /// float value = ...; /// // execute inclusive min scan /// warp_scan_float().inclusive_scan( @@ -227,14 +227,14 @@ class warp_scan /// each thread provides one \p int value. Hardware warp size is 64. Block (tile) size is 256.
/// /// \code{.cpp} - /// __global__ void example_kernel(...) // hipBlockDim_x = 256 + /// __global__ void example_kernel(...) // blockDim.x = 256 /// { /// // specialize warp_scan for int and logical warp of 64 threads /// using warp_scan_int = rocprim::warp_scan; /// // allocate storage in shared memory /// __shared__ warp_scan_int::storage_type temp[4]; // 256/64 = 4 /// - /// int logical_warp_id = hipThreadIdx_x/64; + /// int logical_warp_id = threadIdx.x/64; /// int input = ...; /// int output, reduction; /// // inclusive prefix sum @@ -306,14 +306,14 @@ class warp_scan /// Hardware warp size is 64. Block (tile) size is 256. /// /// \code{.cpp} - /// __global__ void example_kernel(...) // hipBlockDim_x = 256 + /// __global__ void example_kernel(...) // blockDim.x = 256 /// { /// // specialize warp_scan for float and logical warp of 32 threads /// using warp_scan_f = rocprim::warp_scan; /// // allocate storage in shared memory /// __shared__ warp_scan_float::storage_type temp[8]; // 256/32 = 8 /// - /// int logical_warp_id = hipThreadIdx_x/32; + /// int logical_warp_id = threadIdx.x/32; /// float value = ...; /// // execute exclusive min scan /// warp_scan_float().exclusive_scan( @@ -387,14 +387,14 @@ class warp_scan /// each thread provides one \p int value. Hardware warp size is 64. Block (tile) size is 256. /// /// \code{.cpp} - /// __global__ void example_kernel(...) // hipBlockDim_x = 256 + /// __global__ void example_kernel(...) // blockDim.x = 256 /// { /// // specialize warp_scan for int and logical warp of 64 threads /// using warp_scan_int = rocprim::warp_scan; /// // allocate storage in shared memory /// __shared__ warp_scan_int::storage_type temp[4]; // 256/64 = 4 /// - /// int logical_warp_id = hipThreadIdx_x/64; + /// int logical_warp_id = threadIdx.x/64; /// int input = ...; /// int output, reduction; /// // exclusive prefix sum @@ -471,14 +471,14 @@ class warp_scan /// Hardware warp size is 64. Block (tile) size is 256. /// /// \code{.cpp} - /// __global__ void example_kernel(...) // hipBlockDim_x = 256 + /// __global__ void example_kernel(...) // blockDim.x = 256 /// { /// // specialize warp_scan for float and logical warp of 32 threads /// using warp_scan_f = rocprim::warp_scan; /// // allocate storage in shared memory /// __shared__ warp_scan_float::storage_type temp[8]; // 256/32 = 8 /// - /// int logical_warp_id = hipThreadIdx_x/32; + /// int logical_warp_id = threadIdx.x/32; /// float input = ...; /// float ex_output, in_output; /// // execute exclusive min scan @@ -561,14 +561,14 @@ class warp_scan /// Block (tile) size is 256. /// /// \code{.cpp} - /// __global__ void example_kernel(...) // hipBlockDim_x = 256 + /// __global__ void example_kernel(...) // blockDim.x = 256 /// { /// // specialize warp_scan for int and logical warp of 64 threads /// using warp_scan_int = rocprim::warp_scan; /// // allocate storage in shared memory /// __shared__ warp_scan_int::storage_type temp[4]; // 256/64 = 4 /// - /// int logical_warp_id = hipThreadIdx_x/64; + /// int logical_warp_id = threadIdx.x/64; /// int input = ...; /// int in_output, ex_output, reduction; /// // inclusive and exclusive prefix sum diff --git a/rocprim/include/rocprim/warp/warp_sort.hpp b/rocprim/include/rocprim/warp/warp_sort.hpp index 80762f472..cc8ed32df 100644 --- a/rocprim/include/rocprim/warp/warp_sort.hpp +++ b/rocprim/include/rocprim/warp/warp_sort.hpp @@ -64,7 +64,7 @@ BEGIN_ROCPRIM_NAMESPACE /// \code{.cpp} /// __global__ void example_kernel(...) 
/// { -/// const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; +/// const unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; /// /// int value = input[i]; /// rocprim::warp_sort wsort; @@ -82,7 +82,7 @@ BEGIN_ROCPRIM_NAMESPACE /// ... /// __global__ void example_kernel(...) /// { -/// const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; +/// const unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; /// /// int value = input[i]; /// rocprim::warp_sort wsort; diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 073456d2d..497ae8a6c 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -32,23 +32,48 @@ function(add_hip_test TEST_NAME TEST_SOURCES) target_include_directories(${TEST_TARGET} SYSTEM BEFORE PUBLIC - ${GTEST_INCLUDE_DIRS} ${COMMON_TEST_HEADER_DIRECTORY} ) - target_link_libraries(${TEST_TARGET} PRIVATE hip::device) - - target_link_libraries(${TEST_TARGET} - PRIVATE - ${GTEST_BOTH_LIBRARIES} - ) - - foreach(amdgpu_target ${AMDGPU_TARGETS}) + if(TARGET GTest::GTest) + target_link_libraries(${TEST_TARGET} + PRIVATE + GTest::GTest + GTest::Main + ) + else() + target_link_libraries(${TEST_TARGET} + PRIVATE + GTest::gtest + GTest::gtest_main + ) + endif() + if(NOT USE_HIP_CPU) + target_link_libraries(${TEST_TARGET} + PRIVATE + rocprim_hip + ) + else() target_link_libraries(${TEST_TARGET} PRIVATE - --amdgpu-target=${amdgpu_target} + rocprim + Threads::Threads + hip_cpu_rt::hip_cpu_rt ) - endforeach() + if(STL_DEPENDS_ON_TBB) + target_link_libraries(${TEST_TARGET} + PRIVATE + TBB::tbb + ) + endif() + endif() + + target_compile_options(${TEST_TARGET} + PRIVATE + $<$<CXX_COMPILER_ID:MSVC>: + /bigobj # number of sections exceeded object file format limit: compile with /bigobj + > + ) set_target_properties(${TEST_TARGET} PROPERTIES diff --git a/test/common_test_header.hpp b/test/common_test_header.hpp index 3c1505c78..6f71730bc 100755 --- a/test/common_test_header.hpp +++ b/test/common_test_header.hpp @@ -39,7 +39,9 @@ // HIP API #include #include +#ifndef __HIP_CPU_RT__ #include +#endif #ifndef HIP_CHECK #define HIP_CHECK(condition) \ @@ -61,16 +63,30 @@ namespace test_common_utils int obtain_device_from_ctest() { +#ifdef _MSC_VER +#pragma warning( push ) +#pragma warning( disable : 4996 ) // This function or variable may be unsafe. Consider using _dupenv_s instead. +#endif static const std::string rg0 = "CTEST_RESOURCE_GROUP_0"; if (std::getenv(rg0.c_str()) != nullptr) { std::string amdgpu_target = std::getenv(rg0.c_str()); - std::transform(amdgpu_target.cbegin(), amdgpu_target.cend(), amdgpu_target.begin(), ::toupper); + std::transform( + amdgpu_target.cbegin(), + amdgpu_target.cend(), + amdgpu_target.begin(), + // Passing std::toupper directly causes implicit int-to-char truncating conversions, which trigger warnings.
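+            // Taking the argument as unsigned char and casting the result back to char is the usual fix: it avoids the truncation warning and the undefined behaviour std::toupper has for char values that are not representable as unsigned char.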
+ // See: https://en.cppreference.com/mwiki/index.php?title=cpp/string/byte/toupper&oldid=94327 + [](unsigned char c){ return static_cast<char>(std::toupper(c)); } + ); std::string reqs = std::getenv((rg0 + "_" + amdgpu_target).c_str()); return std::atoi(reqs.substr(reqs.find(':') + 1, reqs.find(',') - (reqs.find(':') + 1)).c_str()); } else return 0; +#ifdef _MSC_VER +#pragma warning( pop ) +#endif } bool use_hmm() @@ -93,11 +109,11 @@ hipError_t hipMallocHelper(T** devPtr, size_t size) { if (use_hmm()) { - return hipMallocManaged((void**)devPtr, size); + return hipMallocManaged(reinterpret_cast<void**>(devPtr), size); } else { - return hipMalloc((void**)devPtr, size); + return hipMalloc(reinterpret_cast<void**>(devPtr), size); } return hipSuccess; } diff --git a/test/extra/test_rocprim_package.cpp b/test/extra/test_rocprim_package.cpp index 911c2f658..afa9f4ae5 100644 --- a/test/extra/test_rocprim_package.cpp +++ b/test/extra/test_rocprim_package.cpp @@ -47,8 +47,8 @@ int main(int, char**) // device input/output T * d_input; T * d_output; - HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T))); - HIP_CHECK(hipMalloc(&d_output, sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&d_input), input.size() * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&d_output), sizeof(T))); HIP_CHECK( hipMemcpy( d_input, input.data(), diff --git a/test/rocprim/CMakeLists.txt b/test/rocprim/CMakeLists.txt index c9993d851..22f1f94a3 100644 --- a/test/rocprim/CMakeLists.txt +++ b/test/rocprim/CMakeLists.txt @@ -23,23 +23,54 @@ function(add_rocprim_test TEST_NAME TEST_SOURCES) list(GET TEST_SOURCES 0 TEST_MAIN_SOURCE) get_filename_component(TEST_TARGET ${TEST_MAIN_SOURCE} NAME_WE) + add_executable(${TEST_TARGET} ${TEST_SOURCES}) + target_include_directories(${TEST_TARGET} SYSTEM BEFORE PUBLIC - ${GTEST_INCLUDE_DIRS} ${COMMON_TEST_HEADER_DIRECTORY} ) - target_link_libraries(${TEST_TARGET} - PRIVATE - rocprim_hip - ${GTEST_BOTH_LIBRARIES} - ) - foreach(amdgpu_target ${AMDGPU_TARGETS}) + + if(TARGET GTest::GTest) + target_link_libraries(${TEST_TARGET} + PRIVATE + GTest::GTest + GTest::Main + ) + else() target_link_libraries(${TEST_TARGET} PRIVATE - --amdgpu-target=${amdgpu_target} + GTest::gtest + GTest::gtest_main ) - endforeach() + endif() + if(NOT USE_HIP_CPU) + target_link_libraries(${TEST_TARGET} + PRIVATE + rocprim_hip + ) + else() + target_link_libraries(${TEST_TARGET} + PRIVATE + rocprim + Threads::Threads + hip_cpu_rt::hip_cpu_rt + ) + if(STL_DEPENDS_ON_TBB) + target_link_libraries(${TEST_TARGET} + PRIVATE + TBB::tbb + ) + endif() + endif() + + target_compile_options(${TEST_TARGET} + PRIVATE + $<$<CXX_COMPILER_ID:MSVC>: + /bigobj # number of sections exceeded object file format limit: compile with /bigobj + > + ) + set_target_properties(${TEST_TARGET} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/test/rocprim" @@ -98,7 +129,9 @@ add_rocprim_test("rocprim.device_segmented_scan" test_device_segmented_scan.cpp) add_rocprim_test("rocprim.device_select" test_device_select.cpp) add_rocprim_test("rocprim.device_transform" test_device_transform.cpp) add_rocprim_test("rocprim.discard_iterator" test_discard_iterator.cpp) -add_rocprim_test("rocprim.texture_cache_iterator" test_texture_cache_iterator.cpp) +if(NOT USE_HIP_CPU) + add_rocprim_test("rocprim.texture_cache_iterator" test_texture_cache_iterator.cpp) +endif() add_rocprim_test("rocprim.thread" test_thread.cpp) add_rocprim_test("rocprim.thread_algos" test_thread_algos.cpp) add_rocprim_test("rocprim.transform_iterator" test_transform_iterator.cpp) diff --git
a/test/rocprim/bounds_checking_iterator.hpp b/test/rocprim/bounds_checking_iterator.hpp index 8fb4240d8..9433517e8 100644 --- a/test/rocprim/bounds_checking_iterator.hpp +++ b/test/rocprim/bounds_checking_iterator.hpp @@ -158,7 +158,7 @@ class out_of_bounds_flag public: out_of_bounds_flag() { - hipMalloc(&device_pointer_, sizeof(bool)); + hipMalloc(reinterpret_cast(&device_pointer_), sizeof(bool)); hipMemset(device_pointer_, 0, sizeof(bool)); } diff --git a/test/rocprim/detail/get_rocprim_version.cpp b/test/rocprim/detail/get_rocprim_version.cpp index 1d2078c05..bd3a11c27 100644 --- a/test/rocprim/detail/get_rocprim_version.cpp +++ b/test/rocprim/detail/get_rocprim_version.cpp @@ -34,7 +34,7 @@ unsigned int get_rocprim_version_on_device() unsigned int version = 0; unsigned int * d_version; - HIP_CHECK(hipMalloc(&d_version, sizeof(unsigned int))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_version), sizeof(unsigned int))); HIP_CHECK(hipDeviceSynchronize()); hipLaunchKernelGGL( diff --git a/test/rocprim/test_block_adjacent_difference.cpp b/test/rocprim/test_block_adjacent_difference.cpp index e004442e5..2f0564d58 100644 --- a/test/rocprim/test_block_adjacent_difference.cpp +++ b/test/rocprim/test_block_adjacent_difference.cpp @@ -111,9 +111,9 @@ __global__ __launch_bounds__(BlockSize, ROCPRIM_DEFAULT_MIN_WARPS_PER_EU) void flag_heads_kernel(Type* device_input, long long* device_heads) { - const unsigned int lid = hipThreadIdx_x; + const unsigned int lid = threadIdx.x; const unsigned int items_per_block = BlockSize * ItemsPerThread; - const unsigned int block_offset = hipBlockIdx_x * items_per_block; + const unsigned int block_offset = blockIdx.x * items_per_block; Type input[ItemsPerThread]; rocprim::block_load_direct_blocked(lid, device_input + block_offset, input); @@ -121,7 +121,7 @@ void flag_heads_kernel(Type* device_input, long long* device_heads) rocprim::block_adjacent_difference bdiscontinuity; FlagType head_flags[ItemsPerThread]; - if(hipBlockIdx_x % 2 == 1) + if(blockIdx.x % 2 == 1) { const Type tile_predecessor_item = device_input[block_offset - 1]; bdiscontinuity.flag_heads(head_flags, tile_predecessor_item, input, FlagOpType()); @@ -145,9 +145,9 @@ __global__ __launch_bounds__(BlockSize, ROCPRIM_DEFAULT_MIN_WARPS_PER_EU) void flag_tails_kernel(Type* device_input, long long* device_tails) { - const unsigned int lid = hipThreadIdx_x; + const unsigned int lid = threadIdx.x; const unsigned int items_per_block = BlockSize * ItemsPerThread; - const unsigned int block_offset = hipBlockIdx_x * items_per_block; + const unsigned int block_offset = blockIdx.x * items_per_block; Type input[ItemsPerThread]; rocprim::block_load_direct_blocked(lid, device_input + block_offset, input); @@ -155,7 +155,7 @@ void flag_tails_kernel(Type* device_input, long long* device_tails) rocprim::block_adjacent_difference bdiscontinuity; FlagType tail_flags[ItemsPerThread]; - if(hipBlockIdx_x % 2 == 0) + if(blockIdx.x % 2 == 0) { const Type tile_successor_item = device_input[block_offset + items_per_block]; bdiscontinuity.flag_tails(tail_flags, tile_successor_item, input, FlagOpType()); @@ -179,9 +179,9 @@ __global__ __launch_bounds__(BlockSize, ROCPRIM_DEFAULT_MIN_WARPS_PER_EU) void flag_heads_and_tails_kernel(Type* device_input, long long* device_heads, long long* device_tails) { - const unsigned int lid = hipThreadIdx_x; + const unsigned int lid = threadIdx.x; const unsigned int items_per_block = BlockSize * ItemsPerThread; - const unsigned int block_offset = hipBlockIdx_x * items_per_block; + const 
unsigned int block_offset = blockIdx.x * items_per_block; Type input[ItemsPerThread]; rocprim::block_load_direct_blocked(lid, device_input + block_offset, input); @@ -190,23 +190,23 @@ void flag_heads_and_tails_kernel(Type* device_input, long long* device_heads, lo FlagType head_flags[ItemsPerThread]; FlagType tail_flags[ItemsPerThread]; - if(hipBlockIdx_x % 4 == 0) + if(blockIdx.x % 4 == 0) { const Type tile_successor_item = device_input[block_offset + items_per_block]; bdiscontinuity.flag_heads_and_tails(head_flags, tail_flags, tile_successor_item, input, FlagOpType()); } - else if(hipBlockIdx_x % 4 == 1) + else if(blockIdx.x % 4 == 1) { const Type tile_predecessor_item = device_input[block_offset - 1]; const Type tile_successor_item = device_input[block_offset + items_per_block]; bdiscontinuity.flag_heads_and_tails(head_flags, tile_predecessor_item, tail_flags, tile_successor_item, input, FlagOpType()); } - else if(hipBlockIdx_x % 4 == 2) + else if(blockIdx.x % 4 == 2) { const Type tile_predecessor_item = device_input[block_offset - 1]; bdiscontinuity.flag_heads_and_tails(head_flags, tile_predecessor_item, tail_flags, input, FlagOpType()); } - else if(hipBlockIdx_x % 4 == 3) + else if(blockIdx.x % 4 == 3) { bdiscontinuity.flag_heads_and_tails(head_flags, tail_flags, input, FlagOpType()); } @@ -236,11 +236,11 @@ auto test_block_adjacent_difference() >::type; using flag_type = FlagType; using flag_op_type = FlagOpType; - constexpr size_t block_size = BlockSize; - constexpr size_t items_per_thread = ItemsPerThread; - constexpr size_t items_per_block = block_size * items_per_thread; - const size_t size = items_per_block * 20; - constexpr size_t grid_size = size / items_per_block; + static constexpr size_t block_size = BlockSize; + static constexpr size_t items_per_thread = ItemsPerThread; + static constexpr size_t items_per_block = block_size * items_per_thread; + static constexpr size_t size = items_per_block * 20; + static constexpr size_t grid_size = size / items_per_block; // Given block size not supported if(block_size > test_utils::get_max_block_size()) @@ -280,9 +280,9 @@ auto test_block_adjacent_difference() // Preparing Device type* device_input; - HIP_CHECK(hipMalloc(&device_input, input.size() * sizeof(typename decltype(input)::value_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&device_input), input.size() * sizeof(typename decltype(input)::value_type))); long long* device_heads; - HIP_CHECK(hipMalloc(&device_heads, heads.size() * sizeof(typename decltype(heads)::value_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&device_heads), heads.size() * sizeof(typename decltype(heads)::value_type))); HIP_CHECK( hipMemcpy( @@ -348,11 +348,11 @@ auto test_block_adjacent_difference() >::type; using flag_type = FlagType; using flag_op_type = FlagOpType; - constexpr size_t block_size = BlockSize; - constexpr size_t items_per_thread = ItemsPerThread; - constexpr size_t items_per_block = block_size * items_per_thread; - const size_t size = items_per_block * 20; - constexpr size_t grid_size = size / items_per_block; + static constexpr size_t block_size = BlockSize; + static constexpr size_t items_per_thread = ItemsPerThread; + static constexpr size_t items_per_block = block_size * items_per_thread; + static constexpr size_t size = items_per_block * 20; + static constexpr size_t grid_size = size / items_per_block; // Given block size not supported if(block_size > test_utils::get_max_block_size()) @@ -392,9 +392,9 @@ auto test_block_adjacent_difference() // Preparing Device type* device_input; 
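+ // hipMalloc's portable signature takes void**, so the explicit reinterpret_cast below keeps the call valid for back-ends (such as HIP-CPU) that may not provide a templated T** convenience overload.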
- HIP_CHECK(hipMalloc(&device_input, input.size() * sizeof(typename decltype(input)::value_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&device_input), input.size() * sizeof(typename decltype(input)::value_type))); long long* device_tails; - HIP_CHECK(hipMalloc(&device_tails, tails.size() * sizeof(typename decltype(tails)::value_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&device_tails), tails.size() * sizeof(typename decltype(tails)::value_type))); HIP_CHECK( hipMemcpy( @@ -460,11 +460,11 @@ auto test_block_adjacent_difference() >::type; using flag_type = FlagType; using flag_op_type = FlagOpType; - constexpr size_t block_size = BlockSize; - constexpr size_t items_per_thread = ItemsPerThread; - constexpr size_t items_per_block = block_size * items_per_thread; - const size_t size = items_per_block * 20; - constexpr size_t grid_size = size / items_per_block; + static constexpr size_t block_size = BlockSize; + static constexpr size_t items_per_thread = ItemsPerThread; + static constexpr size_t items_per_block = block_size * items_per_thread; + static constexpr size_t size = items_per_block * 20; + static constexpr size_t grid_size = size / items_per_block; // Given block size not supported if(block_size > test_utils::get_max_block_size()) @@ -516,11 +516,11 @@ auto test_block_adjacent_difference() // Preparing Device type* device_input; - HIP_CHECK(hipMalloc(&device_input, input.size() * sizeof(typename decltype(input)::value_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&device_input), input.size() * sizeof(typename decltype(input)::value_type))); long long* device_heads; - HIP_CHECK(hipMalloc(&device_heads, tails.size() * sizeof(typename decltype(heads)::value_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&device_heads), tails.size() * sizeof(typename decltype(heads)::value_type))); long long* device_tails; - HIP_CHECK(hipMalloc(&device_tails, tails.size() * sizeof(typename decltype(tails)::value_type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&device_tails), tails.size() * sizeof(typename decltype(tails)::value_type))); HIP_CHECK( hipMemcpy( diff --git a/test/rocprim/test_block_discontinuity.cpp b/test/rocprim/test_block_discontinuity.cpp index 825d1dde6..ef9ab915d 100644 --- a/test/rocprim/test_block_discontinuity.cpp +++ b/test/rocprim/test_block_discontinuity.cpp @@ -111,9 +111,9 @@ __global__ __launch_bounds__(BlockSize) void flag_heads_kernel(Type* device_input, long long* device_heads) { - const unsigned int lid = hipThreadIdx_x; + const unsigned int lid = threadIdx.x; const unsigned int items_per_block = BlockSize * ItemsPerThread; - const unsigned int block_offset = hipBlockIdx_x * items_per_block; + const unsigned int block_offset = blockIdx.x * items_per_block; Type input[ItemsPerThread]; rocprim::block_load_direct_blocked(lid, device_input + block_offset, input); @@ -121,7 +121,7 @@ void flag_heads_kernel(Type* device_input, long long* device_heads) rocprim::block_discontinuity bdiscontinuity; FlagType head_flags[ItemsPerThread]; - if(hipBlockIdx_x % 2 == 1) + if(blockIdx.x % 2 == 1) { const Type tile_predecessor_item = device_input[block_offset - 1]; bdiscontinuity.flag_heads(head_flags, tile_predecessor_item, input, FlagOpType()); @@ -145,9 +145,9 @@ __global__ __launch_bounds__(BlockSize) void flag_tails_kernel(Type* device_input, long long* device_tails) { - const unsigned int lid = hipThreadIdx_x; + const unsigned int lid = threadIdx.x; const unsigned int items_per_block = BlockSize * ItemsPerThread; - const unsigned int block_offset = hipBlockIdx_x * 
items_per_block; + const unsigned int block_offset = blockIdx.x * items_per_block; Type input[ItemsPerThread]; rocprim::block_load_direct_blocked(lid, device_input + block_offset, input); @@ -155,7 +155,7 @@ void flag_tails_kernel(Type* device_input, long long* device_tails) rocprim::block_discontinuity bdiscontinuity; FlagType tail_flags[ItemsPerThread]; - if(hipBlockIdx_x % 2 == 0) + if(blockIdx.x % 2 == 0) { const Type tile_successor_item = device_input[block_offset + items_per_block]; bdiscontinuity.flag_tails(tail_flags, tile_successor_item, input, FlagOpType()); @@ -179,9 +179,9 @@ __global__ __launch_bounds__(BlockSize) void flag_heads_and_tails_kernel(Type* device_input, long long* device_heads, long long* device_tails) { - const unsigned int lid = hipThreadIdx_x; + const unsigned int lid = threadIdx.x; const unsigned int items_per_block = BlockSize * ItemsPerThread; - const unsigned int block_offset = hipBlockIdx_x * items_per_block; + const unsigned int block_offset = blockIdx.x * items_per_block; Type input[ItemsPerThread]; rocprim::block_load_direct_blocked(lid, device_input + block_offset, input); @@ -190,23 +190,23 @@ void flag_heads_and_tails_kernel(Type* device_input, long long* device_heads, lo FlagType head_flags[ItemsPerThread]; FlagType tail_flags[ItemsPerThread]; - if(hipBlockIdx_x % 4 == 0) + if(blockIdx.x % 4 == 0) { const Type tile_successor_item = device_input[block_offset + items_per_block]; bdiscontinuity.flag_heads_and_tails(head_flags, tail_flags, tile_successor_item, input, FlagOpType()); } - else if(hipBlockIdx_x % 4 == 1) + else if(blockIdx.x % 4 == 1) { const Type tile_predecessor_item = device_input[block_offset - 1]; const Type tile_successor_item = device_input[block_offset + items_per_block]; bdiscontinuity.flag_heads_and_tails(head_flags, tile_predecessor_item, tail_flags, tile_successor_item, input, FlagOpType()); } - else if(hipBlockIdx_x % 4 == 2) + else if(blockIdx.x % 4 == 2) { const Type tile_predecessor_item = device_input[block_offset - 1]; bdiscontinuity.flag_heads_and_tails(head_flags, tile_predecessor_item, tail_flags, input, FlagOpType()); } - else if(hipBlockIdx_x % 4 == 3) + else if(blockIdx.x % 4 == 3) { bdiscontinuity.flag_heads_and_tails(head_flags, tail_flags, input, FlagOpType()); } @@ -236,11 +236,11 @@ auto test_block_discontinuity() >::type; using flag_type = FlagType; using flag_op_type = FlagOpType; - constexpr size_t block_size = BlockSize; - constexpr size_t items_per_thread = ItemsPerThread; - constexpr size_t items_per_block = block_size * items_per_thread; + static constexpr size_t block_size = BlockSize; + static constexpr size_t items_per_thread = ItemsPerThread; + static constexpr size_t items_per_block = block_size * items_per_thread; const size_t size = items_per_block * 20; - constexpr size_t grid_size = size / items_per_block; + static constexpr size_t grid_size = size / items_per_block; // Given block size not supported if(block_size > test_utils::get_max_block_size()) @@ -348,11 +348,11 @@ auto test_block_discontinuity() >::type; using flag_type = FlagType; using flag_op_type = FlagOpType; - constexpr size_t block_size = BlockSize; - constexpr size_t items_per_thread = ItemsPerThread; - constexpr size_t items_per_block = block_size * items_per_thread; + static constexpr size_t block_size = BlockSize; + static constexpr size_t items_per_thread = ItemsPerThread; + static constexpr size_t items_per_block = block_size * items_per_thread; const size_t size = items_per_block * 20; - constexpr size_t grid_size = size / 
items_per_block; + static constexpr size_t grid_size = size / items_per_block; // Given block size not supported if(block_size > test_utils::get_max_block_size()) @@ -460,11 +460,11 @@ auto test_block_discontinuity() >::type; using flag_type = FlagType; using flag_op_type = FlagOpType; - constexpr size_t block_size = BlockSize; - constexpr size_t items_per_thread = ItemsPerThread; - constexpr size_t items_per_block = block_size * items_per_thread; + static constexpr size_t block_size = BlockSize; + static constexpr size_t items_per_thread = ItemsPerThread; + static constexpr size_t items_per_block = block_size * items_per_thread; const size_t size = items_per_block * 20; - constexpr size_t grid_size = size / items_per_block; + static constexpr size_t grid_size = size / items_per_block; // Given block size not supported if(block_size > test_utils::get_max_block_size()) diff --git a/test/rocprim/test_block_exchange.cpp b/test/rocprim/test_block_exchange.cpp index b8f1cbec2..867dc59d1 100644 --- a/test/rocprim/test_block_exchange.cpp +++ b/test/rocprim/test_block_exchange.cpp @@ -49,8 +49,8 @@ __launch_bounds__(ROCPRIM_DEFAULT_MAX_BLOCK_SIZE) void blocked_to_striped_kernel(Type* device_input, OutputType* device_output) { constexpr unsigned int block_size = (ItemsPerBlock / ItemsPerThread); - const unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = hipBlockIdx_x * ItemsPerBlock; + const unsigned int lid = threadIdx.x; + const unsigned int block_offset = blockIdx.x * ItemsPerBlock; Type input[ItemsPerThread]; OutputType output[ItemsPerThread]; @@ -73,8 +73,8 @@ __launch_bounds__(ROCPRIM_DEFAULT_MAX_BLOCK_SIZE) void striped_to_blocked_kernel(Type* device_input, OutputType* device_output) { constexpr unsigned int block_size = (ItemsPerBlock / ItemsPerThread); - const unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = hipBlockIdx_x * ItemsPerBlock; + const unsigned int lid = threadIdx.x; + const unsigned int block_offset = blockIdx.x * ItemsPerBlock; Type input[ItemsPerThread]; OutputType output[ItemsPerThread]; @@ -97,8 +97,8 @@ __launch_bounds__(ROCPRIM_DEFAULT_MAX_BLOCK_SIZE) void blocked_to_warp_striped_kernel(Type* device_input, OutputType* device_output) { constexpr unsigned int block_size = (ItemsPerBlock / ItemsPerThread); - const unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = hipBlockIdx_x * ItemsPerBlock; + const unsigned int lid = threadIdx.x; + const unsigned int block_offset = blockIdx.x * ItemsPerBlock; Type input[ItemsPerThread]; OutputType output[ItemsPerThread]; @@ -121,8 +121,8 @@ __launch_bounds__(ROCPRIM_DEFAULT_MAX_BLOCK_SIZE) void warp_striped_to_blocked_kernel(Type* device_input, OutputType* device_output) { constexpr unsigned int block_size = (ItemsPerBlock / ItemsPerThread); - const unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = hipBlockIdx_x * ItemsPerBlock; + const unsigned int lid = threadIdx.x; + const unsigned int block_offset = blockIdx.x * ItemsPerBlock; Type input[ItemsPerThread]; OutputType output[ItemsPerThread]; @@ -145,8 +145,8 @@ __launch_bounds__(ROCPRIM_DEFAULT_MAX_BLOCK_SIZE) void scatter_to_blocked_kernel(Type* device_input, OutputType* device_output, unsigned int* device_ranks) { constexpr unsigned int block_size = (ItemsPerBlock / ItemsPerThread); - const unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = hipBlockIdx_x * ItemsPerBlock; + const unsigned int lid = threadIdx.x; + const unsigned int block_offset = blockIdx.x * ItemsPerBlock; Type 
input[ItemsPerThread]; OutputType output[ItemsPerThread]; @@ -171,8 +171,8 @@ __launch_bounds__(ROCPRIM_DEFAULT_MAX_BLOCK_SIZE) void scatter_to_striped_kernel(Type* device_input, OutputType* device_output, unsigned int* device_ranks) { constexpr unsigned int block_size = (ItemsPerBlock / ItemsPerThread); - const unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = hipBlockIdx_x * ItemsPerBlock; + const unsigned int lid = threadIdx.x; + const unsigned int block_offset = blockIdx.x * ItemsPerBlock; Type input[ItemsPerThread]; OutputType output[ItemsPerThread]; @@ -199,9 +199,9 @@ auto test_block_exchange() { using type = T; using output_type = U; - constexpr size_t block_size = BlockSize; - constexpr size_t items_per_thread = ItemsPerThread; - constexpr size_t items_per_block = block_size * items_per_thread; + static constexpr size_t block_size = BlockSize; + static constexpr size_t items_per_thread = ItemsPerThread; + static constexpr size_t items_per_block = block_size * items_per_thread; // Given block size not supported if(block_size > test_utils::get_max_block_size()) { @@ -212,7 +212,7 @@ auto test_block_exchange() // Generate data std::vector input(size); std::vector expected(size); - std::vector output(size, 0); + std::vector output(size, (output_type)0); // Calculate input and expected results on host std::vector values(size); @@ -227,7 +227,7 @@ auto test_block_exchange() const size_t i0 = offset + ti * items_per_thread + ii; const size_t i1 = offset + ii * block_size + ti; input[i1] = values[i1]; - expected[i0] = values[i1]; + expected[i0] = static_cast(values[i1]); } } } @@ -283,9 +283,9 @@ auto test_block_exchange() { using type = T; using output_type = U; - constexpr size_t block_size = BlockSize; - constexpr size_t items_per_thread = ItemsPerThread; - constexpr size_t items_per_block = block_size * items_per_thread; + static constexpr size_t block_size = BlockSize; + static constexpr size_t items_per_thread = ItemsPerThread; + static constexpr size_t items_per_block = block_size * items_per_thread; // Given block size not supported if(block_size > test_utils::get_max_block_size()) { @@ -367,9 +367,9 @@ auto test_block_exchange() { using type = T; using output_type = U; - constexpr size_t block_size = BlockSize; - constexpr size_t items_per_thread = ItemsPerThread; - constexpr size_t items_per_block = block_size * items_per_thread; + static constexpr size_t block_size = BlockSize; + static constexpr size_t items_per_thread = ItemsPerThread; + static constexpr size_t items_per_block = block_size * items_per_thread; // Given block size not supported if(block_size > test_utils::get_max_block_size()) { @@ -463,9 +463,9 @@ auto test_block_exchange() { using type = T; using output_type = U; - constexpr size_t block_size = BlockSize; - constexpr size_t items_per_thread = ItemsPerThread; - constexpr size_t items_per_block = block_size * items_per_thread; + static constexpr size_t block_size = BlockSize; + static constexpr size_t items_per_thread = ItemsPerThread; + static constexpr size_t items_per_block = block_size * items_per_thread; // Given block size not supported if(block_size > test_utils::get_max_block_size()) { @@ -557,9 +557,9 @@ auto test_block_exchange() { using type = T; using output_type = U; - constexpr size_t block_size = BlockSize; - constexpr size_t items_per_thread = ItemsPerThread; - constexpr size_t items_per_block = block_size * items_per_thread; + static constexpr size_t block_size = BlockSize; + static constexpr size_t items_per_thread = 
ItemsPerThread; + static constexpr size_t items_per_block = block_size * items_per_thread; // Given block size not supported if(block_size > test_utils::get_max_block_size()) { @@ -659,9 +659,9 @@ auto test_block_exchange() { using type = T; using output_type = U; - constexpr size_t block_size = BlockSize; - constexpr size_t items_per_thread = ItemsPerThread; - constexpr size_t items_per_block = block_size * items_per_thread; + static constexpr size_t block_size = BlockSize; + static constexpr size_t items_per_thread = ItemsPerThread; + static constexpr size_t items_per_block = block_size * items_per_thread; // Given block size not supported if(block_size > test_utils::get_max_block_size()) { diff --git a/test/rocprim/test_block_histogram.cpp b/test/rocprim/test_block_histogram.cpp index 1797be605..431a83c2a 100644 --- a/test/rocprim/test_block_histogram.cpp +++ b/test/rocprim/test_block_histogram.cpp @@ -84,8 +84,8 @@ __global__ __launch_bounds__(BlockSize) void histogram_kernel(T* device_output, T* device_output_bin) { - const unsigned int index = ((hipBlockIdx_x * BlockSize) + hipThreadIdx_x) * ItemsPerThread; - unsigned int global_offset = hipBlockIdx_x * BinSize; + const unsigned int index = ((blockIdx.x * BlockSize) + threadIdx.x) * ItemsPerThread; + unsigned int global_offset = blockIdx.x * BinSize; __shared__ BinType hist[BinSize]; // load T in_out[ItemsPerThread]; @@ -97,12 +97,12 @@ void histogram_kernel(T* device_output, T* device_output_bin) rocprim::block_histogram bhist; bhist.histogram(in_out, hist); - #pragma unroll + ROCPRIM_UNROLL for (unsigned int offset = 0; offset < BinSize; offset += BlockSize) { - if(offset + hipThreadIdx_x < BinSize) + if(offset + threadIdx.x < BinSize) { - device_output_bin[global_offset + hipThreadIdx_x] = hist[offset + hipThreadIdx_x]; + device_output_bin[global_offset + threadIdx.x] = hist[offset + threadIdx.x]; global_offset += BlockSize; } } @@ -118,10 +118,10 @@ template< > void test_block_histogram_input_arrays() { - constexpr auto algorithm = Algorithm; - constexpr size_t block_size = BlockSize; - constexpr size_t items_per_thread = ItemsPerThread; - constexpr size_t bin = BlockSize; + static constexpr auto algorithm = Algorithm; + static constexpr size_t block_size = BlockSize; + static constexpr size_t items_per_thread = ItemsPerThread; + static constexpr size_t bin = BlockSize; // Given block size not supported if(block_size > test_utils::get_max_block_size()) diff --git a/test/rocprim/test_block_load_store.cpp b/test/rocprim/test_block_load_store.cpp index 6ae326a54..9b48ffa5f 100644 --- a/test/rocprim/test_block_load_store.cpp +++ b/test/rocprim/test_block_load_store.cpp @@ -248,12 +248,12 @@ __global__ __launch_bounds__(BlockSize) void load_store_kernel(Type* device_input, Type* device_output) { - Type items[ItemsPerThread]; - unsigned int offset = hipBlockIdx_x * BlockSize * ItemsPerThread; + Type _items[ItemsPerThread]; + auto offset = blockIdx.x * BlockSize * ItemsPerThread; rocprim::block_load load; rocprim::block_store store; - load.load(device_input + offset, items); - store.store(device_output + offset, items); + load.load(device_input + offset, _items); + store.store(device_output + offset, _items); } TYPED_TEST(RocprimBlockLoadStoreClassTests, LoadStoreClass) @@ -263,11 +263,11 @@ TYPED_TEST(RocprimBlockLoadStoreClassTests, LoadStoreClass) HIP_CHECK(hipSetDevice(device_id)); using Type = typename TestFixture::params::type; - constexpr size_t block_size = TestFixture::params::block_size; - constexpr rocprim::block_load_method 
load_method = TestFixture::params::load_method; - constexpr rocprim::block_store_method store_method = TestFixture::params::store_method; - const size_t items_per_thread = TestFixture::params::items_per_thread; - constexpr auto items_per_block = block_size * items_per_thread; + static constexpr size_t block_size = TestFixture::params::block_size; + static constexpr rocprim::block_load_method load_method = TestFixture::params::load_method; + static constexpr rocprim::block_store_method store_method = TestFixture::params::store_method; + static constexpr size_t items_per_thread = TestFixture::params::items_per_thread; + static constexpr auto items_per_block = block_size * items_per_thread; const size_t size = items_per_block * 113; const auto grid_size = size / items_per_block; // Given block size not supported @@ -283,10 +283,10 @@ TYPED_TEST(RocprimBlockLoadStoreClassTests, LoadStoreClass) // Generate data std::vector input = test_utils::get_random_data(size, -100, 100, seed_value); - std::vector output(input.size(), 0); + std::vector output(input.size(), (Type)0); // Calculate expected results on host - std::vector expected(input.size(), 0); + std::vector expected(input.size(), (Type)0); for (size_t i = 0; i < 113; i++) { size_t block_offset = i * items_per_block; @@ -351,12 +351,12 @@ __global__ __launch_bounds__(BlockSize) void load_store_valid_kernel(Type* device_input, Type* device_output, size_t valid) { - Type items[ItemsPerThread]; - unsigned int offset = hipBlockIdx_x * BlockSize * ItemsPerThread; + Type _items[ItemsPerThread]; + auto offset = blockIdx.x * BlockSize * ItemsPerThread; rocprim::block_load load; rocprim::block_store store; - load.load(device_input + offset, items, valid); - store.store(device_output + offset, items, valid); + load.load(device_input + offset, _items, (unsigned int)valid); + store.store(device_output + offset, _items, (unsigned int)valid); } TYPED_TEST(RocprimBlockLoadStoreClassTests, LoadStoreClassValid) @@ -366,11 +366,11 @@ TYPED_TEST(RocprimBlockLoadStoreClassTests, LoadStoreClassValid) HIP_CHECK(hipSetDevice(device_id)); using Type = typename TestFixture::params::type; - constexpr size_t block_size = TestFixture::params::block_size; - constexpr rocprim::block_load_method load_method = TestFixture::params::load_method; - constexpr rocprim::block_store_method store_method = TestFixture::params::store_method; - const size_t items_per_thread = TestFixture::params::items_per_thread; - constexpr auto items_per_block = block_size * items_per_thread; + static constexpr size_t block_size = TestFixture::params::block_size; + static constexpr rocprim::block_load_method load_method = TestFixture::params::load_method; + static constexpr rocprim::block_store_method store_method = TestFixture::params::store_method; + static constexpr size_t items_per_thread = TestFixture::params::items_per_thread; + static constexpr auto items_per_block = block_size * items_per_thread; const size_t size = items_per_block * 113; const auto grid_size = size / items_per_block; // Given block size not supported @@ -388,10 +388,10 @@ TYPED_TEST(RocprimBlockLoadStoreClassTests, LoadStoreClassValid) // Generate data std::vector input = test_utils::get_random_data(size, -100, 100, seed_value); - std::vector output(input.size(), 0); + std::vector output(input.size(), (Type)0); // Calculate expected results on host - std::vector expected(input.size(), 0); + std::vector expected(input.size(), (Type)0); for (size_t i = 0; i < 113; i++) { size_t block_offset = i * items_per_block; @@ -462,18 
+462,19 @@ template< rocprim::block_load_method LoadMethod, rocprim::block_store_method StoreMethod, unsigned int BlockSize, - unsigned int ItemsPerThread + unsigned int ItemsPerThread, + class Def > __global__ __launch_bounds__(BlockSize) -void load_store_valid_default_kernel(Type* device_input, Type* device_output, size_t valid, int _default) +void load_store_valid_default_kernel(Type* device_input, Type* device_output, size_t valid, Def _default) { - Type items[ItemsPerThread]; - unsigned int offset = hipBlockIdx_x * BlockSize * ItemsPerThread; + Type _items[ItemsPerThread]; + auto offset = blockIdx.x * BlockSize * ItemsPerThread; rocprim::block_load load; rocprim::block_store store; - load.load(device_input + offset, items, valid, _default); - store.store(device_output + offset, items); + load.load(device_input + offset, _items, (unsigned int)valid, _default); + store.store(device_output + offset, _items); } TYPED_TEST(RocprimBlockLoadStoreClassTests, LoadStoreClassDefault) @@ -483,11 +484,11 @@ TYPED_TEST(RocprimBlockLoadStoreClassTests, LoadStoreClassDefault) HIP_CHECK(hipSetDevice(device_id)); using Type = typename TestFixture::params::type; - constexpr size_t block_size = TestFixture::params::block_size; - constexpr rocprim::block_load_method load_method = TestFixture::params::load_method; - constexpr rocprim::block_store_method store_method = TestFixture::params::store_method; - const size_t items_per_thread = TestFixture::params::items_per_thread; - constexpr auto items_per_block = block_size * items_per_thread; + static constexpr size_t block_size = TestFixture::params::block_size; + static constexpr rocprim::block_load_method load_method = TestFixture::params::load_method; + static constexpr rocprim::block_store_method store_method = TestFixture::params::store_method; + static constexpr size_t items_per_thread = TestFixture::params::items_per_thread; + static constexpr auto items_per_block = block_size * items_per_thread; const size_t size = items_per_block * 113; const auto grid_size = size / items_per_block; // Given block size not supported @@ -497,7 +498,7 @@ TYPED_TEST(RocprimBlockLoadStoreClassTests, LoadStoreClassDefault) } const size_t valid = items_per_thread + 1; - int _default = -1; + Type _default = (Type)-1; for (size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) { @@ -506,7 +507,7 @@ TYPED_TEST(RocprimBlockLoadStoreClassTests, LoadStoreClassDefault) // Generate data std::vector input = test_utils::get_random_data(size, -100, 100, seed_value); - std::vector output(input.size(), 0); + std::vector output(input.size(), (Type)0); // Calculate expected results on host std::vector expected(input.size(), _default); diff --git a/test/rocprim/test_block_radix_sort.cpp b/test/rocprim/test_block_radix_sort.cpp index a79f3a52c..507d898f3 100644 --- a/test/rocprim/test_block_radix_sort.cpp +++ b/test/rocprim/test_block_radix_sort.cpp @@ -30,6 +30,7 @@ // required test headers #include "test_utils_types.hpp" +#include "test_sort_comparator.hpp" template class RocprimBlockRadixSort : public ::testing::Test { @@ -56,48 +57,6 @@ static constexpr unsigned int end_radix[n_sizes] = { TYPED_TEST_SUITE(RocprimBlockRadixSort, BlockParams); -template -struct key_comparator -{ - static_assert(rocprim::is_unsigned::value, "Test supports start and end bits only for unsigned integers"); - - bool operator()(const Key& lhs, const Key& rhs) - { - auto mask = (1ull << (EndBit - StartBit)) - 1; - auto l = (static_cast(lhs) >> StartBit) & mask; - auto r = 
(static_cast(rhs) >> StartBit) & mask; - return Descending ? (r < l) : (l < r); - } -}; - -template -struct key_comparator -{ - bool operator()(const Key& lhs, const Key& rhs) - { - return Descending ? (rhs < lhs) : (lhs < rhs); - } -}; - -template -struct key_comparator -{ - bool operator()(const rocprim::half& lhs, const rocprim::half& rhs) - { - // HIP's half doesn't have __host__ comparison operators, use floats instead - return key_comparator()(lhs, rhs); - } -}; - -template -struct key_value_comparator -{ - bool operator()(const std::pair& lhs, const std::pair& rhs) - { - return key_comparator()(lhs.first, rhs.first); - } -}; - template< unsigned int BlockSize, unsigned int ItemsPerThread, @@ -113,10 +72,15 @@ void sort_key_kernel( unsigned int end_bit) { constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; - const unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = hipBlockIdx_x * items_per_block; + const unsigned int lid = threadIdx.x; + const unsigned int block_offset = blockIdx.x * items_per_block; key_type keys[ItemsPerThread]; +#ifdef __HIP_CPU_RT__ + // TODO: check if it's really neccessary + // Initialize contents, as non-hipcc compilers don't unconditionally zero out allocated memory + std::memset(keys, 0, ItemsPerThread * sizeof(key_type)); +#endif rocprim::block_load_direct_blocked(lid, device_keys_output + block_offset, keys); rocprim::block_radix_sort bsort; @@ -158,8 +122,8 @@ void sort_key_value_kernel( unsigned int end_bit) { constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; - const unsigned int lid = hipThreadIdx_x; - const unsigned int block_offset = hipBlockIdx_x * items_per_block; + const unsigned int lid = threadIdx.x; + const unsigned int block_offset = blockIdx.x * items_per_block; key_type keys[ItemsPerThread]; value_type values[ItemsPerThread]; @@ -205,13 +169,13 @@ auto test_block_radix_sort() -> typename std::enable_if::type { using key_type = Key; - constexpr size_t block_size = BlockSize; - constexpr size_t items_per_thread = ItemsPerThread; - constexpr bool descending = Descending; - constexpr bool to_striped = ToStriped; - constexpr unsigned int start_bit = (rocprim::is_unsigned::value == false) ? 0 : StartBit; - constexpr unsigned int end_bit = (rocprim::is_unsigned::value == false) ? sizeof(Key) * 8 : EndBit; - constexpr size_t items_per_block = block_size * items_per_thread; + static constexpr size_t block_size = BlockSize; + static constexpr size_t items_per_thread = ItemsPerThread; + static constexpr bool descending = Descending; + static constexpr bool to_striped = ToStriped; + static constexpr unsigned int start_bit = (rocprim::is_unsigned::value == false) ? 0 : StartBit; + static constexpr unsigned int end_bit = (rocprim::is_unsigned::value == false) ? 
sizeof(Key) * 8 : EndBit; + static constexpr size_t items_per_block = block_size * items_per_thread; // Given block size not supported if(block_size > test_utils::get_max_block_size()) @@ -231,7 +195,7 @@ auto test_block_radix_sort() std::vector keys_output; if(rocprim::is_floating_point::value) { - keys_output = test_utils::get_random_data(size, (key_type)-1000, (key_type)+1000, seed_value); + keys_output = test_utils::get_random_data(size, -100, +100, seed_value); } else { @@ -306,13 +270,13 @@ auto test_block_radix_sort() { using key_type = Key; using value_type = Value; - constexpr size_t block_size = BlockSize; - constexpr size_t items_per_thread = ItemsPerThread; - constexpr bool descending = Descending; - constexpr bool to_striped = ToStriped; - constexpr unsigned int start_bit = (rocprim::is_unsigned::value == false) ? 0 : StartBit; - constexpr unsigned int end_bit = (rocprim::is_unsigned::value == false) ? sizeof(Key) * 8 : EndBit; - constexpr size_t items_per_block = block_size * items_per_thread; + static constexpr size_t block_size = BlockSize; + static constexpr size_t items_per_thread = ItemsPerThread; + static constexpr bool descending = Descending; + static constexpr bool to_striped = ToStriped; + static constexpr unsigned int start_bit = (rocprim::is_unsigned::value == false) ? 0 : StartBit; + static constexpr unsigned int end_bit = (rocprim::is_unsigned::value == false) ? sizeof(Key) * 8 : EndBit; + static constexpr size_t items_per_block = block_size * items_per_thread; // Given block size not supported if(block_size > test_utils::get_max_block_size()) @@ -325,14 +289,14 @@ auto test_block_radix_sort() for (size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) { - unsigned int seed_value = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + seed_type seed_value = seed_index < random_seeds_count ? 
rand() : seeds[seed_index - random_seeds_count]; SCOPED_TRACE(testing::Message() << "with seed= " << seed_value); // Generate data std::vector keys_output; if(rocprim::is_floating_point::value) { - keys_output = test_utils::get_random_data(size, (key_type)-1000, (key_type)+1000, seed_value); + keys_output = test_utils::get_random_data(size, -100, +100, seed_value); } else { @@ -340,7 +304,7 @@ auto test_block_radix_sort() size, std::numeric_limits::min(), std::numeric_limits::max(), - seed_index + seed_value ); } diff --git a/test/rocprim/test_block_reduce.cpp b/test/rocprim/test_block_reduce.cpp index 5ef032d3a..911e68b27 100644 --- a/test/rocprim/test_block_reduce.cpp +++ b/test/rocprim/test_block_reduce.cpp @@ -53,13 +53,13 @@ __global__ __launch_bounds__(BlockSize) void reduce_kernel(T* device_output, T* device_output_reductions) { - const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; + const unsigned int index = (blockIdx.x * BlockSize) + threadIdx.x; T value = device_output[index]; rocprim::block_reduce breduce; breduce.reduce(value, value, BinaryOp()); - if(hipThreadIdx_x == 0) + if(threadIdx.x == 0) { - device_output_reductions[hipBlockIdx_x] = value; + device_output_reductions[blockIdx.x] = value; } } @@ -140,15 +140,15 @@ TYPED_TEST(RocprimBlockReduceSingleValueTests, Reduce) SCOPED_TRACE(testing::Message() << "with seed= " << seed_value); // Generate data - std::vector output = test_utils::get_random_data(size, 2, 50, seed_value); + std::vector output = test_utils::get_random_data(size, (T)2, (T)50, seed_value); std::vector output_reductions(size / block_size); // Calculate expected results on host - std::vector expected_reductions(output_reductions.size(), 0); + std::vector expected_reductions(output_reductions.size(), (T)0); binary_op_type binary_op; for(size_t i = 0; i < output.size() / block_size; i++) { - T value = 0; + T value = (T)0; for(size_t j = 0; j < block_size; j++) { auto idx = i * block_size + j; @@ -207,7 +207,7 @@ TYPED_TEST(RocprimBlockReduceSingleValueTests, ReduceMultiplies) SCOPED_TRACE(testing::Message() << "with seed= " << seed_value); // Generate data - std::vector output(size, 1); + std::vector output(size, (T)1); auto two_places = test_utils::get_random_data(size/32, 0, size-1, seed_value); for(auto i : two_places) { @@ -216,11 +216,11 @@ TYPED_TEST(RocprimBlockReduceSingleValueTests, ReduceMultiplies) std::vector output_reductions(size / block_size); // Calculate expected results on host - std::vector expected_reductions(output_reductions.size(), 0); + std::vector expected_reductions(output_reductions.size(), (T)0); binary_op_type binary_op; for(size_t i = 0; i < output.size() / block_size; i++) { - T value = 1; + T value = (T)1; for(size_t j = 0; j < block_size; j++) { auto idx = i * block_size + j; @@ -265,13 +265,13 @@ __global__ __launch_bounds__(BlockSize) void reduce_valid_kernel(T* device_output, T* device_output_reductions, const unsigned int valid_items) { - const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; + const unsigned int index = (blockIdx.x * BlockSize) + threadIdx.x; T value = device_output[index]; rocprim::block_reduce breduce; breduce.reduce(value, value, valid_items, BinaryOp()); - if(hipThreadIdx_x == 0) + if(threadIdx.x == 0) { - device_output_reductions[hipBlockIdx_x] = value; + device_output_reductions[blockIdx.x] = value; } } @@ -335,7 +335,7 @@ TYPED_TEST(RocprimBlockReduceSingleValueTests, ReduceValid) unsigned int seed_value = seed_index < random_seeds_count ? 
rand() : seeds[seed_index - random_seeds_count]; SCOPED_TRACE(testing::Message() << "with seed= " << seed_value); - const unsigned int valid_items = test_utils::get_random_value(block_size - 10, block_size, seed_value); + const size_t valid_items = test_utils::get_random_value(block_size - 10, block_size, seed_value); // Given block size not supported if(block_size > test_utils::get_max_block_size()) @@ -350,11 +350,11 @@ TYPED_TEST(RocprimBlockReduceSingleValueTests, ReduceValid) std::vector output_reductions(size / block_size); // Calculate expected results on host - std::vector expected_reductions(output_reductions.size(), 0); + std::vector expected_reductions(output_reductions.size(), (T)0); binary_op_type binary_op; for(size_t i = 0; i < output.size() / block_size; i++) { - T value = 0; + T value = static_cast(0); for(size_t j = 0; j < valid_items; j++) { auto idx = i * block_size + j; @@ -410,7 +410,7 @@ __global__ __launch_bounds__(BlockSize) void reduce_array_kernel(T* device_output, T* device_output_reductions) { - const unsigned int index = ((hipBlockIdx_x * BlockSize) + hipThreadIdx_x) * ItemsPerThread; + const unsigned int index = ((blockIdx.x * BlockSize) + threadIdx.x) * ItemsPerThread; // load T in_out[ItemsPerThread]; for(unsigned int j = 0; j < ItemsPerThread; j++) @@ -422,9 +422,9 @@ void reduce_array_kernel(T* device_output, T* device_output_reductions) T reduction; breduce.reduce(in_out, reduction, BinaryOp()); - if(hipThreadIdx_x == 0) + if(threadIdx.x == 0) { - device_output_reductions[hipBlockIdx_x] = reduction; + device_output_reductions[blockIdx.x] = reduction; } } @@ -438,9 +438,9 @@ template< void test_block_reduce_input_arrays() { using binary_op_type = typename std::conditional::value, test_utils::half_maximum, rocprim::maximum>::type; - constexpr auto algorithm = Algorithm; - constexpr size_t block_size = BlockSize; - constexpr size_t items_per_thread = ItemsPerThread; + static constexpr auto algorithm = Algorithm; + static constexpr size_t block_size = BlockSize; + static constexpr size_t items_per_thread = ItemsPerThread; // Given block size not supported if(block_size > test_utils::get_max_block_size()) @@ -461,14 +461,14 @@ void test_block_reduce_input_arrays() std::vector output = test_utils::get_random_data(size, 0, 100, seed_value); // Output reduce results - std::vector output_reductions(size / block_size, 0); + std::vector output_reductions(size / block_size, (T)0); // Calculate expected results on host - std::vector expected_reductions(output_reductions.size(), 0); + std::vector expected_reductions(output_reductions.size(), (T)0); binary_op_type binary_op; for(size_t i = 0; i < output.size() / items_per_block; i++) { - T value = 0; + T value = (T)0; for(size_t j = 0; j < items_per_block; j++) { auto idx = i * items_per_block + j; diff --git a/test/rocprim/test_block_scan.cpp b/test/rocprim/test_block_scan.cpp index 6c3510ad1..4becee3c7 100644 --- a/test/rocprim/test_block_scan.cpp +++ b/test/rocprim/test_block_scan.cpp @@ -58,7 +58,7 @@ void scan_kernel(T* device_output, T* device_output_b, T init) { (void)init; (void)device_output_b; - const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; + const unsigned int index = (blockIdx.x * BlockSize) + threadIdx.x; T value = device_output[index]; rocprim::block_scan bscan; bscan.inclusive_scan(value, value); @@ -77,15 +77,15 @@ __launch_bounds__(BlockSize) void scan_kernel(T* device_output, T* device_output_b, T init) { (void)init; - const unsigned int index = (hipBlockIdx_x * BlockSize) + 
hipThreadIdx_x; + const unsigned int index = (blockIdx.x * BlockSize) + threadIdx.x; T value = device_output[index]; T reduction; rocprim::block_scan bscan; bscan.inclusive_scan(value, value, reduction); device_output[index] = value; - if(hipThreadIdx_x == 0) + if(threadIdx.x == 0) { - device_output_b[hipBlockIdx_x] = reduction; + device_output_b[blockIdx.x] = reduction; } } @@ -100,7 +100,7 @@ __global__ __launch_bounds__(BlockSize) void scan_kernel(T* device_output, T* device_output_b, T init) { - const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; + const unsigned int index = (blockIdx.x * BlockSize) + threadIdx.x; T prefix_value = init; auto prefix_callback = [&prefix_value](T reduction) { @@ -116,9 +116,9 @@ void scan_kernel(T* device_output, T* device_output_b, T init) bscan_t().inclusive_scan(value, value, storage, prefix_callback, rocprim::plus()); device_output[index] = value; - if(hipThreadIdx_x == 0) + if(threadIdx.x == 0) { - device_output_b[hipBlockIdx_x] = prefix_value; + device_output_b[blockIdx.x] = prefix_value; } } @@ -134,7 +134,7 @@ __launch_bounds__(BlockSize) void scan_kernel(T* device_output, T* device_output_b, T init) { (void)device_output_b; - const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; + const unsigned int index = (blockIdx.x * BlockSize) + threadIdx.x; T value = device_output[index]; rocprim::block_scan bscan; bscan.exclusive_scan(value, value, init); @@ -152,15 +152,15 @@ __global__ __launch_bounds__(BlockSize) void scan_kernel(T* device_output, T* device_output_b, T init) { - const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; + const unsigned int index = (blockIdx.x * BlockSize) + threadIdx.x; T value = device_output[index]; T reduction; rocprim::block_scan bscan; bscan.exclusive_scan(value, value, init, reduction); device_output[index] = value; - if(hipThreadIdx_x == 0) + if(threadIdx.x == 0) { - device_output_b[hipBlockIdx_x] = reduction; + device_output_b[blockIdx.x] = reduction; } } @@ -175,7 +175,7 @@ __global__ __launch_bounds__(BlockSize) void scan_kernel(T* device_output, T* device_output_b, T init) { - const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; + const unsigned int index = (blockIdx.x * BlockSize) + threadIdx.x; T prefix_value = init; auto prefix_callback = [&prefix_value](T reduction) { @@ -191,9 +191,9 @@ void scan_kernel(T* device_output, T* device_output_b, T init) bscan_t().exclusive_scan(value, value, storage, prefix_callback, rocprim::plus()); device_output[index] = value; - if(hipThreadIdx_x == 0) + if(threadIdx.x == 0) { - device_output_b[hipBlockIdx_x] = prefix_value; + device_output_b[blockIdx.x] = prefix_value; } } @@ -287,7 +287,7 @@ TYPED_TEST(RocprimBlockScanSingleValueTests, InclusiveScan) std::vector output2 = output; // Calculate expected results on host - std::vector expected(output.size(), 0); + std::vector expected(output.size(), (T)0); binary_op_type binary_op; for(size_t i = 0; i < output.size() / block_size; i++) { @@ -347,8 +347,8 @@ TYPED_TEST(RocprimBlockScanSingleValueTests, InclusiveScanReduce) std::vector output_reductions(size / block_size); // Calculate expected results on host - std::vector expected(output.size(), 0); - std::vector expected_reductions(output_reductions.size(), 0); + std::vector expected(output.size(), (T)0); + std::vector expected_reductions(output_reductions.size(), (T)0); binary_op_type binary_op; for(size_t i = 0; i < output.size() / block_size; i++) { @@ -418,8 +418,8 @@ 
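// Illustrative sketch, not from the patch itself: the hunks around here replace HIP's
// hipBlockIdx_x / hipThreadIdx_x macros with the standard blockIdx.x / threadIdx.x
// built-ins, presumably so the same kernels also compile under the header-only HIP-CPU
// host runtime. A minimal kernel using the migrated indexing pattern:
#include <hip/hip_runtime.h>

template<unsigned int BlockSize, class T>
__global__ __launch_bounds__(BlockSize)
void copy_kernel(const T* input, T* output)
{
    // Flat thread index written with the built-ins the patch standardises on.
    const unsigned int index = (blockIdx.x * BlockSize) + threadIdx.x;
    output[index] = input[index];
}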
TYPED_TEST(RocprimBlockScanSingleValueTests, InclusiveScanPrefixCallback) T block_prefix = test_utils::get_random_value(0, 5, seed_value); // Calculate expected results on host - std::vector expected(output.size(), 0); - std::vector expected_block_prefixes(output_block_prefixes.size(), 0); + std::vector expected(output.size(), (T)0); + std::vector expected_block_prefixes(output_block_prefixes.size(), (T)0); binary_op_type binary_op; for(size_t i = 0; i < output.size() / block_size; i++) { @@ -489,7 +489,7 @@ TYPED_TEST(RocprimBlockScanSingleValueTests, ExclusiveScan) const T init = test_utils::get_random_value(0, 5, seed_value); // Calculate expected results on host - std::vector expected(output.size(), 0); + std::vector expected(output.size(), (T)0); binary_op_type binary_op; for(size_t i = 0; i < output.size() / block_size; i++) { @@ -552,8 +552,8 @@ TYPED_TEST(RocprimBlockScanSingleValueTests, ExclusiveScanReduce) std::vector output_reductions(size / block_size); // Calculate expected results on host - std::vector expected(output.size(), 0); - std::vector expected_reductions(output_reductions.size(), 0); + std::vector expected(output.size(), (T)0); + std::vector expected_reductions(output_reductions.size(), (T)0); binary_op_type binary_op; for(size_t i = 0; i < output.size() / block_size; i++) { @@ -630,8 +630,8 @@ TYPED_TEST(RocprimBlockScanSingleValueTests, ExclusiveScanPrefixCallback) T block_prefix = test_utils::get_random_value(0, 5, seed_value); // Calculate expected results on host - std::vector expected(output.size(), 0); - std::vector expected_block_prefixes(output_block_prefixes.size(), 0); + std::vector expected(output.size(), (T)0); + std::vector expected_block_prefixes(output_block_prefixes.size(), (T)0); binary_op_type binary_op; for(size_t i = 0; i < output.size() / block_size; i++) { @@ -702,7 +702,7 @@ __global__ __launch_bounds__(BlockSize) void inclusive_scan_array_kernel(T* device_output) { - const unsigned int index = ((hipBlockIdx_x * BlockSize ) + hipThreadIdx_x) * ItemsPerThread; + const unsigned int index = ((blockIdx.x * BlockSize ) + threadIdx.x) * ItemsPerThread; // load T in_out[ItemsPerThread]; @@ -732,7 +732,7 @@ __global__ __launch_bounds__(BlockSize) void inclusive_scan_reduce_array_kernel(T* device_output, T* device_output_reductions) { - const unsigned int index = ((hipBlockIdx_x * BlockSize ) + hipThreadIdx_x) * ItemsPerThread; + const unsigned int index = ((blockIdx.x * BlockSize ) + threadIdx.x) * ItemsPerThread; // load T in_out[ItemsPerThread]; @@ -751,9 +751,9 @@ void inclusive_scan_reduce_array_kernel(T* device_output, T* device_output_reduc device_output[index + j] = in_out[j]; } - if(hipThreadIdx_x == 0) + if(threadIdx.x == 0) { - device_output_reductions[hipBlockIdx_x] = reduction; + device_output_reductions[blockIdx.x] = reduction; } } @@ -768,7 +768,7 @@ __global__ __launch_bounds__(BlockSize) void inclusive_scan_array_prefix_callback_kernel(T* device_output, T* device_output_bp, T block_prefix) { - const unsigned int index = ((hipBlockIdx_x * BlockSize) + hipThreadIdx_x) * ItemsPerThread; + const unsigned int index = ((blockIdx.x * BlockSize) + threadIdx.x) * ItemsPerThread; T prefix_value = block_prefix; auto prefix_callback = [&prefix_value](T reduction) { @@ -794,9 +794,9 @@ void inclusive_scan_array_prefix_callback_kernel(T* device_output, T* device_out device_output[index + j] = in_out[j]; } - if(hipThreadIdx_x == 0) + if(threadIdx.x == 0) { - device_output_bp[hipBlockIdx_x] = prefix_value; + device_output_bp[blockIdx.x] = 
prefix_value; } } @@ -811,7 +811,7 @@ __global__ __launch_bounds__(BlockSize) void exclusive_scan_array_kernel(T* device_output, T init) { - const unsigned int index = ((hipBlockIdx_x * BlockSize) + hipThreadIdx_x) * ItemsPerThread; + const unsigned int index = ((blockIdx.x * BlockSize) + threadIdx.x) * ItemsPerThread; // load T in_out[ItemsPerThread]; for(unsigned int j = 0; j < ItemsPerThread; j++) @@ -840,7 +840,7 @@ __global__ __launch_bounds__(BlockSize) void exclusive_scan_reduce_array_kernel(T* device_output, T* device_output_reductions, T init) { - const unsigned int index = ((hipBlockIdx_x * BlockSize) + hipThreadIdx_x) * ItemsPerThread; + const unsigned int index = ((blockIdx.x * BlockSize) + threadIdx.x) * ItemsPerThread; // load T in_out[ItemsPerThread]; for(unsigned int j = 0; j < ItemsPerThread; j++) @@ -858,9 +858,9 @@ void exclusive_scan_reduce_array_kernel(T* device_output, T* device_output_reduc device_output[index + j] = in_out[j]; } - if(hipThreadIdx_x == 0) + if(threadIdx.x == 0) { - device_output_reductions[hipBlockIdx_x] = reduction; + device_output_reductions[blockIdx.x] = reduction; } } @@ -879,7 +879,7 @@ void exclusive_scan_prefix_callback_array_kernel( T block_prefix ) { - const unsigned int index = ((hipBlockIdx_x * BlockSize) + hipThreadIdx_x) * ItemsPerThread; + const unsigned int index = ((blockIdx.x * BlockSize) + threadIdx.x) * ItemsPerThread; T prefix_value = block_prefix; auto prefix_callback = [&prefix_value](T reduction) { @@ -905,9 +905,9 @@ void exclusive_scan_prefix_callback_array_kernel( device_output[index + j] = in_out[j]; } - if(hipThreadIdx_x == 0) + if(threadIdx.x == 0) { - device_output_bp[hipBlockIdx_x] = prefix_value; + device_output_bp[blockIdx.x] = prefix_value; } } @@ -923,9 +923,9 @@ auto test_block_scan_input_arrays() -> typename std::enable_if::type { using binary_op_type = typename std::conditional::value, test_utils::half_maximum, rocprim::maximum>::type; - constexpr auto algorithm = Algorithm; - constexpr size_t block_size = BlockSize; - constexpr size_t items_per_thread = ItemsPerThread; + static constexpr auto algorithm = Algorithm; + static constexpr size_t block_size = BlockSize; + static constexpr size_t items_per_thread = ItemsPerThread; // Given block size not supported if(block_size > test_utils::get_max_block_size()) @@ -946,7 +946,7 @@ auto test_block_scan_input_arrays() std::vector output = test_utils::get_random_data(size, 2, 100, seed_value); // Calculate expected results on host - std::vector expected(output.size(), 0); + std::vector expected(output.size(), (T)0); binary_op_type binary_op; for(size_t i = 0; i < output.size() / items_per_block; i++) { @@ -1006,9 +1006,9 @@ auto test_block_scan_input_arrays() -> typename std::enable_if::type { using binary_op_type = typename std::conditional::value, test_utils::half_maximum, rocprim::maximum>::type; - constexpr auto algorithm = Algorithm; - constexpr size_t block_size = BlockSize; - constexpr size_t items_per_thread = ItemsPerThread; + static constexpr auto algorithm = Algorithm; + static constexpr size_t block_size = BlockSize; + static constexpr size_t items_per_thread = ItemsPerThread; // Given block size not supported if(block_size > test_utils::get_max_block_size()) @@ -1029,11 +1029,11 @@ auto test_block_scan_input_arrays() std::vector output = test_utils::get_random_data(size, 2, 100, seed_value); // Output reduce results - std::vector output_reductions(size / block_size, 0); + std::vector output_reductions(size / block_size, (T)0); // Calculate expected results 
on host - std::vector expected(output.size(), 0); - std::vector expected_reductions(output_reductions.size(), 0); + std::vector expected(output.size(), (T)0); + std::vector expected_reductions(output_reductions.size(), (T)0); binary_op_type binary_op; for(size_t i = 0; i < output.size() / items_per_block; i++) { @@ -1120,9 +1120,9 @@ auto test_block_scan_input_arrays() -> typename std::enable_if::type { using binary_op_type = typename std::conditional::value, test_utils::half_maximum, rocprim::maximum>::type; - constexpr auto algorithm = Algorithm; - constexpr size_t block_size = BlockSize; - constexpr size_t items_per_thread = ItemsPerThread; + static constexpr auto algorithm = Algorithm; + static constexpr size_t block_size = BlockSize; + static constexpr size_t items_per_thread = ItemsPerThread; // Given block size not supported if(block_size > test_utils::get_max_block_size()) @@ -1141,12 +1141,12 @@ auto test_block_scan_input_arrays() // Generate data std::vector output = test_utils::get_random_data(size, 2, 100, seed_value); - std::vector output_block_prefixes(size / items_per_block, 0); + std::vector output_block_prefixes(size / items_per_block, (T)0); T block_prefix = test_utils::get_random_value(0, 100, seed_value); // Calculate expected results on host - std::vector expected(output.size(), 0); - std::vector expected_block_prefixes(output_block_prefixes.size(), 0); + std::vector expected(output.size(), (T)0); + std::vector expected_block_prefixes(output_block_prefixes.size(), (T)0); binary_op_type binary_op; for(size_t i = 0; i < output.size() / items_per_block; i++) { @@ -1236,9 +1236,9 @@ auto test_block_scan_input_arrays() -> typename std::enable_if::type { using binary_op_type = typename std::conditional::value, test_utils::half_maximum, rocprim::maximum>::type; - constexpr auto algorithm = Algorithm; - constexpr size_t block_size = BlockSize; - constexpr size_t items_per_thread = ItemsPerThread; + static constexpr auto algorithm = Algorithm; + static constexpr size_t block_size = BlockSize; + static constexpr size_t items_per_thread = ItemsPerThread; // Given block size not supported if(block_size > test_utils::get_max_block_size()) @@ -1260,7 +1260,7 @@ auto test_block_scan_input_arrays() const T init = test_utils::get_random_value(0, 100, seed_value); // Calculate expected results on host - std::vector expected(output.size(), 0); + std::vector expected(output.size(), (T)0); binary_op_type binary_op; for(size_t i = 0; i < output.size() / items_per_block; i++) { @@ -1322,9 +1322,9 @@ auto test_block_scan_input_arrays() -> typename std::enable_if::type { using binary_op_type = typename std::conditional::value, test_utils::half_maximum, rocprim::maximum>::type; - constexpr auto algorithm = Algorithm; - constexpr size_t block_size = BlockSize; - constexpr size_t items_per_thread = ItemsPerThread; + static constexpr auto algorithm = Algorithm; + static constexpr size_t block_size = BlockSize; + static constexpr size_t items_per_thread = ItemsPerThread; // Given block size not supported if(block_size > test_utils::get_max_block_size()) @@ -1349,8 +1349,8 @@ auto test_block_scan_input_arrays() const T init = test_utils::get_random_value(0, 100, seed_value); // Calculate expected results on host - std::vector expected(output.size(), 0); - std::vector expected_reductions(output_reductions.size(), 0); + std::vector expected(output.size(), (T)0); + std::vector expected_reductions(output_reductions.size(), (T)0); binary_op_type binary_op; for(size_t i = 0; i < output.size() / 
items_per_block; i++) { @@ -1433,9 +1433,9 @@ auto test_block_scan_input_arrays() -> typename std::enable_if::type { using binary_op_type = typename std::conditional::value, test_utils::half_maximum, rocprim::maximum>::type; - constexpr auto algorithm = Algorithm; - constexpr size_t block_size = BlockSize; - constexpr size_t items_per_thread = ItemsPerThread; + static constexpr auto algorithm = Algorithm; + static constexpr size_t block_size = BlockSize; + static constexpr size_t items_per_thread = ItemsPerThread; // Given block size not supported if(block_size > test_utils::get_max_block_size()) @@ -1458,8 +1458,8 @@ auto test_block_scan_input_arrays() T block_prefix = test_utils::get_random_value(0, 100, seed_value); // Calculate expected results on host - std::vector expected(output.size(), 0); - std::vector expected_block_prefixes(output_block_prefixes.size(), 0); + std::vector expected(output.size(), (T)0); + std::vector expected_block_prefixes(output_block_prefixes.size(), (T)0); binary_op_type binary_op; for(size_t i = 0; i < output.size() / items_per_block; i++) { diff --git a/test/rocprim/test_block_shuffle.cpp b/test/rocprim/test_block_shuffle.cpp index 55edd615b..bd8c2ddf6 100644 --- a/test/rocprim/test_block_shuffle.cpp +++ b/test/rocprim/test_block_shuffle.cpp @@ -48,7 +48,7 @@ __global__ __launch_bounds__(BlockSize, ROCPRIM_DEFAULT_MIN_WARPS_PER_EU) void shuffle_offset_kernel(T* device_input, T* device_output, int distance) { - const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; + const unsigned int index = (blockIdx.x * BlockSize) + threadIdx.x; rocprim::block_shuffle b_shuffle; b_shuffle.offset(device_input[index],device_output[index],distance); } @@ -60,13 +60,13 @@ TYPED_TEST(RocprimBlockShuffleTests, BlockOffset) HIP_CHECK(hipSetDevice(device_id)); using type = typename TestFixture::type; - const size_t block_size = TestFixture::block_size; - const size_t size = block_size * 11; - const size_t grid_size = size / block_size; + static constexpr size_t block_size = TestFixture::block_size; + static constexpr size_t size = block_size * 11; + static constexpr size_t grid_size = size / block_size; for (size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) { unsigned int seed_value = seed_index < random_seeds_count ? 
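// Illustrative sketch, not from the patch itself: the hunks above change host-side fill
// values from a bare 0 to (T)0 / static_cast<T>(0), presumably because types such as
// rocprim::half do not convert implicitly from integer literals on the host, and because
// the explicit cast avoids implicit-conversion warnings for the arithmetic types.
#include <cstddef>
#include <vector>

template<class T>
std::vector<T> make_zero_filled(std::size_t n)
{
    // The cast is a no-op for built-in arithmetic types but needed for wrapper types
    // that only convert explicitly on the host.
    return std::vector<T>(n, static_cast<T>(0));
}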
rand() : seeds[seed_index - random_seeds_count]; - int distance = (rand()%min(10,block_size/2))-min(10,block_size/2); + int distance = (rand()%std::min(10,block_size/2))-std::min(10,block_size/2); SCOPED_TRACE(testing::Message() << "with seed= " << seed_value <<" & distance = "< input_data = test_utils::get_random_data(size, -100, 100, seed_value); @@ -76,8 +76,8 @@ TYPED_TEST(RocprimBlockShuffleTests, BlockOffset) type * device_input; type * device_output; - HIP_CHECK(hipMalloc(&device_input, input_data.size() * sizeof(type))); - HIP_CHECK(hipMalloc(&device_output, input_data.size() * sizeof(type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&device_input), input_data.size() * sizeof(type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&device_output), input_data.size() * sizeof(type))); HIP_CHECK( hipMemcpy( @@ -132,7 +132,7 @@ __global__ __launch_bounds__(BlockSize, ROCPRIM_DEFAULT_MIN_WARPS_PER_EU) void shuffle_rotate_kernel(T* device_input, T* device_output, int distance) { - const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; + const unsigned int index = (blockIdx.x * BlockSize) + threadIdx.x; rocprim::block_shuffle b_shuffle; b_shuffle.rotate(device_input[index],device_output[index],distance); } @@ -144,13 +144,13 @@ TYPED_TEST(RocprimBlockShuffleTests, BlockRotate) HIP_CHECK(hipSetDevice(device_id)); using type = typename TestFixture::type; - const size_t block_size = TestFixture::block_size; - const size_t size = block_size * 11; - const size_t grid_size = size / block_size; + static constexpr size_t block_size = TestFixture::block_size; + static constexpr size_t size = block_size * 11; + static constexpr size_t grid_size = size / block_size; for (size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) { unsigned int seed_value = seed_index < random_seeds_count ? 
rand() : seeds[seed_index - random_seeds_count]; - int distance = (rand()%min(5,block_size/2)); + int distance = (rand()%std::min(5,block_size/2)); SCOPED_TRACE(testing::Message() << "with seed= " << seed_value <<" & distance = "< input_data = test_utils::get_random_data(size, -100, 100, seed_value); @@ -160,8 +160,8 @@ TYPED_TEST(RocprimBlockShuffleTests, BlockRotate) type * device_input; type * device_output; - HIP_CHECK(hipMalloc(&device_input, input_data.size() * sizeof(type))); - HIP_CHECK(hipMalloc(&device_output, input_data.size() * sizeof(type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&device_input), input_data.size() * sizeof(type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&device_output), input_data.size() * sizeof(type))); HIP_CHECK( hipMemcpy( @@ -216,7 +216,7 @@ __global__ __launch_bounds__(BlockSize, ROCPRIM_DEFAULT_MIN_WARPS_PER_EU) void shuffle_up_kernel(T (*device_input), T (*device_output)) { - const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; + const unsigned int index = (blockIdx.x * BlockSize) + threadIdx.x; rocprim::block_shuffle b_shuffle; b_shuffle.template up(reinterpret_cast(device_input[index*ItemsPerThread]),reinterpret_cast(device_output[index*ItemsPerThread])); } @@ -228,10 +228,10 @@ TYPED_TEST(RocprimBlockShuffleTests, BlockUp) HIP_CHECK(hipSetDevice(device_id)); using type = typename TestFixture::type; - const size_t block_size = TestFixture::block_size; - const size_t size = block_size * 11; - const size_t grid_size = size / block_size; - constexpr unsigned int ItemsPerThread = 128; + static constexpr size_t block_size = TestFixture::block_size; + static constexpr size_t size = block_size * 11; + static constexpr size_t grid_size = size / block_size; + static constexpr unsigned int ItemsPerThread = 128; for (size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) { unsigned int seed_value = seed_index < random_seeds_count ? 
rand() : seeds[seed_index - random_seeds_count]; @@ -248,8 +248,8 @@ TYPED_TEST(RocprimBlockShuffleTests, BlockUp) type * device_output; - HIP_CHECK(hipMalloc(&device_input, input_data.size() * sizeof(type))); - HIP_CHECK(hipMalloc(&device_output, input_data.size() * sizeof(type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&device_input), input_data.size() * sizeof(type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&device_output), input_data.size() * sizeof(type))); @@ -310,7 +310,7 @@ __global__ __launch_bounds__(BlockSize, ROCPRIM_DEFAULT_MIN_WARPS_PER_EU) void shuffle_down_kernel(T (*device_input), T (*device_output)) { - const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; + const unsigned int index = (blockIdx.x * BlockSize) + threadIdx.x; rocprim::block_shuffle b_shuffle; b_shuffle.template down(reinterpret_cast(device_input[index*ItemsPerThread]),reinterpret_cast(device_output[index*ItemsPerThread])); } @@ -322,10 +322,10 @@ TYPED_TEST(RocprimBlockShuffleTests, BlockDown) HIP_CHECK(hipSetDevice(device_id)); using type = typename TestFixture::type; - const size_t block_size = TestFixture::block_size; - const size_t size = block_size * 11; - const size_t grid_size = size / block_size; - constexpr unsigned int ItemsPerThread = 128; + static constexpr size_t block_size = TestFixture::block_size; + static constexpr size_t size = block_size * 11; + static constexpr size_t grid_size = size / block_size; + static constexpr unsigned int ItemsPerThread = 128; for (size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) { unsigned int seed_value = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; @@ -343,8 +343,8 @@ TYPED_TEST(RocprimBlockShuffleTests, BlockDown) type * device_output; - HIP_CHECK(hipMalloc(&device_input, input_data.size() * sizeof(type))); - HIP_CHECK(hipMalloc(&device_output, input_data.size() * sizeof(type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&device_input), input_data.size() * sizeof(type))); + HIP_CHECK(hipMalloc(reinterpret_cast(&device_output), input_data.size() * sizeof(type))); diff --git a/test/rocprim/test_block_sort.cpp b/test/rocprim/test_block_sort.cpp index bb4853ac2..712c15439 100644 --- a/test/rocprim/test_block_sort.cpp +++ b/test/rocprim/test_block_sort.cpp @@ -48,7 +48,7 @@ __global__ __launch_bounds__(BlockSize) void sort_key_kernel(key_type * device_key_output) { - const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; + const unsigned int index = (blockIdx.x * BlockSize) + threadIdx.x; key_type key = device_key_output[index]; rocprim::block_sort bsort; bsort.sort(key); @@ -63,7 +63,7 @@ TYPED_TEST(RocprimBlockSortTests, SortKey) using key_type = typename TestFixture::key_type; using binary_op_type = typename std::conditional::value, test_utils::half_less, rocprim::less>::type; - const size_t block_size = TestFixture::block_size; + static constexpr size_t block_size = TestFixture::block_size; const size_t size = block_size * 1134; const size_t grid_size = size / block_size; @@ -144,7 +144,7 @@ __global__ __launch_bounds__(BlockSize) void sort_key_value_kernel(key_type * device_key_output, value_type * device_value_output) { - const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; + const unsigned int index = (blockIdx.x * BlockSize) + threadIdx.x; key_type key = device_key_output[index]; value_type value = device_value_output[index]; rocprim::block_sort bsort; @@ -163,9 +163,9 @@ TYPED_TEST(RocprimBlockSortTests, SortKeyValue) using 
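// Illustrative sketch, not from the patch itself: the hipMalloc calls above now route the
// pointer through a reinterpret_cast whose template argument is not visible in this
// rendering; since hipMalloc's strict signature is hipError_t hipMalloc(void**, size_t),
// the intended cast is presumably reinterpret_cast<void**>, as in:
#include <cstddef>
#include <hip/hip_runtime.h>

inline float* alloc_floats(std::size_t count)
{
    float* device_ptr = nullptr;
    // Cast float** to void** explicitly instead of relying on the templated overload.
    (void)hipMalloc(reinterpret_cast<void**>(&device_ptr), count * sizeof(float));
    return device_ptr;
}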
value_type = typename TestFixture::value_type; using value_op_type = typename std::conditional::value, test_utils::half_less, rocprim::less>::type; using eq_op_type = typename std::conditional::value, test_utils::half_equal_to, rocprim::equal_to>::type; - const size_t block_size = TestFixture::block_size; - const size_t size = block_size * 1134; - const size_t grid_size = size / block_size; + static constexpr size_t block_size = TestFixture::block_size; + static constexpr size_t size = block_size * 1134; + static constexpr size_t grid_size = size / block_size; for (size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) { @@ -288,7 +288,7 @@ __global__ __launch_bounds__(BlockSize) void custom_sort_key_value_kernel(key_type * device_key_output, value_type * device_value_output) { - const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x; + const unsigned int index = (blockIdx.x * BlockSize) + threadIdx.x; key_type key = device_key_output[index]; value_type value = device_value_output[index]; rocprim::block_sort bsort; @@ -307,9 +307,9 @@ TYPED_TEST(RocprimBlockSortTests, CustomSortKeyValue) using value_type = typename TestFixture::value_type; using value_op_type = typename std::conditional::value, test_utils::half_less, rocprim::less>::type; using eq_op_type = typename std::conditional::value, test_utils::half_equal_to, rocprim::equal_to>::type; - const size_t block_size = TestFixture::block_size; - const size_t size = block_size * 1134; - const size_t grid_size = size / block_size; + static constexpr size_t block_size = TestFixture::block_size; + static constexpr size_t size = block_size * 1134; + static constexpr size_t grid_size = size / block_size; for (size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) { diff --git a/test/rocprim/test_constant_iterator.cpp b/test/rocprim/test_constant_iterator.cpp index e3e970b3e..45957eaef 100644 --- a/test/rocprim/test_constant_iterator.cpp +++ b/test/rocprim/test_constant_iterator.cpp @@ -123,7 +123,8 @@ TYPED_TEST(RocprimConstantIteratorTests, Transform) } else if(std::is_floating_point::value) { - auto tolerance = std::max(std::abs(0.1f * expected[i]), T(test_utils::precision_threshold::percentage)); + float percentage = test_utils::precision_threshold::percentage; + auto tolerance = std::max(std::abs(0.1f * (float)expected[i]), (float)percentage); ASSERT_NEAR(output[i], expected[i], tolerance) << "where index = " << i; } } diff --git a/test/rocprim/test_device_binary_search.cpp b/test/rocprim/test_device_binary_search.cpp index 22449b1c7..52151028f 100644 --- a/test/rocprim/test_device_binary_search.cpp +++ b/test/rocprim/test_device_binary_search.cpp @@ -106,7 +106,7 @@ TYPED_TEST(RocprimDeviceBinarySearch, LowerBound) SCOPED_TRACE(testing::Message() << "with size = " << size); const size_t haystack_size = size; - const size_t needles_size = std::sqrt(size); + const size_t needles_size = (size_t)std::sqrt(size); // cast promises no data loss, silences warning const size_t d = haystack_size / 100; // Generate data @@ -216,7 +216,7 @@ TYPED_TEST(RocprimDeviceBinarySearch, UpperBound) for (size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) { - unsigned int seed_value = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + seed_type seed_value = seed_index < random_seeds_count ? 
rand() : seeds[seed_index - random_seeds_count]; SCOPED_TRACE(testing::Message() << "with seed= " << seed_value); for(size_t size : get_sizes(seed_value)) @@ -228,7 +228,7 @@ TYPED_TEST(RocprimDeviceBinarySearch, UpperBound) } SCOPED_TRACE(testing::Message() << "with size = " << size); const size_t haystack_size = size; - const size_t needles_size = std::sqrt(size); + const size_t needles_size = (size_t)std::sqrt(size); // cast promises no data loss, silences warning const size_t d = haystack_size / 100; // Generate data @@ -351,7 +351,7 @@ TYPED_TEST(RocprimDeviceBinarySearch, BinarySearch) SCOPED_TRACE(testing::Message() << "with size = " << size); const size_t haystack_size = size; - const size_t needles_size = std::sqrt(size); + const size_t needles_size = (size_t)std::sqrt(size); // cast promises no data loss, silences warning const size_t d = haystack_size / 100; // Generate data diff --git a/test/rocprim/test_device_histogram.cpp b/test/rocprim/test_device_histogram.cpp index ff535e283..6a01cb639 100644 --- a/test/rocprim/test_device_histogram.cpp +++ b/test/rocprim/test_device_histogram.cpp @@ -221,7 +221,7 @@ TYPED_TEST(RocprimDeviceHistogramEven, Even) const level_type s = static_cast(sample); if(s >= lower_level && s < upper_level) { - const int bin = (s - lower_level) / scale; + const level_type bin = (s - lower_level) / scale; histogram_expected[bin]++; } } @@ -235,7 +235,7 @@ TYPED_TEST(RocprimDeviceHistogramEven, Even) HIP_CHECK( rocprim::histogram_even( nullptr, temporary_storage_bytes, - d_input, columns, + d_input, static_cast(columns), d_histogram, bins + 1, lower_level, upper_level, stream, debug_synchronous diff --git a/test/rocprim/test_device_merge.cpp b/test/rocprim/test_device_merge.cpp index b60972417..37d87d887 100644 --- a/test/rocprim/test_device_merge.cpp +++ b/test/rocprim/test_device_merge.cpp @@ -130,7 +130,7 @@ TYPED_TEST(RocprimDeviceMergeTests, MergeKey) std::vector keys_input2 = test_utils::get_random_data(size2, 0, size2, seed_value); std::sort(keys_input1.begin(), keys_input1.end(), compare_op); std::sort(keys_input2.begin(), keys_input2.end(), compare_op); - std::vector keys_output(size1 + size2, 0); + std::vector keys_output(size1 + size2, (key_type)0); // Calculate expected results on host std::vector expected(keys_output.size()); @@ -276,8 +276,8 @@ TYPED_TEST(RocprimDeviceMergeTests, MergeKeyValue) std::vector values_input2(size2); std::iota(values_input1.begin(), values_input1.end(), 0); std::iota(values_input2.begin(), values_input2.end(), size1); - std::vector keys_output(size1 + size2, 0); - std::vector values_output(size1 + size2, 0); + std::vector keys_output(size1 + size2, (key_type)0); + std::vector values_output(size1 + size2, (value_type)0); // Calculate expected results on host std::vector vector1(size1); diff --git a/test/rocprim/test_device_partition.cpp b/test/rocprim/test_device_partition.cpp index 6c981921a..531ef7baa 100644 --- a/test/rocprim/test_device_partition.cpp +++ b/test/rocprim/test_device_partition.cpp @@ -396,11 +396,11 @@ TYPED_TEST(RocprimDevicePartitionTests, Predicate) { if(select_op(input[i])) { - expected_selected.push_back(input[i]); + expected_selected.push_back((U)input[i]); } else { - expected_rejected.push_back(input[i]); + expected_rejected.push_back((U)input[i]); } } std::reverse(expected_rejected.begin(), expected_rejected.end()); diff --git a/test/rocprim/test_device_radix_sort.cpp b/test/rocprim/test_device_radix_sort.cpp index 238aec2d8..3cf70fdf8 100644 --- a/test/rocprim/test_device_radix_sort.cpp 
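// Illustrative sketch, not from the patch itself: std::sqrt returns double, so storing
// the result in a size_t is a narrowing conversion; the explicit (size_t) cast added in
// the binary search tests documents that the truncation is intended and silences the
// warning, roughly equivalent to:
#include <cmath>
#include <cstddef>

inline std::size_t needles_for(std::size_t haystack_size)
{
    return static_cast<std::size_t>(std::sqrt(static_cast<double>(haystack_size)));
}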
+++ b/test/rocprim/test_device_radix_sort.cpp @@ -27,6 +27,7 @@ // required test headers #include "test_utils_types.hpp" +#include "test_sort_comparator.hpp" template< class Key, @@ -84,48 +85,6 @@ typedef ::testing::Types< TYPED_TEST_SUITE(RocprimDeviceRadixSort, Params); -template -struct key_comparator -{ - static_assert(rocprim::is_unsigned::value, "Test supports start and end bits only for unsigned integers"); - - bool operator()(const Key& lhs, const Key& rhs) - { - auto mask = (1ull << (EndBit - StartBit)) - 1; - auto l = (static_cast(lhs) >> StartBit) & mask; - auto r = (static_cast(rhs) >> StartBit) & mask; - return Descending ? (r < l) : (l < r); - } -}; - -template -struct key_comparator -{ - bool operator()(const Key& lhs, const Key& rhs) - { - return Descending ? (rhs < lhs) : (lhs < rhs); - } -}; - -template -struct key_comparator -{ - bool operator()(const rocprim::half& lhs, const rocprim::half& rhs) - { - // HIP's half doesn't have __host__ comparison operators, use floats instead - return key_comparator()(lhs, rhs); - } -}; - -template -struct key_value_comparator -{ - bool operator()(const std::pair& lhs, const std::pair& rhs) - { - return key_comparator()(lhs.first, rhs.first); - } -}; - std::vector get_sizes(int seed_value) { std::vector sizes = { 0, 1, 10, 53, 211, 1024, 2345, 4096, 34567, (1 << 16) - 1220, (1 << 23) - 76543 }; diff --git a/test/rocprim/test_device_reduce.cpp b/test/rocprim/test_device_reduce.cpp index 95c0902d9..836098029 100644 --- a/test/rocprim/test_device_reduce.cpp +++ b/test/rocprim/test_device_reduce.cpp @@ -174,7 +174,7 @@ TYPED_TEST(RocprimDeviceReduceTests, Reduce) // Generate data std::vector input = test_utils::get_random_data(size, 1, 100, seed_value); - std::vector output(1, 0); + std::vector output(1, (U)0); // reduce function binary_op_type plus_op; @@ -278,7 +278,7 @@ TYPED_TEST(RocprimDeviceReduceTests, ReduceMinimum) // Generate data std::vector input = test_utils::get_random_data(size, 1, 100, seed_value); - std::vector output(1, 0); + std::vector output(1, (U)0); T * d_input; U * d_output; @@ -415,8 +415,8 @@ TYPED_TEST(RocprimDeviceReduceTests, ReduceArgMinimum) std::vector input(size); for (size_t i = 0; i < size; i++) { - input[i].key = i; - input[i].value = test_utils::get_random_value(1, 100, seed_value); + input[i].key = (int)i; + input[i].value = test_utils::get_random_data(1, 1, 100, seed_value)[0]; } std::vector output(1); diff --git a/test/rocprim/test_device_reduce_by_key.cpp b/test/rocprim/test_device_reduce_by_key.cpp index bcfeedfdc..cb5cf38a1 100644 --- a/test/rocprim/test_device_reduce_by_key.cpp +++ b/test/rocprim/test_device_reduce_by_key.cpp @@ -132,7 +132,14 @@ TYPED_TEST(RocprimDeviceReduceByKey, ReduceByKey) using key_distribution_type = typename std::conditional< std::is_floating_point::value, std::uniform_real_distribution, - std::uniform_int_distribution + typename std::conditional< + test_utils::is_valid_for_int_distribution::value, + std::uniform_int_distribution, + typename std::conditional::value, + std::uniform_int_distribution, + std::uniform_int_distribution + >::type + >::type >::type; constexpr bool use_identity_iterator = TestFixture::params::use_identity_iterator; diff --git a/test/rocprim/test_device_run_length_encode.cpp b/test/rocprim/test_device_run_length_encode.cpp index c3125a15d..986167478 100644 --- a/test/rocprim/test_device_run_length_encode.cpp +++ b/test/rocprim/test_device_run_length_encode.cpp @@ -104,7 +104,14 @@ TYPED_TEST(RocprimDeviceRunLengthEncode, Encode) using 
key_distribution_type = typename std::conditional< std::is_floating_point::value, std::uniform_real_distribution, - std::uniform_int_distribution + typename std::conditional< + test_utils::is_valid_for_int_distribution::value, + std::uniform_int_distribution, + typename std::conditional::value, + std::uniform_int_distribution, + std::uniform_int_distribution + >::type + >::type >::type; constexpr bool use_identity_iterator = TestFixture::params::use_identity_iterator; @@ -159,7 +166,7 @@ TYPED_TEST(RocprimDeviceRunLengthEncode, Encode) unique_expected.push_back(current_key); runs_count_expected++; - counts_expected.push_back(key_count); + counts_expected.push_back(static_cast(key_count)); offset += key_count; } @@ -268,7 +275,14 @@ TYPED_TEST(RocprimDeviceRunLengthEncode, NonTrivialRuns) using key_distribution_type = typename std::conditional< std::is_floating_point::value, std::uniform_real_distribution, - std::uniform_int_distribution + typename std::conditional< + test_utils::is_valid_for_int_distribution::value, + std::uniform_int_distribution, + typename std::conditional::value, + std::uniform_int_distribution, + std::uniform_int_distribution + >::type + >::type >::type; constexpr bool use_identity_iterator = TestFixture::params::use_identity_iterator; @@ -333,9 +347,9 @@ TYPED_TEST(RocprimDeviceRunLengthEncode, NonTrivialRuns) if(key_count > 1) { - offsets_expected.push_back(offset); + offsets_expected.push_back(static_cast(offset)); runs_count_expected++; - counts_expected.push_back(key_count); + counts_expected.push_back(static_cast(key_count)); } offset += key_count; diff --git a/test/rocprim/test_device_segmented_radix_sort.cpp b/test/rocprim/test_device_segmented_radix_sort.cpp index 513e81812..6a462dec0 100644 --- a/test/rocprim/test_device_segmented_radix_sort.cpp +++ b/test/rocprim/test_device_segmented_radix_sort.cpp @@ -27,6 +27,7 @@ // required test headers #include "test_utils_types.hpp" +#include "test_sort_comparator.hpp" template< class Key, @@ -83,48 +84,6 @@ typedef ::testing::Types< TYPED_TEST_SUITE(RocprimDeviceSegmentedRadixSort, Params); -template -struct key_comparator -{ - static_assert(rocprim::is_unsigned::value, "Test supports start and end bits only for unsigned integers"); - - bool operator()(const Key& lhs, const Key& rhs) - { - auto mask = (1ull << (EndBit - StartBit)) - 1; - auto l = (static_cast(lhs) >> StartBit) & mask; - auto r = (static_cast(rhs) >> StartBit) & mask; - return Descending ? (r < l) : (l < r); - } -}; - -template -struct key_comparator -{ - bool operator()(const Key& lhs, const Key& rhs) - { - return Descending ? 
(rhs < lhs) : (lhs < rhs); - } -}; - -template -struct key_comparator -{ - bool operator()(const rocprim::half& lhs, const rocprim::half& rhs) - { - // HIP's half doesn't have __host__ comparison operators, use floats instead - return key_comparator()(lhs, rhs); - } -}; - -template -struct key_value_comparator -{ - bool operator()(const std::pair& lhs, const std::pair& rhs) - { - return key_comparator()(lhs.first, rhs.first); - } -}; - std::vector get_sizes(int seed_value) { std::vector sizes = { @@ -146,9 +105,9 @@ TYPED_TEST(RocprimDeviceSegmentedRadixSort, SortKeys) HIP_CHECK(hipSetDevice(device_id)); using key_type = typename TestFixture::params::key_type; - constexpr bool descending = TestFixture::params::descending; - constexpr unsigned int start_bit = TestFixture::params::start_bit; - constexpr unsigned int end_bit = TestFixture::params::end_bit; + static constexpr bool descending = TestFixture::params::descending; + static constexpr unsigned int start_bit = TestFixture::params::start_bit; + static constexpr unsigned int end_bit = TestFixture::params::end_bit; using offset_type = unsigned int; diff --git a/test/rocprim/test_device_segmented_reduce.cpp b/test/rocprim/test_device_segmented_reduce.cpp index 3feda27ff..7f6f6b792 100644 --- a/test/rocprim/test_device_segmented_reduce.cpp +++ b/test/rocprim/test_device_segmented_reduce.cpp @@ -109,7 +109,7 @@ TYPED_TEST(RocprimDeviceSegmentedReduce, Reduce) using result_type = output_type; using offset_type = unsigned int; - const input_type init = TestFixture::params::init; + const input_type init = (input_type)TestFixture::params::init; const bool debug_synchronous = false; reduce_op_type reduce_op; diff --git a/test/rocprim/test_device_select.cpp b/test/rocprim/test_device_select.cpp index 57a1e18de..2c5cab4a3 100644 --- a/test/rocprim/test_device_select.cpp +++ b/test/rocprim/test_device_select.cpp @@ -405,7 +405,7 @@ TYPED_TEST(RocprimDeviceSelectTests, UniqueEmptyInput) rocprim::unique( nullptr, temp_storage_size_bytes, - rocprim::make_constant_iterator(123), + rocprim::make_constant_iterator((T)123), rocprim::make_discard_iterator(), d_selected_count_output, 0, @@ -423,7 +423,7 @@ TYPED_TEST(RocprimDeviceSelectTests, UniqueEmptyInput) rocprim::unique( d_temp_storage, temp_storage_size_bytes, - rocprim::make_constant_iterator(123), + rocprim::make_constant_iterator((T)123), rocprim::make_discard_iterator(), d_selected_count_output, 0, @@ -658,10 +658,10 @@ TEST(RocprimDeviceSelectTests, UniqueGuardedOperator) F * d_flag; U * d_output; unsigned int * d_selected_count_output; - HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T))); - HIP_CHECK(hipMalloc(&d_flag, input_flag.size() * sizeof(F))); - HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(U))); - HIP_CHECK(hipMalloc(&d_selected_count_output, sizeof(unsigned int))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), input.size() * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_flag), input_flag.size() * sizeof(F))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), input.size() * sizeof(U))); + HIP_CHECK(hipMalloc(reinterpret_cast(&d_selected_count_output), sizeof(unsigned int))); HIP_CHECK( hipMemcpy( d_input, input.data(), diff --git a/test/rocprim/test_device_transform.cpp b/test/rocprim/test_device_transform.cpp index 34f86c3f4..a91c69087 100644 --- a/test/rocprim/test_device_transform.cpp +++ b/test/rocprim/test_device_transform.cpp @@ -141,7 +141,7 @@ TYPED_TEST(RocprimDeviceTransformTests, Transform) // Generate data std::vector input = 
test_utils::get_random_data(size, 1, 100, seed_value); - std::vector output(input.size(), 0); + std::vector output(input.size(), (U)0); T * d_input; U * d_output; @@ -247,7 +247,7 @@ TYPED_TEST(RocprimDeviceTransformTests, BinaryTransform) // Generate data std::vector input1 = test_utils::get_random_data(size, 1, 100, seed_value); std::vector input2 = test_utils::get_random_data(size, 1, 100, seed_value); - std::vector output(input1.size(), 0); + std::vector output(input1.size(), (U)0); T1 * d_input1; T2 * d_input2; diff --git a/test/rocprim/test_intrinsics.cpp b/test/rocprim/test_intrinsics.cpp index a33e2b681..4d5391231 100644 --- a/test/rocprim/test_intrinsics.cpp +++ b/test/rocprim/test_intrinsics.cpp @@ -52,7 +52,7 @@ inline bool operator==(const custom_notaligned& lhs, } // Custom structure aligned to 16 bytes -struct custom_16aligned +struct alignas(16) custom_16aligned { int i; unsigned int u; @@ -62,7 +62,7 @@ struct custom_16aligned custom_16aligned() {}; ROCPRIM_HOST_DEVICE ~custom_16aligned() {}; -} __attribute__((aligned(16))); +}; inline ROCPRIM_HOST_DEVICE bool operator==(const custom_16aligned& lhs, const custom_16aligned& rhs) @@ -98,7 +98,7 @@ __global__ __launch_bounds__(ROCPRIM_DEFAULT_MAX_BLOCK_SIZE) void shuffle_up_kernel(T* data, unsigned int delta, unsigned int width) { - const unsigned int index = (hipBlockIdx_x * hipBlockDim_x) + hipThreadIdx_x; + const unsigned int index = (blockIdx.x * blockDim.x) + threadIdx.x; T value = data[index]; value = rocprim::warp_shuffle_up(value, delta, width); data[index] = value; @@ -201,7 +201,7 @@ __global__ __launch_bounds__(ROCPRIM_DEFAULT_MAX_BLOCK_SIZE) void shuffle_down_kernel(T* data, unsigned int delta, unsigned int width) { - const unsigned int index = (hipBlockIdx_x * hipBlockDim_x) + hipThreadIdx_x; + const unsigned int index = (blockIdx.x * blockDim.x) + threadIdx.x; T value = data[index]; value = rocprim::warp_shuffle_down(value, delta, width); data[index] = value; @@ -304,10 +304,10 @@ __global__ __launch_bounds__(ROCPRIM_DEFAULT_MAX_BLOCK_SIZE) void shuffle_index_kernel(T* data, int* src_lanes, unsigned int width) { - const unsigned int index = (hipBlockIdx_x * hipBlockDim_x) + hipThreadIdx_x; + const unsigned int index = (blockIdx.x * blockDim.x) + threadIdx.x; T value = data[index]; value = rocprim::warp_shuffle( - value, src_lanes[hipThreadIdx_x/width], width + value, src_lanes[threadIdx.x/width], width ); data[index] = value; } @@ -436,10 +436,10 @@ TEST(RocprimIntrinsicsTests, ShuffleUpCustomStruct) std::vector output(input.size()); for(size_t i = 0; i < 4 * input.size(); i+=4) { - input[i/4].i = random_data[i]; + input[i/4].i = (short)random_data[i]; input[i/4].d = random_data[i+1]; - input[i/4].f = random_data[i+2]; - input[i/4].u = random_data[i+3]; + input[i/4].f = (float)random_data[i+2]; + input[i/4].u = (unsigned int)random_data[i+3]; } T* device_data; @@ -536,9 +536,9 @@ TEST(RocprimIntrinsicsTests, ShuffleUpCustomAlignedStruct) std::vector output(input.size()); for(size_t i = 0; i < 3 * input.size(); i+=3) { - input[i/3].i = random_data[i]; - input[i/3].u = random_data[i+1]; - input[i/3].f = random_data[i+2]; + input[i/3].i = (int)random_data[i]; + input[i/3].u = (unsigned int)random_data[i+1]; + input[i/3].f = (float)random_data[i+2]; } T* device_data; diff --git a/test/rocprim/test_seed.hpp b/test/rocprim/test_seed.hpp index c968d94e1..f1eeb10d5 100644 --- a/test/rocprim/test_seed.hpp +++ b/test/rocprim/test_seed.hpp @@ -21,8 +21,11 @@ #ifndef TEST_SEED_HPP_ #define TEST_SEED_HPP_ -static constexpr 
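// Illustrative sketch, not from the patch itself: the test_intrinsics hunk above swaps
// the GCC-specific __attribute__((aligned(16))) for standard alignas(16), which MSVC
// also accepts; the alignment requirement itself is unchanged and can be checked
// statically, e.g.:
struct alignas(16) aligned_like_custom_16aligned
{
    int i;
    unsigned int u;
    float f;
};
static_assert(alignof(aligned_like_custom_16aligned) == 16, "expected 16-byte alignment");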
int random_seeds_count = 1; -static constexpr unsigned int seeds [] = {0, 1997132004}; +using engine_type = std::default_random_engine; +using seed_type = typename engine_type::result_type; + +static constexpr size_t random_seeds_count = 1; +static constexpr seed_type seeds [] = {0, 1997132004}; static constexpr size_t seed_size = sizeof(seeds) / sizeof(seeds[0]); #endif // TEST_SEED_HPP_ diff --git a/test/rocprim/test_sort_comparator.hpp b/test/rocprim/test_sort_comparator.hpp new file mode 100644 index 000000000..118faa7e1 --- /dev/null +++ b/test/rocprim/test_sort_comparator.hpp @@ -0,0 +1,177 @@ +// MIT License +// +// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#ifndef TEST_SORT_COMPARATOR_HPP_ +#define TEST_SORT_COMPARATOR_HPP_ + +#include + +// Original C++17 logic +// +//template +//struct key_comparator +//{ +// bool operator()(const Key& lhs, const Key& rhs) +// { +// if constexpr (rocprim::is_unsigned::value) +// { +// if constexpr (StartBit == 0 && (EndBit == sizeof(Key) * 8)) +// { +// return Descending ? (rhs < lhs) : (lhs < rhs); +// } +// else +// { +// auto mask = (1ull << (EndBit - StartBit)) - 1; +// auto l = (static_cast(lhs) >> StartBit) & mask; +// auto r = (static_cast(rhs) >> StartBit) & mask; +// return Descending ? (r < l) : (l < r); +// } +// } +// else +// { +// if constexpr (std::is_same_v) +// { +// float l = static_cast(lhs); +// float r = static_cast(rhs); +// return Descending ? (r < l) : (l < r); +// } +// else +// { +// return Descending ? (rhs < lhs) : (lhs < rhs); +// } +// } +// } +//}; + +// Faulty C++14 backported logic (consider fixing) +// +//template +//bool generic_key_compare(const Key& lhs, const Key& rhs) { return Descending ? (rhs < lhs) : (lhs < rhs); } +// +//template +//auto discriminate_bits(const Key& lhs, const Key& rhs) -> typename std::enable_if::type +//{ +// // TODO: pick adequately sized integral type (instead of 1ull) based on Key. 
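// Illustrative sketch, not from the patch itself: with the test_seed.hpp change above, a
// seed is now an engine_type::result_type and seeds the engine directly (no
// std::random_device involved); the per-test seed loop presumably consumes it like this:
#include <cstddef>
#include <cstdlib>
#include <random>

using engine_type = std::default_random_engine;
using seed_type = engine_type::result_type;

inline void for_each_seed()
{
    static constexpr std::size_t random_seeds_count = 1;
    static constexpr seed_type seeds[] = {0, 1997132004};
    static constexpr std::size_t seed_size = sizeof(seeds) / sizeof(seeds[0]);

    for(std::size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++)
    {
        // The first random_seeds_count iterations use rand(), the rest use the fixed seeds.
        const seed_type seed_value = seed_index < random_seeds_count
            ? static_cast<seed_type>(rand())
            : seeds[seed_index - random_seeds_count];
        engine_type gen{seed_value};
        (void)gen; // generate the test data from gen here
    }
}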
+// // Needed to safely silence "'argument': conversion from 'unsigned __int64' to 'const Key', possible loss of data" +// auto mask = (1ull << (EndBit - StartBit)) - 1; +// auto l = (static_cast(lhs) >> StartBit) & mask; +// auto r = (static_cast(rhs) >> StartBit) & mask; +// return generic_key_compare(l, r); +//} +// +//template +//auto discriminate_bits(const Key& lhs, const Key& rhs) -> typename std::enable_if::type +//{ +// return generic_key_compare(lhs, rhs); +//} +// +//template +//auto discriminate_half(const Key& lhs, const Key& rhs) -> typename std::enable_if::value, bool>::type +//{ +// // HIP's half doesn't have __host__ comparison operators, use floats instead +// return generic_key_compare((float)lhs, (float)rhs); +//} +// +//template +//auto discriminate_half(const Key& lhs, const Key& rhs) -> typename std::enable_if::value, bool>::type +//{ +// return generic_key_compare(lhs, rhs); +//} +// +//template +//auto discriminate_unsigned(const Key& lhs, const Key& rhs) -> typename std::enable_if::value, bool>::type +//{ +// return discriminate_bits(lhs, rhs); +//} +// +//template +//auto discriminate_unsigned(const Key& lhs, const Key& rhs) -> typename std::enable_if::value, bool>::type +//{ +// return discriminate_half(lhs, rhs); +//} +// +//template +//struct key_comparator +//{ +// bool operator()(const Key& lhs, const Key& rhs) +// { +// return discriminate_unsigned(lhs, rhs); +// } +//}; +// +//template +//struct key_value_comparator +//{ +// bool operator()(const std::pair& lhs, const std::pair& rhs) +// { +// return key_comparator()(lhs.first, rhs.first); +// } +//}; + +// Original code with ISO-conforming overload control +// +// NOTE: ShiftLess helper is needed, because partial specializations cannot refer to the free template args. +// See: https://stackoverflow.com/questions/2615905/c-template-nontype-parameter-arithmetic + +template +struct key_comparator +{ + static_assert(rocprim::is_unsigned::value, "Test supports start and end bits only for unsigned integers"); + + bool operator()(const Key& lhs, const Key& rhs) + { + auto mask = (1ull << (EndBit - StartBit)) - 1; + auto l = (static_cast(lhs) >> StartBit) & mask; + auto r = (static_cast(rhs) >> StartBit) & mask; + return Descending ? (r < l) : (l < r); + } +}; + +template +struct key_comparator +{ + bool operator()(const Key& lhs, const Key& rhs) + { + return Descending ? 
(rhs < lhs) : (lhs < rhs); + } +}; + +template +struct key_comparator +{ + bool operator()(const rocprim::half& lhs, const rocprim::half& rhs) + { + // HIP's half doesn't have __host__ comparison operators, use floats instead + return key_comparator()(lhs, rhs); + } +}; + +template +struct key_value_comparator +{ + bool operator()(const std::pair& lhs, const std::pair& rhs) + { + return key_comparator()(lhs.first, rhs.first); + } +}; + +#endif // TEST_SORT_COMPARATOR_HPP_ diff --git a/test/rocprim/test_texture_cache_iterator.cpp b/test/rocprim/test_texture_cache_iterator.cpp index 0fa927b68..3613eb0bc 100644 --- a/test/rocprim/test_texture_cache_iterator.cpp +++ b/test/rocprim/test_texture_cache_iterator.cpp @@ -89,7 +89,7 @@ TYPED_TEST(RocprimTextureCacheIteratorTests, Transform) for(size_t i = 0; i < size; i++) { - input[i] = T(test_utils::get_random_value(1, 200, seed_value)); + input[i] = test_utils::get_random_value(1, 200, seed_value); } std::vector output(size); diff --git a/test/rocprim/test_thread.cpp b/test/rocprim/test_thread.cpp index b32504cd1..72dd5b557 100644 --- a/test/rocprim/test_thread.cpp +++ b/test/rocprim/test_thread.cpp @@ -86,10 +86,10 @@ void flat_id_kernel(unsigned int* device_output) TYPED_TEST(RocprimThreadTests, FlatBlockThreadID) { using Type = unsigned int; - constexpr size_t block_size_x = TestFixture::params::block_size_x; - constexpr size_t block_size_y = TestFixture::params::block_size_y; - constexpr size_t block_size_z = TestFixture::params::block_size_z; - constexpr size_t block_size = block_size_x * block_size_y * block_size_z; + static constexpr size_t block_size_x = TestFixture::params::block_size_x; + static constexpr size_t block_size_y = TestFixture::params::block_size_y; + static constexpr size_t block_size_z = TestFixture::params::block_size_z; + static constexpr size_t block_size = block_size_x * block_size_y * block_size_z; // Given block size not supported if(block_size > test_utils::get_max_block_size() || (block_size & (block_size - 1)) != 0) { @@ -151,7 +151,7 @@ __launch_bounds__(1024) void block_id_kernel(unsigned int* device_output) { unsigned int block_id = rocprim::flat_block_id(); - if(hipThreadIdx_x) + if(threadIdx.x) { device_output[block_id] = block_id; } @@ -160,10 +160,10 @@ void block_id_kernel(unsigned int* device_output) TYPED_TEST(RocprimThreadTests, FlatBlockID) { using Type = unsigned int; - constexpr size_t block_size_x = TestFixture::params::block_size_x; - constexpr size_t block_size_y = TestFixture::params::block_size_y; - constexpr size_t block_size_z = TestFixture::params::block_size_z; - constexpr size_t block_size = block_size_x * block_size_y * block_size_z; + static constexpr size_t block_size_x = TestFixture::params::block_size_x; + static constexpr size_t block_size_y = TestFixture::params::block_size_y; + static constexpr size_t block_size_z = TestFixture::params::block_size_z; + static constexpr size_t block_size = block_size_x * block_size_y * block_size_z; const size_t size = block_size * block_size; const auto grid_size = size / block_size; diff --git a/test/rocprim/test_thread_algos.cpp b/test/rocprim/test_thread_algos.cpp index edcfe647d..48b14109a 100644 --- a/test/rocprim/test_thread_algos.cpp +++ b/test/rocprim/test_thread_algos.cpp @@ -65,16 +65,16 @@ template __global__ void thread_load_kernel(Type* volatile const device_input, Type* device_output) { - size_t index = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + size_t index = blockIdx.x * blockDim.x + threadIdx.x; device_output[index] = 
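// Illustrative sketch, not from the patch itself: the comparator extracted into
// test_sort_comparator.hpp is used by the radix sort tests to build host-side reference
// results; the template parameter order shown here (Key, Descending, StartBit, EndBit)
// is an assumption reconstructed from the bit-masking logic, since the angle brackets
// are not visible in this rendering.
#include <algorithm>
#include <cstdint>
#include <vector>
// #include "test_sort_comparator.hpp" // provides key_comparator

inline void sort_expected_keys(std::vector<std::uint32_t>& expected)
{
    // Ascending order, comparing only bits [4, 20) of each 32-bit key.
    std::stable_sort(expected.begin(), expected.end(),
                     key_comparator<std::uint32_t, false, 4, 20>{});
}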
rocprim::thread_load(device_input + index); } TYPED_TEST(RocprimThreadOperationTests, Load) { using T = typename TestFixture::type; - constexpr uint32_t block_size = 256; - constexpr uint32_t grid_size = 128; - constexpr uint32_t size = block_size * grid_size; + static constexpr uint32_t block_size = 256; + static constexpr uint32_t grid_size = 128; + static constexpr uint32_t size = block_size * grid_size; for (size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) { @@ -90,9 +90,9 @@ TYPED_TEST(RocprimThreadOperationTests, Load) // Preparing device T* device_input; - HIP_CHECK(hipMalloc(&device_input, input.size() * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&device_input), input.size() * sizeof(T))); T* device_output; - HIP_CHECK(hipMalloc(&device_output, output.size() * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&device_output), output.size() * sizeof(T))); HIP_CHECK( hipMemcpy( @@ -102,7 +102,11 @@ TYPED_TEST(RocprimThreadOperationTests, Load) ) ); - thread_load_kernel<<>>(device_input, device_output); + hipLaunchKernelGGL( + HIP_KERNEL_NAME(thread_load_kernel), + grid_size, block_size, 0, 0, + device_input, device_output + ); // Reading results back HIP_CHECK( @@ -128,16 +132,16 @@ template __global__ void thread_store_kernel(Type* const device_input, Type* device_output) { - size_t index = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + size_t index = blockIdx.x * blockDim.x + threadIdx.x; rocprim::thread_store(device_output + index, device_input[index]); } TYPED_TEST(RocprimThreadOperationTests, Store) { using T = typename TestFixture::type; - constexpr uint32_t block_size = 256; - constexpr uint32_t grid_size = 128; - constexpr uint32_t size = block_size * grid_size; + static constexpr uint32_t block_size = 256; + static constexpr uint32_t grid_size = 128; + static constexpr uint32_t size = block_size * grid_size; for (size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) { @@ -153,9 +157,9 @@ TYPED_TEST(RocprimThreadOperationTests, Store) // Preparing device T* device_input; - HIP_CHECK(hipMalloc(&device_input, input.size() * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&device_input), input.size() * sizeof(T))); T* device_output; - HIP_CHECK(hipMalloc(&device_output, output.size() * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&device_output), output.size() * sizeof(T))); HIP_CHECK( hipMemcpy( @@ -165,7 +169,11 @@ TYPED_TEST(RocprimThreadOperationTests, Store) ) ); - thread_store_kernel<<>>(device_input, device_output); + hipLaunchKernelGGL( + HIP_KERNEL_NAME(thread_store_kernel), + grid_size, block_size, 0, 0, + device_input, device_output + ); // Reading results back HIP_CHECK( @@ -201,18 +209,18 @@ template __global__ void thread_reduce_kernel(Type* const device_input, Type* device_output) { - size_t input_index = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * Length; - size_t output_index = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * Length; + size_t input_index = (blockIdx.x * blockDim.x + threadIdx.x) * Length; + size_t output_index = (blockIdx.x * blockDim.x + threadIdx.x) * Length; device_output[output_index] = rocprim::thread_reduce(&device_input[input_index], sum_op()); } TYPED_TEST(RocprimThreadOperationTests, Reduction) { using T = typename TestFixture::type; - constexpr uint32_t length = 4; - constexpr uint32_t block_size = 128 / length; - constexpr uint32_t grid_size = 128; - constexpr uint32_t size = block_size * grid_size * length; + static constexpr uint32_t 
length = 4; + static constexpr uint32_t block_size = 128 / length; + static constexpr uint32_t grid_size = 128; + static constexpr uint32_t size = block_size * grid_size * length; sum_op operation; for (size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) @@ -243,9 +251,9 @@ TYPED_TEST(RocprimThreadOperationTests, Reduction) // Preparing device T* device_input; - HIP_CHECK(hipMalloc(&device_input, input.size() * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&device_input), input.size() * sizeof(T))); T* device_output; - HIP_CHECK(hipMalloc(&device_output, output.size() * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&device_output), output.size() * sizeof(T))); HIP_CHECK( hipMemcpy( @@ -255,7 +263,11 @@ TYPED_TEST(RocprimThreadOperationTests, Reduction) ) ); - thread_reduce_kernel<<>>(device_input, device_output); + hipLaunchKernelGGL( + HIP_KERNEL_NAME(thread_reduce_kernel), + grid_size, block_size, 0, 0, + device_input, device_output + ); // Reading results back HIP_CHECK( @@ -282,8 +294,8 @@ template __global__ void thread_scan_kernel(Type* const device_input, Type* device_output) { - size_t input_index = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * Length; - size_t output_index = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * Length; + size_t input_index = (blockIdx.x * blockDim.x + threadIdx.x) * Length; + size_t output_index = (blockIdx.x * blockDim.x + threadIdx.x) * Length; rocprim::thread_scan_inclusive(&device_input[input_index], &device_output[output_index], @@ -293,10 +305,10 @@ void thread_scan_kernel(Type* const device_input, Type* device_output) TYPED_TEST(RocprimThreadOperationTests, Scan) { using T = typename TestFixture::type; - constexpr uint32_t length = 4; - constexpr uint32_t block_size = 128 / length; - constexpr uint32_t grid_size = 128; - constexpr uint32_t size = block_size * grid_size * length; + static constexpr uint32_t length = 4; + static constexpr uint32_t block_size = 128 / length; + static constexpr uint32_t grid_size = 128; + static constexpr uint32_t size = block_size * grid_size * length; sum_op operation; for (size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) @@ -327,9 +339,9 @@ TYPED_TEST(RocprimThreadOperationTests, Scan) // Preparing device T* device_input; - HIP_CHECK(hipMalloc(&device_input, input.size() * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&device_input), input.size() * sizeof(T))); T* device_output; - HIP_CHECK(hipMalloc(&device_output, output.size() * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&device_output), output.size() * sizeof(T))); HIP_CHECK( hipMemcpy( @@ -339,7 +351,11 @@ TYPED_TEST(RocprimThreadOperationTests, Scan) ) ); - thread_scan_kernel<<>>(device_input, device_output); + hipLaunchKernelGGL( + HIP_KERNEL_NAME(thread_scan_kernel), + grid_size, block_size, 0, 0, + device_input, device_output + ); // Reading results back HIP_CHECK( diff --git a/test/rocprim/test_utils.hpp b/test/rocprim/test_utils.hpp index 702ffc71e..a0e2f76ff 100644 --- a/test/rocprim/test_utils.hpp +++ b/test/rocprim/test_utils.hpp @@ -61,13 +61,13 @@ struct precision_threshold // Support half operators on host side ROCPRIM_HOST inline -_Float16 half_to_native(const rocprim::half& x) +rocprim::native_half half_to_native(const rocprim::half& x) { - return *reinterpret_cast(&x); + return *reinterpret_cast(&x); } ROCPRIM_HOST inline -rocprim::half native_to_half(const _Float16& x) +rocprim::half native_to_half(const rocprim::native_half& x) { return 
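// Illustrative sketch, not from the patch itself: the thread tests above move from the
// triple-chevron launch syntax to hipLaunchKernelGGL, presumably because the macro form
// also works when the tests are built by a plain host compiler against the HIP-CPU
// runtime; HIP_KERNEL_NAME protects the template commas from the preprocessor. The
// <float> instantiation is illustrative only; thread_load_kernel is the kernel defined
// in test_thread_algos.cpp above.
#include <hip/hip_runtime.h>

inline void launch_thread_load(float* device_input, float* device_output,
                               unsigned int grid_size, unsigned int block_size)
{
    hipLaunchKernelGGL(
        HIP_KERNEL_NAME(thread_load_kernel<float>),
        dim3(grid_size), dim3(block_size),
        0 /* dynamic shared memory */, 0 /* default stream */,
        device_input, device_output);
}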
*reinterpret_cast(&x); } @@ -215,23 +215,43 @@ struct half_minimum } }; -template -inline auto get_random_data(size_t size, T min, T max, int seed_value) +// std::uniform_int_distribution is undefined for anything other than listed +// https://en.cppreference.com/w/cpp/numeric/random/uniform_int_distribution +template +struct is_valid_for_int_distribution : + std::integral_constant::value || + std::is_same::value || + std::is_same::value || + std::is_same::value || + std::is_same::value || + std::is_same::value || + std::is_same::value || + std::is_same::value + > {}; + +template +inline auto get_random_data(size_t size, U min, V max, seed_type seed_value) -> typename std::enable_if::value, std::vector>::type { - std::random_device rd; - std::default_random_engine gen(rd()); - gen.seed(seed_value); - std::uniform_int_distribution distribution(min, max); + engine_type gen{seed_value}; + using dis_type = typename std::conditional< + is_valid_for_int_distribution::value, + T, + typename std::conditional::value, + int, + unsigned int>::type + >::type; + std::uniform_int_distribution distribution((dis_type)min, (dis_type)max); std::vector data(size); - uint32_t segment_size = size / random_data_generation_segments; + size_t segment_size = size / random_data_generation_segments; if(segment_size != 0) { for(uint32_t segment_index = 0; segment_index < random_data_generation_segments; segment_index++) { if(segment_index % random_data_generation_repeat_strides == 0) { - T repeated_value = distribution(gen); + T repeated_value = static_cast(distribution(gen)); std::fill( data.begin() + segment_size * segment_index, data.begin() + segment_size * (segment_index + 1), @@ -243,36 +263,34 @@ inline auto get_random_data(size_t size, T min, T max, int seed_value) std::generate( data.begin() + segment_size * segment_index, data.begin() + segment_size * (segment_index + 1), - [&]() { return distribution(gen); }); + [&]() { return static_cast(distribution(gen)); }); } } } else { - std::generate(data.begin(), data.end(), [&]() { return distribution(gen); }); + std::generate(data.begin(), data.end(), [&]() { return static_cast(distribution(gen)); }); } return data; } -template -inline auto get_random_data(size_t size, T min, T max, int seed_value) +template +inline auto get_random_data(size_t size, U min, V max, seed_type seed_value) -> typename std::enable_if::value, std::vector>::type { - std::random_device rd; - std::default_random_engine gen(rd()); - gen.seed(seed_value); + engine_type gen{seed_value}; // Generate floats when T is half using dis_type = typename std::conditional::value, float, T>::type; - std::uniform_real_distribution distribution(min, max); + std::uniform_real_distribution distribution((dis_type)min, (dis_type)max); std::vector data(size); - uint32_t segment_size = size / random_data_generation_segments; + size_t segment_size = size / random_data_generation_segments; if(segment_size != 0) { for(uint32_t segment_index = 0; segment_index < random_data_generation_segments; segment_index++) { if(segment_index % random_data_generation_repeat_strides == 0) { - T repeated_value = distribution(gen); + T repeated_value = static_cast(distribution(gen)); std::fill( data.begin() + segment_size * segment_index, data.begin() + segment_size * (segment_index + 1), @@ -284,25 +302,23 @@ inline auto get_random_data(size_t size, T min, T max, int seed_value) std::generate( data.begin() + segment_size * segment_index, data.begin() + segment_size * (segment_index + 1), - [&]() { return distribution(gen); }); + 
[&]() { return static_cast(distribution(gen)); }); } } } else { - std::generate(data.begin(), data.end(), [&]() { return distribution(gen); }); + std::generate(data.begin(), data.end(), [&]() { return static_cast(distribution(gen)); }); } return data; } template -inline std::vector get_random_data01(size_t size, float p, int seed_value) +inline std::vector get_random_data01(size_t size, float p, seed_type seed_value) { const size_t max_random_size = 1024 * 1024; - std::random_device rd; - std::default_random_engine gen(rd()); - gen.seed(seed_value); + engine_type gen{seed_value}; std::bernoulli_distribution distribution(p); std::vector data(size); std::generate( @@ -316,11 +332,11 @@ inline std::vector get_random_data01(size_t size, float p, int seed_value) return data; } -template -inline auto get_random_value(T min, T max, int seed_value) +template +inline auto get_random_value(U min, V max, seed_type seed_value) -> typename std::enable_if::value, T>::type { - return get_random_data(random_data_generation_segments, min, max, seed_value)[0]; + return get_random_data(random_data_generation_segments, min, max, seed_value)[0]; } // Can't use std::prefix_sum for inclusive/exclusive scan, because @@ -772,18 +788,16 @@ struct numeric_limits : public std::conditional< }; template -inline auto get_random_data(size_t size, typename T::value_type min, typename T::value_type max, int seed_value) +inline auto get_random_data(size_t size, T min, T max, seed_type seed_value) -> typename std::enable_if< is_custom_test_type::value && std::is_integral::value, std::vector >::type { - std::random_device rd; - std::default_random_engine gen(rd()); - gen.seed(seed_value); - std::uniform_int_distribution distribution(min, max); + engine_type gen(seed_value); + std::uniform_int_distribution distribution(min.x, max.x); std::vector data(size); - uint32_t segment_size = size / random_data_generation_segments; + size_t segment_size = size / random_data_generation_segments; if(segment_size != 0) { for(uint32_t segment_index = 0; segment_index < random_data_generation_segments; segment_index++) @@ -814,18 +828,16 @@ inline auto get_random_data(size_t size, typename T::value_type min, typename T: } template -inline auto get_random_data(size_t size, typename T::value_type min, typename T::value_type max, int seed_value) +inline auto get_random_data(size_t size, T min, T max, seed_type seed_value) -> typename std::enable_if< is_custom_test_type::value && std::is_floating_point::value, std::vector >::type { - std::random_device rd; - std::default_random_engine gen(rd()); - gen.seed(seed_value); - std::uniform_real_distribution distribution(min, max); + engine_type gen(seed_value); + std::uniform_real_distribution distribution(min.x, max.x); std::vector data(size); - uint32_t segment_size = size / random_data_generation_segments; + size_t segment_size = size / random_data_generation_segments; if(segment_size != 0) { for(uint32_t segment_index = 0; segment_index < random_data_generation_segments; segment_index++) @@ -856,15 +868,13 @@ inline auto get_random_data(size_t size, typename T::value_type min, typename T: } template -inline auto get_random_data(size_t size, typename T::value_type min, typename T::value_type max, int seed_value) +inline auto get_random_data(size_t size, typename T::value_type min, typename T::value_type max, seed_type seed_value) -> typename std::enable_if< is_custom_test_array_type::value && std::is_integral::value, std::vector >::type { - std::random_device rd; - std::default_random_engine 
gen(rd()); - gen.seed(seed_value); + engine_type gen(seed_value); std::uniform_int_distribution distribution(min, max); std::vector data(size); std::generate( @@ -883,10 +893,10 @@ inline auto get_random_data(size_t size, typename T::value_type min, typename T: } template -inline auto get_random_value(typename T::value_type min, typename T::value_type max, int seed_value) +inline auto get_random_value(typename T::value_type min, typename T::value_type max, seed_type seed_value) -> typename std::enable_if::value || is_custom_test_array_type::value, T>::type { - return get_random_data(random_data_generation_segments, min, max, seed_value)[0]; + return get_random_data(random_data_generation_segments, min, max, seed_value)[0]; } template diff --git a/test/rocprim/test_warp_reduce.cpp b/test/rocprim/test_warp_reduce.cpp index 1f938ddb4..dc07a3bb1 100644 --- a/test/rocprim/test_warp_reduce.cpp +++ b/test/rocprim/test_warp_reduce.cpp @@ -45,9 +45,9 @@ __global__ __launch_bounds__(BlockSize) void warp_reduce_sum_kernel(T* device_input, T* device_output) { - constexpr unsigned int warps_no = BlockSize / LogicalWarpSize; + static constexpr unsigned int warps_no = BlockSize / LogicalWarpSize; const unsigned int warp_id = rocprim::detail::logical_warp_id(); - unsigned int index = hipThreadIdx_x + (hipBlockIdx_x * hipBlockDim_x); + unsigned int index = threadIdx.x + (blockIdx.x * blockDim.x); T value = device_input[index]; @@ -55,7 +55,7 @@ void warp_reduce_sum_kernel(T* device_input, T* device_output) __shared__ typename wreduce_t::storage_type storage[warps_no]; wreduce_t().reduce(value, value, storage[warp_id]); - if(hipThreadIdx_x%LogicalWarpSize == 0) + if(threadIdx.x%LogicalWarpSize == 0) { device_output[index/LogicalWarpSize] = value; } @@ -70,28 +70,28 @@ TYPED_TEST(RocprimWarpReduceTests, ReduceSum) // logical warp side for warp primitive, execution warp size is always rocprim::warp_size() using T = typename TestFixture::params::type; using binary_op_type = typename std::conditional::value, test_utils::half_plus, rocprim::plus>::type; - constexpr size_t logical_warp_size = TestFixture::params::warp_size; + static constexpr size_t logical_warp_size = TestFixture::params::warp_size; // The different warp sizes - constexpr size_t ws32 = size_t(ROCPRIM_WARP_SIZE_32); - constexpr size_t ws64 = size_t(ROCPRIM_WARP_SIZE_64); + static constexpr size_t ws32 = size_t(ROCPRIM_WARP_SIZE_32); + static constexpr size_t ws64 = size_t(ROCPRIM_WARP_SIZE_64); // Block size of warp size 32 - constexpr size_t block_size_ws32 = + static constexpr size_t block_size_ws32 = rocprim::detail::is_power_of_two(logical_warp_size) ? rocprim::max(ws32, logical_warp_size * 4) - : rocprim::max((ws32/logical_warp_size) * logical_warp_size, 1); + : rocprim::max((ws32/logical_warp_size), 1) * logical_warp_size; // Block size of warp size 64 - constexpr size_t block_size_ws64 = + static constexpr size_t block_size_ws64 = rocprim::detail::is_power_of_two(logical_warp_size) ? rocprim::max(ws64, logical_warp_size * 4) - : rocprim::max((ws64/logical_warp_size) * logical_warp_size, 1); + : rocprim::max((ws64/logical_warp_size), 1) * logical_warp_size; const unsigned int current_device_warp_size = rocprim::host_warp_size(); const size_t block_size = current_device_warp_size == ws32 ? 
block_size_ws32 : block_size_ws64; - constexpr unsigned int grid_size = 4; + static constexpr unsigned int grid_size = 4; const size_t size = block_size * grid_size; // Check if warp size is supported @@ -109,15 +109,15 @@ TYPED_TEST(RocprimWarpReduceTests, ReduceSum) SCOPED_TRACE(testing::Message() << "with seed= " << seed_value); // Generate data - std::vector input = test_utils::get_random_data(size, 2, 50, seed_value); // used for input - std::vector output(input.size() / logical_warp_size, 0); + std::vector input = test_utils::get_random_data(size, 2, 50, seed_value); + std::vector output(input.size() / logical_warp_size, (T)0); // Calculate expected results on host - std::vector expected(output.size(), 1); + std::vector expected(output.size(), (T)1); binary_op_type binary_op; for(size_t i = 0; i < output.size(); i++) { - T value = 0; + T value = (T)0; for(size_t j = 0; j < logical_warp_size; j++) { auto idx = i * logical_warp_size + j; @@ -186,9 +186,9 @@ __global__ __launch_bounds__(BlockSize) void warp_allreduce_sum_kernel(T* device_input, T* device_output) { - constexpr unsigned int warps_no = BlockSize / LogicalWarpSize; + static constexpr unsigned int warps_no = BlockSize / LogicalWarpSize; const unsigned int warp_id = rocprim::detail::logical_warp_id(); - unsigned int index = hipThreadIdx_x + (hipBlockIdx_x * hipBlockDim_x); + unsigned int index = threadIdx.x + (blockIdx.x * blockDim.x); T value = device_input[index]; @@ -208,28 +208,28 @@ TYPED_TEST(RocprimWarpReduceTests, AllReduceSum) // logical warp side for warp primitive, execution warp size is always rocprim::warp_size() using T = typename TestFixture::params::type; using binary_op_type = typename std::conditional::value, test_utils::half_plus, rocprim::plus>::type; - constexpr size_t logical_warp_size = TestFixture::params::warp_size; + static constexpr size_t logical_warp_size = TestFixture::params::warp_size; // The different warp sizes - constexpr size_t ws32 = size_t(ROCPRIM_WARP_SIZE_32); - constexpr size_t ws64 = size_t(ROCPRIM_WARP_SIZE_64); + static constexpr size_t ws32 = size_t(ROCPRIM_WARP_SIZE_32); + static constexpr size_t ws64 = size_t(ROCPRIM_WARP_SIZE_64); // Block size of warp size 32 - constexpr size_t block_size_ws32 = + static constexpr size_t block_size_ws32 = rocprim::detail::is_power_of_two(logical_warp_size) ? rocprim::max(ws32, logical_warp_size * 4) - : rocprim::max((ws32/logical_warp_size) * logical_warp_size, 1); + : rocprim::max((ws32/logical_warp_size), 1) * logical_warp_size; // Block size of warp size 64 - constexpr size_t block_size_ws64 = + static constexpr size_t block_size_ws64 = rocprim::detail::is_power_of_two(logical_warp_size) ? rocprim::max(ws64, logical_warp_size * 4) - : rocprim::max((ws64/logical_warp_size) * logical_warp_size, 1); + : rocprim::max((ws64/logical_warp_size), 1) * logical_warp_size; const unsigned int current_device_warp_size = rocprim::host_warp_size(); const size_t block_size = current_device_warp_size == ws32 ? 
block_size_ws32 : block_size_ws64; - constexpr unsigned int grid_size = 4; + static constexpr unsigned int grid_size = 4; const size_t size = block_size * grid_size; // Check if warp size is supported @@ -247,15 +247,15 @@ TYPED_TEST(RocprimWarpReduceTests, AllReduceSum) SCOPED_TRACE(testing::Message() << "with seed= " << seed_value); // Generate data - std::vector input = test_utils::get_random_data(size, 2, 50, seed_value); // used for input - std::vector output(input.size(), 0); + std::vector input = test_utils::get_random_data(size, 2, 50, seed_value); + std::vector output(input.size(), (T)0); // Calculate expected results on host - std::vector expected(output.size(), 0); + std::vector expected(output.size(), (T)0); binary_op_type binary_op; for(size_t i = 0; i < output.size() / logical_warp_size; i++) { - T value = 0; + T value = (T)0; for(size_t j = 0; j < logical_warp_size; j++) { auto idx = i * logical_warp_size + j; @@ -328,9 +328,9 @@ __global__ __launch_bounds__(BlockSize) void warp_reduce_sum_kernel(T* device_input, T* device_output, size_t valid) { - constexpr unsigned int warps_no = BlockSize / LogicalWarpSize; + static constexpr unsigned int warps_no = BlockSize / LogicalWarpSize; const unsigned int warp_id = rocprim::detail::logical_warp_id(); - unsigned int index = hipThreadIdx_x + (hipBlockIdx_x * hipBlockDim_x); + unsigned int index = threadIdx.x + (blockIdx.x * blockDim.x); T value = device_input[index]; @@ -338,7 +338,7 @@ void warp_reduce_sum_kernel(T* device_input, T* device_output, size_t valid) __shared__ typename wreduce_t::storage_type storage[warps_no]; wreduce_t().reduce(value, value, valid, storage[warp_id]); - if(hipThreadIdx_x%LogicalWarpSize == 0) + if(threadIdx.x%LogicalWarpSize == 0) { device_output[index/LogicalWarpSize] = value; } @@ -353,28 +353,28 @@ TYPED_TEST(RocprimWarpReduceTests, ReduceSumValid) // logical warp side for warp primitive, execution warp size is always rocprim::warp_size() using T = typename TestFixture::params::type; using binary_op_type = typename std::conditional::value, test_utils::half_plus, rocprim::plus>::type; - constexpr size_t logical_warp_size = TestFixture::params::warp_size; + static constexpr size_t logical_warp_size = TestFixture::params::warp_size; // The different warp sizes - constexpr size_t ws32 = size_t(ROCPRIM_WARP_SIZE_32); - constexpr size_t ws64 = size_t(ROCPRIM_WARP_SIZE_64); + static constexpr size_t ws32 = size_t(ROCPRIM_WARP_SIZE_32); + static constexpr size_t ws64 = size_t(ROCPRIM_WARP_SIZE_64); // Block size of warp size 32 - constexpr size_t block_size_ws32 = + static constexpr size_t block_size_ws32 = rocprim::detail::is_power_of_two(logical_warp_size) ? rocprim::max(ws32, logical_warp_size * 4) - : rocprim::max((ws32/logical_warp_size) * logical_warp_size, 1); + : rocprim::max((ws32/logical_warp_size), 1) * logical_warp_size; // Block size of warp size 64 - constexpr size_t block_size_ws64 = + static constexpr size_t block_size_ws64 = rocprim::detail::is_power_of_two(logical_warp_size) ? rocprim::max(ws64, logical_warp_size * 4) - : rocprim::max((ws64/logical_warp_size) * logical_warp_size, 1); + : rocprim::max((ws64/logical_warp_size), 1) * logical_warp_size; const unsigned int current_device_warp_size = rocprim::host_warp_size(); const size_t block_size = current_device_warp_size == ws32 ? 
block_size_ws32 : block_size_ws64; - constexpr unsigned int grid_size = 4; + static constexpr unsigned int grid_size = 4; const size_t size = block_size * grid_size; const size_t valid = logical_warp_size - 1; @@ -393,15 +393,15 @@ TYPED_TEST(RocprimWarpReduceTests, ReduceSumValid) SCOPED_TRACE(testing::Message() << "with seed= " << seed_value); // Generate data - std::vector input = test_utils::get_random_data(size, 2, 50, seed_value); // used for input - std::vector output(input.size() / logical_warp_size, 0); + std::vector input = test_utils::get_random_data(size, 2, 50, seed_value); + std::vector output(input.size() / logical_warp_size, (T)0); // Calculate expected results on host - std::vector expected(output.size(), 1); + std::vector expected(output.size(), (T)1); binary_op_type binary_op; for(size_t i = 0; i < output.size(); i++) { - T value = 0; + T value = (T)0; for(size_t j = 0; j < valid; j++) { auto idx = i * logical_warp_size + j; @@ -472,7 +472,7 @@ void warp_allreduce_sum_kernel(T* device_input, T* device_output, size_t valid) { constexpr unsigned int warps_no = BlockSize / LogicalWarpSize; const unsigned int warp_id = rocprim::detail::logical_warp_id(); - unsigned int index = hipThreadIdx_x + (hipBlockIdx_x * hipBlockDim_x); + unsigned int index = threadIdx.x + (blockIdx.x * blockDim.x); T value = device_input[index]; @@ -492,28 +492,28 @@ TYPED_TEST(RocprimWarpReduceTests, AllReduceSumValid) // logical warp side for warp primitive, execution warp size is always rocprim::warp_size() using T = typename TestFixture::params::type; using binary_op_type = typename std::conditional::value, test_utils::half_plus, rocprim::plus>::type; - constexpr size_t logical_warp_size = TestFixture::params::warp_size; + static constexpr size_t logical_warp_size = TestFixture::params::warp_size; // The different warp sizes - constexpr size_t ws32 = size_t(ROCPRIM_WARP_SIZE_32); - constexpr size_t ws64 = size_t(ROCPRIM_WARP_SIZE_64); + static constexpr size_t ws32 = size_t(ROCPRIM_WARP_SIZE_32); + static constexpr size_t ws64 = size_t(ROCPRIM_WARP_SIZE_64); // Block size of warp size 32 - constexpr size_t block_size_ws32 = + static constexpr size_t block_size_ws32 = rocprim::detail::is_power_of_two(logical_warp_size) ? rocprim::max(ws32, logical_warp_size * 4) - : rocprim::max((ws32/logical_warp_size) * logical_warp_size, 1); + : rocprim::max((ws32/logical_warp_size), 1) * logical_warp_size; // Block size of warp size 64 - constexpr size_t block_size_ws64 = + static constexpr size_t block_size_ws64 = rocprim::detail::is_power_of_two(logical_warp_size) ? rocprim::max(ws64, logical_warp_size * 4) - : rocprim::max((ws64/logical_warp_size) * logical_warp_size, 1); + : rocprim::max((ws64/logical_warp_size), 1) * logical_warp_size; const unsigned int current_device_warp_size = rocprim::host_warp_size(); const size_t block_size = current_device_warp_size == ws32 ? 
block_size_ws32 : block_size_ws64; - constexpr unsigned int grid_size = 4; + static constexpr unsigned int grid_size = 4; const size_t size = block_size * grid_size; const size_t valid = logical_warp_size - 1; @@ -532,15 +532,15 @@ TYPED_TEST(RocprimWarpReduceTests, AllReduceSumValid) SCOPED_TRACE(testing::Message() << "with seed= " << seed_value); // Generate data - std::vector input = test_utils::get_random_data(size, 2, 50, seed_value); // used for input - std::vector output(input.size(), 0); + std::vector input = test_utils::get_random_data(size, 2, 50, seed_value); + std::vector output(input.size(), (T)0); // Calculate expected results on host - std::vector expected(output.size(), 0); + std::vector expected(output.size(), (T)0); binary_op_type binary_op; for(size_t i = 0; i < output.size() / logical_warp_size; i++) { - T value = 0; + T value = (T)0; for(size_t j = 0; j < valid; j++) { auto idx = i * logical_warp_size + j; @@ -614,28 +614,28 @@ TYPED_TEST(RocprimWarpReduceTests, ReduceSumCustomStruct) using T = test_utils::custom_test_type; // logical warp side for warp primitive, execution warp size is always rocprim::warp_size() - constexpr size_t logical_warp_size = TestFixture::params::warp_size; + static constexpr size_t logical_warp_size = TestFixture::params::warp_size; // The different warp sizes - constexpr size_t ws32 = size_t(ROCPRIM_WARP_SIZE_32); - constexpr size_t ws64 = size_t(ROCPRIM_WARP_SIZE_64); + static constexpr size_t ws32 = size_t(ROCPRIM_WARP_SIZE_32); + static constexpr size_t ws64 = size_t(ROCPRIM_WARP_SIZE_64); // Block size of warp size 32 - constexpr size_t block_size_ws32 = + static constexpr size_t block_size_ws32 = rocprim::detail::is_power_of_two(logical_warp_size) ? rocprim::max(ws32, logical_warp_size * 4) - : rocprim::max((ws32/logical_warp_size) * logical_warp_size, 1); + : rocprim::max((ws32/logical_warp_size), 1) * logical_warp_size; // Block size of warp size 64 - constexpr size_t block_size_ws64 = + static constexpr size_t block_size_ws64 = rocprim::detail::is_power_of_two(logical_warp_size) ? rocprim::max(ws64, logical_warp_size * 4) - : rocprim::max((ws64/logical_warp_size) * logical_warp_size, 1); + : rocprim::max((ws64/logical_warp_size), 1) * logical_warp_size; const unsigned int current_device_warp_size = rocprim::host_warp_size(); const size_t block_size = current_device_warp_size == ws32 ? 
block_size_ws32 : block_size_ws64; - constexpr unsigned int grid_size = 4; + static constexpr unsigned int grid_size = 4; const size_t size = block_size * grid_size; // Check if warp size is supported @@ -669,7 +669,7 @@ TYPED_TEST(RocprimWarpReduceTests, ReduceSumCustomStruct) std::vector expected(output.size()); for(size_t i = 0; i < output.size(); i++) { - T value(0, 0); + T value{(base_type)0, (base_type)0}; for(size_t j = 0; j < logical_warp_size; j++) { auto idx = i * logical_warp_size + j; @@ -741,7 +741,7 @@ void head_segmented_warp_reduce_kernel(T* input, Flag* flags, T* output) { constexpr unsigned int warps_no = BlockSize / LogicalWarpSize; const unsigned int warp_id = rocprim::detail::logical_warp_id(); - unsigned int index = hipThreadIdx_x + (hipBlockIdx_x * hipBlockDim_x); + unsigned int index = threadIdx.x + (blockIdx.x * blockDim.x); T value = input[index]; auto flag = flags[index]; @@ -763,28 +763,28 @@ TYPED_TEST(RocprimWarpReduceTests, HeadSegmentedReduceSum) using T = typename TestFixture::params::type; using binary_op_type = typename std::conditional::value, test_utils::half_plus, rocprim::plus>::type; using flag_type = unsigned char; - constexpr size_t logical_warp_size = TestFixture::params::warp_size; + static constexpr size_t logical_warp_size = TestFixture::params::warp_size; // The different warp sizes - constexpr size_t ws32 = size_t(ROCPRIM_WARP_SIZE_32); - constexpr size_t ws64 = size_t(ROCPRIM_WARP_SIZE_64); + static constexpr size_t ws32 = size_t(ROCPRIM_WARP_SIZE_32); + static constexpr size_t ws64 = size_t(ROCPRIM_WARP_SIZE_64); // Block size of warp size 32 - constexpr size_t block_size_ws32 = + static constexpr size_t block_size_ws32 = rocprim::detail::is_power_of_two(logical_warp_size) ? rocprim::max(ws32, logical_warp_size * 4) - : rocprim::max((ws32/logical_warp_size) * logical_warp_size, 1); + : rocprim::max((ws32/logical_warp_size), 1) * logical_warp_size; // Block size of warp size 64 - constexpr size_t block_size_ws64 = + static constexpr size_t block_size_ws64 = rocprim::detail::is_power_of_two(logical_warp_size) ? rocprim::max(ws64, logical_warp_size * 4) - : rocprim::max((ws64/logical_warp_size) * logical_warp_size, 1); + : rocprim::max((ws64/logical_warp_size), 1) * logical_warp_size; const unsigned int current_device_warp_size = rocprim::host_warp_size(); const size_t block_size = current_device_warp_size == ws32 ? 
block_size_ws32 : block_size_ws64; - constexpr unsigned int grid_size = 4; + static constexpr unsigned int grid_size = 4; const size_t size = block_size * grid_size; // Check if warp size is supported @@ -802,7 +802,7 @@ TYPED_TEST(RocprimWarpReduceTests, HeadSegmentedReduceSum) SCOPED_TRACE(testing::Message() << "with seed= " << seed_value); // Generate data - std::vector input = test_utils::get_random_data(size, 1, 10, seed_value); // used for input + std::vector input = test_utils::get_random_data(size, 1, 10, seed_value); std::vector flags = test_utils::get_random_data01(size, 0.25f, seed_value); for(size_t i = 0; i < flags.size(); i+= logical_warp_size) { @@ -883,8 +883,8 @@ TYPED_TEST(RocprimWarpReduceTests, HeadSegmentedReduceSum) ); HIP_CHECK(hipDeviceSynchronize()); - std::vector output_segment(output.size(), 0); - std::vector expected_segment(output.size(), 0); + std::vector output_segment(output.size(), (T)0); + std::vector expected_segment(output.size(), (T)0); for(size_t i = 0; i < output.size(); i++) { if(flags[i]) @@ -914,7 +914,7 @@ void tail_segmented_warp_reduce_kernel(T* input, Flag* flags, T* output) { constexpr unsigned int warps_no = BlockSize / LogicalWarpSize; const unsigned int warp_id = rocprim::detail::logical_warp_id(); - unsigned int index = hipThreadIdx_x + (hipBlockIdx_x * hipBlockDim_x); + unsigned int index = threadIdx.x + (blockIdx.x * blockDim.x); T value = input[index]; auto flag = flags[index]; @@ -936,28 +936,28 @@ TYPED_TEST(RocprimWarpReduceTests, TailSegmentedReduceSum) using T = typename TestFixture::params::type; using binary_op_type = typename std::conditional::value, test_utils::half_plus, rocprim::plus>::type; using flag_type = unsigned char; - constexpr size_t logical_warp_size = TestFixture::params::warp_size; + static constexpr size_t logical_warp_size = TestFixture::params::warp_size; // The different warp sizes - constexpr size_t ws32 = size_t(ROCPRIM_WARP_SIZE_32); - constexpr size_t ws64 = size_t(ROCPRIM_WARP_SIZE_64); + static constexpr size_t ws32 = size_t(ROCPRIM_WARP_SIZE_32); + static constexpr size_t ws64 = size_t(ROCPRIM_WARP_SIZE_64); // Block size of warp size 32 - constexpr size_t block_size_ws32 = + static constexpr size_t block_size_ws32 = rocprim::detail::is_power_of_two(logical_warp_size) ? rocprim::max(ws32, logical_warp_size * 4) - : rocprim::max((ws32/logical_warp_size) * logical_warp_size, 1); + : rocprim::max((ws32/logical_warp_size), 1) * logical_warp_size; // Block size of warp size 64 - constexpr size_t block_size_ws64 = + static constexpr size_t block_size_ws64 = rocprim::detail::is_power_of_two(logical_warp_size) ? rocprim::max(ws64, logical_warp_size * 4) - : rocprim::max((ws64/logical_warp_size) * logical_warp_size, 1); + : rocprim::max((ws64/logical_warp_size), 1) * logical_warp_size; const unsigned int current_device_warp_size = rocprim::host_warp_size(); const size_t block_size = current_device_warp_size == ws32 ? 
block_size_ws32 : block_size_ws64; - constexpr unsigned int grid_size = 4; + static constexpr unsigned int grid_size = 4; const size_t size = block_size * grid_size; // Check if warp size is supported @@ -975,7 +975,7 @@ TYPED_TEST(RocprimWarpReduceTests, TailSegmentedReduceSum) SCOPED_TRACE(testing::Message() << "with seed= " << seed_value); // Generate data - std::vector input = test_utils::get_random_data(size, 1, 10, seed_value); // used for input + std::vector input = test_utils::get_random_data(size, 1, 10, seed_value); std::vector flags = test_utils::get_random_data01(size, 0.25f, seed_value); for(size_t i = logical_warp_size - 1; i < flags.size(); i+= logical_warp_size) { diff --git a/test/rocprim/test_warp_scan.cpp b/test/rocprim/test_warp_scan.cpp index 5406e07f6..371d72f62 100644 --- a/test/rocprim/test_warp_scan.cpp +++ b/test/rocprim/test_warp_scan.cpp @@ -51,7 +51,7 @@ void warp_inclusive_scan_kernel(T* device_input, T* device_output) { constexpr unsigned int warps_no = BlockSize / LogicalWarpSize; const unsigned int warp_id = rocprim::detail::logical_warp_id(); - unsigned int index = hipThreadIdx_x + (hipBlockIdx_x * hipBlockDim_x); + unsigned int index = threadIdx.x + (blockIdx.x * blockDim.x); T value = device_input[index]; @@ -71,23 +71,23 @@ TYPED_TEST(RocprimWarpScanTests, InclusiveScan) using T = typename TestFixture::params::type; using binary_op_type = typename std::conditional::value, test_utils::half_plus, rocprim::plus>::type; // logical warp side for warp primitive, execution warp size is always rocprim::warp_size() - constexpr size_t logical_warp_size = TestFixture::params::warp_size; + static constexpr size_t logical_warp_size = TestFixture::params::warp_size; // The different warp sizes - constexpr size_t ws32 = size_t(ROCPRIM_WARP_SIZE_32); - constexpr size_t ws64 = size_t(ROCPRIM_WARP_SIZE_64); + static constexpr size_t ws32 = size_t(ROCPRIM_WARP_SIZE_32); + static constexpr size_t ws64 = size_t(ROCPRIM_WARP_SIZE_64); // Block size of warp size 32 - constexpr size_t block_size_ws32 = + static constexpr size_t block_size_ws32 = rocprim::detail::is_power_of_two(logical_warp_size) ? rocprim::max(ws32, logical_warp_size * 4) - : rocprim::max((ws32/logical_warp_size) * logical_warp_size, 1); + : rocprim::max((ws32/logical_warp_size), 1) * logical_warp_size; // Block size of warp size 64 - constexpr size_t block_size_ws64 = + static constexpr size_t block_size_ws64 = rocprim::detail::is_power_of_two(logical_warp_size) ? 
rocprim::max(ws64, logical_warp_size * 4) - : rocprim::max((ws64/logical_warp_size) * logical_warp_size, 1); + : rocprim::max((ws64/logical_warp_size), 1) * logical_warp_size; const unsigned int current_device_warp_size = rocprim::host_warp_size(); @@ -112,7 +112,7 @@ TYPED_TEST(RocprimWarpScanTests, InclusiveScan) // Generate data std::vector input = test_utils::get_random_data(size, 2, 50, seed_value); std::vector output(size); - std::vector expected(output.size(), 0); + std::vector expected(output.size(), (T)0); // Calculate expected results on host binary_op_type binary_op; @@ -192,7 +192,7 @@ void warp_inclusive_scan_reduce_kernel( { constexpr unsigned int warps_no = BlockSize / LogicalWarpSize; const unsigned int warp_id = rocprim::detail::logical_warp_id(); - unsigned int index = hipThreadIdx_x + ( hipBlockIdx_x * BlockSize ); + unsigned int index = threadIdx.x + ( blockIdx.x * BlockSize ); T value = device_input[index]; T reduction; @@ -202,7 +202,7 @@ void warp_inclusive_scan_reduce_kernel( wscan_t().inclusive_scan(value, value, reduction, storage[warp_id]); device_output[index] = value; - if((hipThreadIdx_x % LogicalWarpSize) == 0) + if((threadIdx.x % LogicalWarpSize) == 0) { device_output_reductions[index / LogicalWarpSize] = reduction; } @@ -217,23 +217,23 @@ TYPED_TEST(RocprimWarpScanTests, InclusiveScanReduce) using T = typename TestFixture::params::type; using binary_op_type = typename std::conditional::value, test_utils::half_plus, rocprim::plus>::type; // logical warp side for warp primitive, execution warp size is always rocprim::warp_size() - constexpr size_t logical_warp_size = TestFixture::params::warp_size; + static constexpr size_t logical_warp_size = TestFixture::params::warp_size; // The different warp sizes - constexpr size_t ws32 = size_t(ROCPRIM_WARP_SIZE_32); - constexpr size_t ws64 = size_t(ROCPRIM_WARP_SIZE_64); + static constexpr size_t ws32 = size_t(ROCPRIM_WARP_SIZE_32); + static constexpr size_t ws64 = size_t(ROCPRIM_WARP_SIZE_64); // Block size of warp size 32 - constexpr size_t block_size_ws32 = + static constexpr size_t block_size_ws32 = rocprim::detail::is_power_of_two(logical_warp_size) ? rocprim::max(ws32, logical_warp_size * 4) - : rocprim::max((ws32/logical_warp_size) * logical_warp_size, 1); + : rocprim::max((ws32/logical_warp_size), 1) * logical_warp_size; // Block size of warp size 64 - constexpr size_t block_size_ws64 = + static constexpr size_t block_size_ws64 = rocprim::detail::is_power_of_two(logical_warp_size) ? 
rocprim::max(ws64, logical_warp_size * 4) - : rocprim::max((ws64/logical_warp_size) * logical_warp_size, 1); + : rocprim::max((ws64/logical_warp_size), 1) * logical_warp_size; const unsigned int current_device_warp_size = rocprim::host_warp_size(); @@ -259,8 +259,8 @@ TYPED_TEST(RocprimWarpScanTests, InclusiveScanReduce) std::vector input = test_utils::get_random_data(size, 2, 50, seed_value); std::vector output(size); std::vector output_reductions(size / logical_warp_size); - std::vector expected(output.size(), 0); - std::vector expected_reductions(output_reductions.size(), 0); + std::vector expected(output.size(), (T)0); + std::vector expected_reductions(output_reductions.size(), (T)0); // Calculate expected results on host binary_op_type binary_op; @@ -355,7 +355,7 @@ void warp_exclusive_scan_kernel(T* device_input, T* device_output, T init) { constexpr unsigned int warps_no = BlockSize / LogicalWarpSize; const unsigned int warp_id = rocprim::detail::logical_warp_id(); - unsigned int index = hipThreadIdx_x + (hipBlockIdx_x * hipBlockDim_x); + unsigned int index = threadIdx.x + (blockIdx.x * blockDim.x); T value = device_input[index]; @@ -375,23 +375,23 @@ TYPED_TEST(RocprimWarpScanTests, ExclusiveScan) using T = typename TestFixture::params::type; using binary_op_type = typename std::conditional::value, test_utils::half_plus, rocprim::plus>::type; // logical warp side for warp primitive, execution warp size is always rocprim::warp_size() - constexpr size_t logical_warp_size = TestFixture::params::warp_size; + static constexpr size_t logical_warp_size = TestFixture::params::warp_size; // The different warp sizes - constexpr size_t ws32 = size_t(ROCPRIM_WARP_SIZE_32); - constexpr size_t ws64 = size_t(ROCPRIM_WARP_SIZE_64); + static constexpr size_t ws32 = size_t(ROCPRIM_WARP_SIZE_32); + static constexpr size_t ws64 = size_t(ROCPRIM_WARP_SIZE_64); // Block size of warp size 32 - constexpr size_t block_size_ws32 = + static constexpr size_t block_size_ws32 = rocprim::detail::is_power_of_two(logical_warp_size) ? rocprim::max(ws32, logical_warp_size * 4) - : rocprim::max((ws32/logical_warp_size) * logical_warp_size, 1); + : rocprim::max((ws32/logical_warp_size), 1) * logical_warp_size; // Block size of warp size 64 - constexpr size_t block_size_ws64 = + static constexpr size_t block_size_ws64 = rocprim::detail::is_power_of_two(logical_warp_size) ? 
rocprim::max(ws64, logical_warp_size * 4) - : rocprim::max((ws64/logical_warp_size) * logical_warp_size, 1); + : rocprim::max((ws64/logical_warp_size), 1) * logical_warp_size; const unsigned int current_device_warp_size = rocprim::host_warp_size(); @@ -416,8 +416,8 @@ TYPED_TEST(RocprimWarpScanTests, ExclusiveScan) // Generate data std::vector input = test_utils::get_random_data(size, 2, 50, seed_value); std::vector output(size); - std::vector expected(input.size(), 0); - const T init = test_utils::get_random_value(0, 100, seed_value); + std::vector expected(input.size(), (T)0); + const T init = test_utils::get_random_value(0, 100, seed_value); // Calculate expected results on host binary_op_type binary_op; @@ -499,7 +499,7 @@ void warp_exclusive_scan_reduce_kernel( { constexpr unsigned int warps_no = BlockSize / LogicalWarpSize; const unsigned int warp_id = rocprim::detail::logical_warp_id(); - unsigned int index = hipThreadIdx_x + (hipBlockIdx_x * hipBlockDim_x); + unsigned int index = threadIdx.x + (blockIdx.x * blockDim.x); T value = device_input[index]; T reduction; @@ -509,7 +509,7 @@ void warp_exclusive_scan_reduce_kernel( wscan_t().exclusive_scan(value, value, init, reduction, storage[warp_id]); device_output[index] = value; - if((hipThreadIdx_x % LogicalWarpSize) == 0) + if((threadIdx.x % LogicalWarpSize) == 0) { device_output_reductions[index / LogicalWarpSize] = reduction; } @@ -524,23 +524,23 @@ TYPED_TEST(RocprimWarpScanTests, ExclusiveReduceScan) using T = typename TestFixture::params::type; using binary_op_type = typename std::conditional::value, test_utils::half_plus, rocprim::plus>::type; // logical warp side for warp primitive, execution warp size is always rocprim::warp_size() - constexpr size_t logical_warp_size = TestFixture::params::warp_size; + static constexpr size_t logical_warp_size = TestFixture::params::warp_size; // The different warp sizes - constexpr size_t ws32 = size_t(ROCPRIM_WARP_SIZE_32); - constexpr size_t ws64 = size_t(ROCPRIM_WARP_SIZE_64); + static constexpr size_t ws32 = size_t(ROCPRIM_WARP_SIZE_32); + static constexpr size_t ws64 = size_t(ROCPRIM_WARP_SIZE_64); // Block size of warp size 32 - constexpr size_t block_size_ws32 = + static constexpr size_t block_size_ws32 = rocprim::detail::is_power_of_two(logical_warp_size) ? rocprim::max(ws32, logical_warp_size * 4) - : rocprim::max((ws32/logical_warp_size) * logical_warp_size, 1); + : rocprim::max((ws32/logical_warp_size), 1) * logical_warp_size; // Block size of warp size 64 - constexpr size_t block_size_ws64 = + static constexpr size_t block_size_ws64 = rocprim::detail::is_power_of_two(logical_warp_size) ? 
rocprim::max(ws64, logical_warp_size * 4) - : rocprim::max((ws64/logical_warp_size) * logical_warp_size, 1); + : rocprim::max((ws64/logical_warp_size), 1) * logical_warp_size; const unsigned int current_device_warp_size = rocprim::host_warp_size(); @@ -566,9 +566,9 @@ TYPED_TEST(RocprimWarpScanTests, ExclusiveReduceScan) std::vector input = test_utils::get_random_data(size, 2, 50, seed_value); std::vector output(size); std::vector output_reductions(size / logical_warp_size); - std::vector expected(input.size(), 0); - std::vector expected_reductions(output_reductions.size(), 0); - const T init = test_utils::get_random_value(0, 100, seed_value); + std::vector expected(input.size(), (T)0); + std::vector expected_reductions(output_reductions.size(), (T)0); + const T init = test_utils::get_random_value(0, 100, seed_value); // Calculate expected results on host binary_op_type binary_op; @@ -673,7 +673,7 @@ void warp_scan_kernel( { constexpr unsigned int warps_no = BlockSize / LogicalWarpSize; const unsigned int warp_id = rocprim::detail::logical_warp_id(); - unsigned int index = hipThreadIdx_x + (hipBlockIdx_x * hipBlockDim_x); + unsigned int index = threadIdx.x + (blockIdx.x * blockDim.x); T input = device_input[index]; T inclusive_output, exclusive_output; @@ -695,23 +695,23 @@ TYPED_TEST(RocprimWarpScanTests, Scan) using T = typename TestFixture::params::type; using binary_op_type = typename std::conditional::value, test_utils::half_plus, rocprim::plus>::type; // logical warp side for warp primitive, execution warp size is always rocprim::warp_size() - constexpr size_t logical_warp_size = TestFixture::params::warp_size; + static constexpr size_t logical_warp_size = TestFixture::params::warp_size; // The different warp sizes - constexpr size_t ws32 = size_t(ROCPRIM_WARP_SIZE_32); - constexpr size_t ws64 = size_t(ROCPRIM_WARP_SIZE_64); + static constexpr size_t ws32 = size_t(ROCPRIM_WARP_SIZE_32); + static constexpr size_t ws64 = size_t(ROCPRIM_WARP_SIZE_64); // Block size of warp size 32 - constexpr size_t block_size_ws32 = + static constexpr size_t block_size_ws32 = rocprim::detail::is_power_of_two(logical_warp_size) ? rocprim::max(ws32, logical_warp_size * 4) - : rocprim::max((ws32/logical_warp_size) * logical_warp_size, 1); + : rocprim::max((ws32/logical_warp_size), 1) * logical_warp_size; // Block size of warp size 64 - constexpr size_t block_size_ws64 = + static constexpr size_t block_size_ws64 = rocprim::detail::is_power_of_two(logical_warp_size) ? 
rocprim::max(ws64, logical_warp_size * 4) - : rocprim::max((ws64/logical_warp_size) * logical_warp_size, 1); + : rocprim::max((ws64/logical_warp_size), 1) * logical_warp_size; const unsigned int current_device_warp_size = rocprim::host_warp_size(); @@ -737,9 +737,9 @@ TYPED_TEST(RocprimWarpScanTests, Scan) std::vector input = test_utils::get_random_data(size, 2, 50, seed_value); std::vector output_inclusive(size); std::vector output_exclusive(size); - std::vector expected_inclusive(output_inclusive.size(), 0); - std::vector expected_exclusive(output_exclusive.size(), 0); - const T init = test_utils::get_random_value(0, 100, seed_value); + std::vector expected_inclusive(output_inclusive.size(), (T)0); + std::vector expected_exclusive(output_exclusive.size(), (T)0); + const T init = test_utils::get_random_value(0, 100, seed_value); // Calculate expected results on host binary_op_type binary_op; @@ -848,7 +848,7 @@ void warp_scan_reduce_kernel( { constexpr unsigned int warps_no = BlockSize / LogicalWarpSize; const unsigned int warp_id = rocprim::detail::logical_warp_id(); - unsigned int index = hipThreadIdx_x + (hipBlockIdx_x * hipBlockDim_x); + unsigned int index = threadIdx.x + (blockIdx.x * blockDim.x); T input = device_input[index]; T inclusive_output, exclusive_output, reduction; @@ -859,7 +859,7 @@ void warp_scan_reduce_kernel( device_inclusive_output[index] = inclusive_output; device_exclusive_output[index] = exclusive_output; - if((hipThreadIdx_x % LogicalWarpSize) == 0) + if((threadIdx.x % LogicalWarpSize) == 0) { device_output_reductions[index / LogicalWarpSize] = reduction; } @@ -874,23 +874,23 @@ TYPED_TEST(RocprimWarpScanTests, ScanReduce) using T = typename TestFixture::params::type; using binary_op_type = typename std::conditional::value, test_utils::half_plus, rocprim::plus>::type; // logical warp side for warp primitive, execution warp size is always rocprim::warp_size() - constexpr size_t logical_warp_size = TestFixture::params::warp_size; + static constexpr size_t logical_warp_size = TestFixture::params::warp_size; // The different warp sizes - constexpr size_t ws32 = size_t(ROCPRIM_WARP_SIZE_32); - constexpr size_t ws64 = size_t(ROCPRIM_WARP_SIZE_64); + static constexpr size_t ws32 = size_t(ROCPRIM_WARP_SIZE_32); + static constexpr size_t ws64 = size_t(ROCPRIM_WARP_SIZE_64); // Block size of warp size 32 - constexpr size_t block_size_ws32 = + static constexpr size_t block_size_ws32 = rocprim::detail::is_power_of_two(logical_warp_size) ? rocprim::max(ws32, logical_warp_size * 4) - : rocprim::max((ws32/logical_warp_size) * logical_warp_size, 1); + : rocprim::max((ws32/logical_warp_size), 1) * logical_warp_size; // Block size of warp size 64 - constexpr size_t block_size_ws64 = + static constexpr size_t block_size_ws64 = rocprim::detail::is_power_of_two(logical_warp_size) ? 
rocprim::max(ws64, logical_warp_size * 4) - : rocprim::max((ws64/logical_warp_size) * logical_warp_size, 1); + : rocprim::max((ws64/logical_warp_size), 1) * logical_warp_size; const unsigned int current_device_warp_size = rocprim::host_warp_size(); @@ -917,10 +917,10 @@ TYPED_TEST(RocprimWarpScanTests, ScanReduce) std::vector output_inclusive(size); std::vector output_exclusive(size); std::vector output_reductions(size / logical_warp_size); - std::vector expected_inclusive(output_inclusive.size(), 0); - std::vector expected_exclusive(output_exclusive.size(), 0); - std::vector expected_reductions(output_reductions.size(), 0); - const T init = test_utils::get_random_value(0, 100, seed_value); + std::vector expected_inclusive(output_inclusive.size(), (T)0); + std::vector expected_exclusive(output_exclusive.size(), (T)0); + std::vector expected_reductions(output_reductions.size(), (T)0); + const T init = test_utils::get_random_value(0, 100, seed_value); // Calculate expected results on host binary_op_type binary_op; @@ -1042,23 +1042,23 @@ TYPED_TEST(RocprimWarpScanTests, InclusiveScanCustomType) using base_type = typename TestFixture::params::type; using T = test_utils::custom_test_type; // logical warp side for warp primitive, execution warp size is always rocprim::warp_size() - constexpr size_t logical_warp_size = TestFixture::params::warp_size; + static constexpr size_t logical_warp_size = TestFixture::params::warp_size; // The different warp sizes - constexpr size_t ws32 = size_t(ROCPRIM_WARP_SIZE_32); - constexpr size_t ws64 = size_t(ROCPRIM_WARP_SIZE_64); + static constexpr size_t ws32 = size_t(ROCPRIM_WARP_SIZE_32); + static constexpr size_t ws64 = size_t(ROCPRIM_WARP_SIZE_64); // Block size of warp size 32 - constexpr size_t block_size_ws32 = + static constexpr size_t block_size_ws32 = rocprim::detail::is_power_of_two(logical_warp_size) ? rocprim::max(ws32, logical_warp_size * 4) - : rocprim::max((ws32/logical_warp_size) * logical_warp_size, 1); + : rocprim::max((ws32/logical_warp_size), 1) * logical_warp_size; // Block size of warp size 64 - constexpr size_t block_size_ws64 = + static constexpr size_t block_size_ws64 = rocprim::detail::is_power_of_two(logical_warp_size) ? 
rocprim::max(ws64, logical_warp_size * 4) - : rocprim::max((ws64/logical_warp_size) * logical_warp_size, 1); + : rocprim::max((ws64/logical_warp_size), 1) * logical_warp_size; const unsigned int current_device_warp_size = rocprim::host_warp_size(); @@ -1083,7 +1083,7 @@ TYPED_TEST(RocprimWarpScanTests, InclusiveScanCustomType) // Generate data std::vector input(size); std::vector output(size); - std::vector expected(output.size(), T(0)); + std::vector expected(output.size(), (base_type)0); // Initializing input data { auto random_values = diff --git a/test/rocprim/test_warp_sort.cpp b/test/rocprim/test_warp_sort.cpp index f144747a1..4cfb859d0 100644 --- a/test/rocprim/test_warp_sort.cpp +++ b/test/rocprim/test_warp_sort.cpp @@ -60,7 +60,7 @@ __global__ __launch_bounds__(ROCPRIM_DEFAULT_MAX_BLOCK_SIZE) void test_hip_warp_sort(T* d_output) { - unsigned int i = hipThreadIdx_x + (hipBlockIdx_x * hipBlockDim_x); + unsigned int i = threadIdx.x + (blockIdx.x * blockDim.x); T value = d_output[i]; rocprim::warp_sort wsort; wsort.sort(value); @@ -76,16 +76,16 @@ TYPED_TEST(RocprimWarpSortShuffleBasedTests, Sort) // logical warp side for warp primitive, execution warp size is always rocprim::warp_size() using T = typename TestFixture::params::type; using binary_op_type = typename std::conditional::value, test_utils::half_less, rocprim::less>::type; - constexpr size_t logical_warp_size = TestFixture::params::warp_size; + static constexpr size_t logical_warp_size = TestFixture::params::warp_size; // The different warp sizes - constexpr size_t ws32 = size_t(ROCPRIM_WARP_SIZE_32); - constexpr size_t ws64 = size_t(ROCPRIM_WARP_SIZE_64); + static constexpr size_t ws32 = size_t(ROCPRIM_WARP_SIZE_32); + static constexpr size_t ws64 = size_t(ROCPRIM_WARP_SIZE_64); const unsigned int current_device_warp_size = rocprim::host_warp_size(); const size_t block_size = std::max(current_device_warp_size, logical_warp_size * 4); - constexpr unsigned int grid_size = 4; + static constexpr unsigned int grid_size = 4; const size_t size = block_size * grid_size; // Check if warp size is supported @@ -163,7 +163,7 @@ __global__ __launch_bounds__(ROCPRIM_DEFAULT_MAX_BLOCK_SIZE) void test_hip_sort_key_value_kernel(KeyType* d_output_key, ValueType* d_output_value) { - unsigned int i = hipThreadIdx_x + (hipBlockIdx_x * hipBlockDim_x); + unsigned int i = threadIdx.x + (blockIdx.x * blockDim.x); KeyType key = d_output_key[i]; ValueType value = d_output_value[i]; rocprim::warp_sort wsort; @@ -183,16 +183,16 @@ TYPED_TEST(RocprimWarpSortShuffleBasedTests, SortKeyInt) using pair = test_utils::custom_test_type; using value_op_type = typename std::conditional::value, test_utils::half_less, rocprim::less>::type; using eq_op_type = typename std::conditional::value, test_utils::half_equal_to, rocprim::equal_to>::type; - constexpr size_t logical_warp_size = TestFixture::params::warp_size; + static constexpr size_t logical_warp_size = TestFixture::params::warp_size; // The different warp sizes - constexpr size_t ws32 = size_t(ROCPRIM_WARP_SIZE_32); - constexpr size_t ws64 = size_t(ROCPRIM_WARP_SIZE_64); + static constexpr size_t ws32 = size_t(ROCPRIM_WARP_SIZE_32); + static constexpr size_t ws64 = size_t(ROCPRIM_WARP_SIZE_64); const unsigned int current_device_warp_size = rocprim::host_warp_size(); const size_t block_size = std::max(current_device_warp_size, logical_warp_size * 4); - constexpr unsigned int grid_size = 4; + static constexpr unsigned int grid_size = 4; const size_t size = block_size * grid_size; // Check if warp size is 
supported
diff --git a/test/test_hip_api.cpp b/test/test_hip_api.cpp
index c4ac83f8a..6f8264ade 100644
--- a/test/test_hip_api.cpp
+++ b/test/test_hip_api.cpp
@@ -23,7 +23,7 @@
 #include "common_test_header.hpp"
 
 template<class T>
-T ax(const T a, const T x) __device__
+__device__ T ax(const T a, const T x)
 {
     return x * a;
 }
@@ -32,7 +32,7 @@ template<class T>
 __global__
 void saxpy_kernel(const T * x, T * y, const T a, const size_t size)
 {
-    const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
+    const unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
     if(i < size)
     {
         y[i] += ax(a, x[i]);
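A few notes on the recurring patterns in the test hunks above.

The thread operation test hunks replace the triple angle bracket kernel launches (whose launch configuration and template arguments are not visible in this copy of the patch) with hipLaunchKernelGGL, which also compiles with a plain host compiler when the HIP-CPU runtime is used. A minimal sketch of the pattern, assuming the kernel instance is thread_load_kernel<T>:

    // Sketch: portable launch of a templated HIP kernel.
    // HIP_KERNEL_NAME keeps commas inside a template argument list from being
    // treated as macro argument separators.
    hipLaunchKernelGGL(
        HIP_KERNEL_NAME(thread_load_kernel<T>),
        dim3(grid_size), dim3(block_size),   // grid and block dimensions
        0, 0,                                // dynamic shared memory bytes, stream
        device_input, device_output);        // kernel arguments
    HIP_CHECK(hipGetLastError());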
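The hipMalloc calls in the same hunks gain an explicit pointer cast whose template argument is likewise not shown here; the conventional form is reinterpret_cast<void**>(&ptr), which matches the hipError_t hipMalloc(void**, size_t) signature and does not depend on a templated convenience overload being available in every runtime. Sketch:

    // Sketch: allocation against the void** signature of hipMalloc.
    T* device_input = nullptr;
    HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&device_input),
                        input.size() * sizeof(T)));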
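In the test_utils.hpp hunk, the new is_valid_for_int_distribution trait lost its template arguments in this copy. Its stated purpose (see the cppreference link in the patch) is to restrict std::uniform_int_distribution to the integer types for which it is defined and to fall back to int or unsigned int otherwise. A reconstruction under that assumption; the exact type list in the patch may differ slightly:

    // Sketch: whitelist of types std::uniform_int_distribution supports.
    template<class T>
    struct is_valid_for_int_distribution :
        std::integral_constant<bool,
            std::is_same<short, T>::value ||
            std::is_same<unsigned short, T>::value ||
            std::is_same<int, T>::value ||
            std::is_same<unsigned int, T>::value ||
            std::is_same<long, T>::value ||
            std::is_same<unsigned long, T>::value ||
            std::is_same<long long, T>::value ||
            std::is_same<unsigned long long, T>::value>
    {};

    // Inside get_random_data, narrow types (e.g. char, int8_t) are then generated
    // through int / unsigned int and cast back to T afterwards:
    using dis_type = typename std::conditional<
        is_valid_for_int_distribution<T>::value,
        T,
        typename std::conditional<std::is_signed<T>::value, int, unsigned int>::type
    >::type;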
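The random data helpers now construct the engine directly from the seed instead of seeding a std::random_device backed std::default_random_engine after construction, so a given seed_value always reproduces the same input data. The engine_type and seed_type names below stand in for whatever aliases test_utils.hpp actually defines; a sketch of the pattern:

    // Sketch: reproducible data generation keyed only by seed_value.
    // Needs <random>, <vector>, <algorithm>.
    using engine_type = std::default_random_engine;   // assumed alias
    using seed_type   = unsigned int;                 // assumed alias

    std::vector<float> samples(size_t n, seed_type seed_value)
    {
        engine_type gen{seed_value};   // no std::random_device involved
        std::uniform_real_distribution<float> distribution(0.0f, 1.0f);
        std::vector<float> data(n);
        std::generate(data.begin(), data.end(), [&]() { return distribution(gen); });
        return data;
    }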
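Several warp reduce and scan tests select a host friendly functor when the element type is half; the std::conditional expression lost its arguments here. The likely intent, written out (assumed, not verbatim from the patch):

    // Sketch: use test_utils::half_plus on the host when T is rocprim::half,
    // since half lacks a complete set of host-side arithmetic operators.
    using binary_op_type = typename std::conditional<
        std::is_same<T, rocprim::half>::value,
        test_utils::half_plus,
        rocprim::plus<T>>::type;

The same shape appears with half_less / rocprim::less<T> and half_equal_to / rocprim::equal_to<T> in the sort tests.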
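The host reference computations now spell out their conversions: vectors are filled with (T)0 or (T)1 instead of bare integer literals, and the custom struct accumulator in ReduceSumCustomStruct is built as T value{(base_type)0, (base_type)0}. With -Wall -Wextra -Werror enabled in the CI jobs above, implicit int to half (or int to base_type) conversions are exactly the kind of thing that can now break the build, so the casts keep the reference loops warning clean for every tested element type. Illustrative only:

    // Sketch: explicit conversions in the host-side reference loop.
    std::vector<T> expected(output.size(), (T)0);
    binary_op_type binary_op;
    for(size_t i = 0; i < expected.size(); i++)
    {
        T value = (T)0;                               // was: T value = 0;
        for(size_t j = 0; j < logical_warp_size; j++)
        {
            value = binary_op(value, input[i * logical_warp_size + j]);
        }
        expected[i] = value;
    }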
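Every warp test recomputes its block size from the logical warp size, and the non power of two branch changes from max((ws/lws) * lws, 1) to max(ws/lws, 1) * lws. The two agree whenever the logical warp size fits into the hardware warp size; they differ when it does not, where the old expression collapses to a single thread while the new one still sizes the block for one whole logical warp (presumably so that BlockSize / LogicalWarpSize stays at least 1 inside the kernels). A small check of the arithmetic:

    // Sketch: the two roundings for a logical warp larger than the HW warp.
    constexpr unsigned ws  = 32;  // hardware warp size
    constexpr unsigned lws = 37;  // non-power-of-two logical warp size
    constexpr unsigned old_form = ((ws / lws) * lws > 1) ? (ws / lws) * lws : 1; // == 1
    constexpr unsigned new_form = ((ws / lws) > 1 ? (ws / lws) : 1) * lws;       // == 37
    static_assert(old_form == 1 && new_form == 37, "rounding order matters");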
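The key value sort kernel in test_warp_sort.cpp also lost its template arguments. rocprim::warp_sort is parameterised on the key type, the logical warp size and, optionally, the value type, and its sort() overload taking a key and a value sorts the keys across the logical warp while carrying the values along. A hedged reconstruction of the kernel; the template parameter names and order are assumptions matching the kernel's KeyType / ValueType / LogicalWarpSize parameters, not text copied from the patch:

    // Sketch: per-lane key/value warp sort.
    template<class KeyType, class ValueType, unsigned int LogicalWarpSize>
    __global__ __launch_bounds__(ROCPRIM_DEFAULT_MAX_BLOCK_SIZE)
    void test_hip_sort_key_value_kernel(KeyType* d_output_key, ValueType* d_output_value)
    {
        unsigned int i = threadIdx.x + (blockIdx.x * blockDim.x);
        KeyType key = d_output_key[i];
        ValueType value = d_output_value[i];

        rocprim::warp_sort<KeyType, LogicalWarpSize, ValueType> wsort;
        wsort.sort(key, value);   // keys sorted across the logical warp, values follow

        d_output_key[i] = key;
        d_output_value[i] = value;
    }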