Skip to content

Commit

Permalink
Add Hip-Cpu support initial (#233)
Browse files Browse the repository at this point in the history
* Update benchmark config tuning code

* Move options up front

* Remove DISABLE_WERROR option

* Update summary

* Add to README as experimental back-end

* IMPORTED target based dependency definition

* [HIP-CPU] MSVC: error C1128 (/bigobj)

* [HIP-CPU] CUDA-style indexing and kernel launch

* [HIP-CPU] ambiguous call to overloaded function

* cannot convert argument 1 from 'T **' to 'void **'

* [HIP-CPU] unknown pragma 'unroll'

* [HIP-CPU] half has explicit CTOR

* [HIP-CPU] MSVC: 'name' hides previous declaration

* [HIP-CPU] MSVC: error C2975

* [HIP-CPU] MSVC: C2365 (redefinition of symbol)

* [HIP-CPU] rocprim::half_native for plats w/o one

* [HIP-CPU] ISO C++ alignment specification

* TYPED_TEST_CASE is deprecated

* Silence conversion warning

* [HIP-CPU] Note on optimization opportunity

* uniform_int_distribution is undefined for (u)int8

* [HIP-CPU] omit texture support

* [HIP-CPU] MSVC: expr SFINAE bug on auto ret type

* [HIP-CPU] Type check too strict. (Diagnostic only)

* [HIP-CPU] Empty binary op for dummy operations

* [HIP-CPU] Zero out potentially used uninit storage

* More easily replacable PRNG engine

* [HIP-CPU] MSVC: function isn't constexpr enough

* [HIP-CPU] MSVC: error C2440

* [HIP-CPU] MSVC: workaround

* [HIP-CPU] Port of clang/hipcc intrinsics

* [HIP-CPU] MSVC: STL misuse fix (C4996)

* [HIP-CPU] Fix various conversion warnings

* ambiguous call to function

* [HIP-CPU] Guard against missing header

* Add missing utility funtion for MSVC

* Add distinct default type

* Potential use of uninitialized storage

* [HIP-CPU] Strictly ISO conforming overload control

* Erronous use of token paste preprocessor operator

* Define-friendly pragma (no)unroll

* Refrain from C++17 utility std::disjunction

* [HIP-CPU] libstdc++: Depend on TBB unconditionally

* [HIP-CPU] GCC: Preliminary impl of __lastbit

* [HIP-CPU] Hotfix lack of dpp built-ins

* error: attributes are not allowed on a function-definition

* Fix rebase error

* Extend lane_mask_type to non-hipcc compilers

* [HIP-CPU] Host compiler pretend to be hipcc

* [HIP-CPU] GCC: decl of x changes meaning of symbol

* [HIP-CPU] CI testing HIP-CPU build only

* Temporarily reroute to HIP-CPU fork

* error C3861: 'uint': identifier not found

* error C3861: 'min': identifier not found

* disambiguate function call

* ISO C++ alignment specification

* Prevent shared memory from slipping to size == 0

* Bump GTest version dep

* Inherit build and library type

* Issue error when using multi-conf gen with dep mgt

* Downgrade TBB dependence to match libstc++

* [HIP-CPU] GCC: Circumvent load/store_volatile

* [HIP-CPU] Remove static_assert type-check

* [HIP-CPU] MSVC: Guard against inline asm

* [HIP-CPU] Clang/hipcc: use diff half impls

* Reinstantiate -Werror via CI script

* Removed unused local type aliases

* [HIP-CPU] Revert to original comparator with fix

* Remove implicit cast (truncation)

* [HIP-CPU] Limit CPU builds to fit runner RAM

* Disable hip-cpu build step from gitlab CI

* Handle size_t in cmdparser.hpp

* Change hip-cpu url

* Added experimental HIP-CPU support to changelog

* Compilation fix for gfx1030

Co-authored-by: Máté Ferenc Nagy-Egri <[email protected]>
Co-authored-by: Stanley Tsang <[email protected]>
  • Loading branch information
3 people authored Jun 19, 2021
1 parent 125ea52 commit 7f2bba5
Show file tree
Hide file tree
Showing 131 changed files with 2,408 additions and 1,542 deletions.
46 changes: 46 additions & 0 deletions .gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,25 @@ variables:
paths:
- $CMAKE_LATEST_PATH

.deps-cpu:cmake-latest:
stage: build:cmake_latest
before_script:
- $SUDO_CMD apt update -qq
- $SUDO_CMD apt install -y -qq apt-transport-https software-properties-common
- $SUDO_CMD add-apt-repository ppa:ubuntu-toolchain-r/test
# | Used in the script | Build tools | Fetch from https:// | rocminfo calls lsmod
- $SUDO_CMD apt install -y -qq wget tar xz-utils bzip2 libnuma-dev libunwind-dev git build-essential pkg-config ninja-build ca-certificates kmod g++-9
# Fetch CMake only if the cache has not been restored
- if [ ! -d $CMAKE_LATEST_PATH ]; then mkdir -p $CMAKE_LATEST_PATH; wget --no-check-certificate --quiet -O - $CMAKE_LATEST_URL | tar --strip-components=1 -xz -C $CMAKE_LATEST_PATH;
- fi;
- export PATH=$CMAKE_LATEST_PATH/bin:$PATH
# Debug printing of environment for context when errors occur
- hipconfig
cache:
key: $CMAKE_LATEST_VERSION
paths:
- $CMAKE_LATEST_PATH

build:cmake-latest:
extends:
- .deps:cmake-latest
Expand All @@ -92,6 +111,7 @@ build:cmake-latest:
- cmake
-G Ninja
-D CMAKE_CXX_COMPILER=hipcc
-D CMAKE_CXX_FLAGS="-Wall -Wextra -Werror"
-D CMAKE_BUILD_TYPE=Release
-D BUILD_TEST=ON
-D BUILD_EXAMPLE=ON
Expand All @@ -112,6 +132,31 @@ build:cmake-latest:
- $BUILD_LATEST_DIR/CTestTestfile.cmake
expire_in: 2 weeks

# TODO: Enable the hip-cpu CI step
#build-cpu:cmake-latest:
# extends:
# - .deps-cpu:cmake-latest
# tags:
# - s9300
# - rocm
# script:
# - mkdir -p $BUILD_LATEST_DIR
# - cd $BUILD_LATEST_DIR
# - cmake
# -G Ninja
# -D CMAKE_CXX_COMPILER=g++-9
# -D CMAKE_CXX_FLAGS="-Wall -Wextra"
# -D CMAKE_BUILD_TYPE=Release
# -D BUILD_TEST=ON
# -D BUILD_EXAMPLE=OFF
# -D BUILD_BENCHMARK=OFF
# -D USE_HIP_CPU=ON
# -S $CI_PROJECT_DIR
# -B $BUILD_LATEST_DIR
# - cmake
# --build $BUILD_LATEST_DIR
# --parallel 3

build:cmake-minimum:
extends:
- .deps:cmake-minimum
Expand All @@ -126,6 +171,7 @@ build:cmake-minimum:
- cmake
-G Ninja
-D CMAKE_CXX_COMPILER=hipcc
-D CMAKE_CXX_FLAGS="-Wall -Wextra -Werror"
-D CMAKE_BUILD_TYPE=Release
-D BUILD_TEST=ON
-D BUILD_EXAMPLE=ON
Expand Down
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@

Full documentation for rocPRIM is available at [https://codedocs.xyz/ROCmSoftwarePlatform/rocPRIM/](https://codedocs.xyz/ROCmSoftwarePlatform/rocPRIM/)

## [Unreleased rocPRIM-Next]
### Added
- Experimental [HIP-CPU](https://github.com/ROCm-Developer-Tools/HIP-CPU) support; build using GCC/Clang/MSVC on Win/Linux. It is work in progress, many algorithms still known to fail.

## [Unreleased rocPRIM-2.10.11 for ROCm 4.4.0]
### Added
- Code coverage tools build option
Expand Down
68 changes: 36 additions & 32 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,14 @@ list( APPEND CMAKE_PREFIX_PATH /opt/rocm/llvm /opt/rocm )
# rocPRIM project
project(rocprim LANGUAGES CXX)

# Build options
option(BUILD_TEST "Build tests (requires googletest)" OFF)
option(BUILD_BENCHMARK "Build benchmarks" OFF)
option(BUILD_EXAMPLE "Build examples" OFF)
option(USE_HIP_CPU "Prefer HIP-CPU runtime instead of HW acceleration" OFF)
# Disables building tests, benchmarks, examples
option(ONLY_INSTALL "Only install" OFF)

# CMake modules
list(APPEND CMAKE_MODULE_PATH
${CMAKE_CURRENT_SOURCE_DIR}/cmake
Expand All @@ -46,36 +54,35 @@ endif()

set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE CACHE BOOL "Add paths to linker search and installed rpath")

# Get dependencies
include(cmake/Dependencies.cmake)

# Set the AMDGPU_TARGETS with backward compatiblity
# Use target ID syntax if supported for AMDGPU_TARGETS
if(COMMAND rocm_check_target_ids)
rocm_check_target_ids(DEFAULT_AMDGPU_TARGETS
TARGETS "gfx803;gfx900:xnack-;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack-;gfx90a:xnack+;gfx1030"
)
else()
# Detect compiler support for target ID
# This section is deprecated. Please use rocm_check_target_ids for future use.
if( CMAKE_CXX_COMPILER MATCHES ".*/hipcc$" )
execute_process(COMMAND ${CMAKE_CXX_COMPILER} "--help"
OUTPUT_VARIABLE CXX_OUTPUT
OUTPUT_STRIP_TRAILING_WHITESPACE
ERROR_STRIP_TRAILING_WHITESPACE)
string(REGEX MATCH ".mcode\-object\-version" TARGET_ID_SUPPORT ${CXX_OUTPUT})
endif()
if(TARGET_ID_SUPPORT)
set(DEFAULT_AMDGPU_TARGETS "gfx803;gfx900:xnack-;gfx906:xnack-;gfx908:xnack-")
if(NOT USE_HIP_CPU)
# Set the AMDGPU_TARGETS with backward compatiblity
# Use target ID syntax if supported for AMDGPU_TARGETS
if(COMMAND rocm_check_target_ids)
rocm_check_target_ids(DEFAULT_AMDGPU_TARGETS
TARGETS "gfx803;gfx900:xnack-;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack-;gfx90a:xnack+;gfx1030"
)
else()
set(DEFAULT_AMDGPU_TARGETS "gfx803;gfx900;gfx906;gfx908")
# Detect compiler support for target ID
# This section is deprecated. Please use rocm_check_target_ids for future use.
if( CMAKE_CXX_COMPILER MATCHES ".*/hipcc$" )
execute_process(COMMAND ${CMAKE_CXX_COMPILER} "--help"
OUTPUT_VARIABLE CXX_OUTPUT
OUTPUT_STRIP_TRAILING_WHITESPACE
ERROR_STRIP_TRAILING_WHITESPACE)
string(REGEX MATCH ".mcode\-object\-version" TARGET_ID_SUPPORT ${CXX_OUTPUT})
endif()
if(TARGET_ID_SUPPORT)
set(DEFAULT_AMDGPU_TARGETS "gfx803;gfx900:xnack-;gfx906:xnack-;gfx908:xnack-;gfx1030")
else()
set(DEFAULT_AMDGPU_TARGETS "gfx803;gfx900;gfx906;gfx908;gfx1030")
endif()
endif()
endif()
set(AMDGPU_TARGETS "${DEFAULT_AMDGPU_TARGETS}" CACHE STRING "List of specific machine types for library to target")
set(AMDGPU_TEST_TARGETS "" CACHE STRING "List of specific device types to test for") # Leave empty for default system device
set(AMDGPU_TARGETS "${DEFAULT_AMDGPU_TARGETS}" CACHE STRING "List of specific machine types for library to target")
set(AMDGPU_TEST_TARGETS "" CACHE STRING "List of specific device types to test for") # Leave empty for default system device

# Verify that hcc compiler is used on ROCM platform
include(cmake/VerifyCompiler.cmake)
# Verify that hcc compiler is used on ROCM platform
include(cmake/VerifyCompiler.cmake)
endif()

# Build options
# Disable -Werror
Expand All @@ -98,11 +105,8 @@ set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)

if(DISABLE_WERROR)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra")
else()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Werror")
endif()
# Get dependencies
include(cmake/Dependencies.cmake)

# Setup VERSION
set(VERSION_STRING "2.10.9")
Expand Down
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,11 @@ cd rocPRIM; mkdir build; cd build
# before 'cmake' or setting cmake option 'CMAKE_CXX_COMPILER' to path to the compiler.
# Using HIP-clang:
[CXX=hipcc] cmake -DBUILD_BENCHMARK=ON ../.
#
# ! EXPERIMENTAL !
# Alternatively one may build using the experimental (and highly incomplete) HIP-CPU back-end for host-side
# execution using any C++17 conforming compiler (supported by HIP-CPU). AMDGPU_* options are unavailable in this case.
# USE_HIP_CPU - OFF by default

# Build
make -j4
Expand Down
30 changes: 26 additions & 4 deletions benchmark/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -24,18 +24,40 @@ option(BENCHMARK_CONFIG_TUNING "Benchmark device-level functions using various c

function(add_rocprim_benchmark BENCHMARK_SOURCE)
get_filename_component(BENCHMARK_TARGET ${BENCHMARK_SOURCE} NAME_WE)

add_executable(${BENCHMARK_TARGET} ${BENCHMARK_SOURCE})

target_link_libraries(${BENCHMARK_TARGET}
PRIVATE
rocprim_hip
rocprim
benchmark::benchmark
)
foreach(amdgpu_target ${AMDGPU_TARGETS})
if(NOT USE_HIP_CPU)
target_link_libraries(${BENCHMARK_TARGET}
PRIVATE
rocprim_hip
)
else()
target_link_libraries(${BENCHMARK_TARGET}
PRIVATE
--amdgpu-target=${amdgpu_target}
Threads::Threads
hip_cpu_rt::hip_cpu_rt
)
endforeach()
if(STL_DEPENDS_ON_TBB)
target_link_libraries(${BENCHMARK_TARGET}
PRIVATE
TBB::tbb
)
endif()
endif()

target_compile_options(${BENCHMARK_TARGET}
PRIVATE
$<$<CXX_COMPILER_ID:MSVC>:
/bigobj # number of sections exceeded object file format limit: compile with /bigobj
>
)

set_target_properties(${BENCHMARK_TARGET}
PROPERTIES
RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/benchmark"
Expand Down
22 changes: 11 additions & 11 deletions benchmark/benchmark_block_discontinuity.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -82,13 +82,13 @@ struct flag_heads
__device__
static void run(const T * d_input, T * d_output)
{
const unsigned int lid = hipThreadIdx_x;
const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize;
const unsigned int lid = threadIdx.x;
const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize;

T input[ItemsPerThread];
rp::block_load_direct_striped<BlockSize>(lid, d_input + block_offset, input);

#pragma nounroll
ROCPRIM_NO_UNROLL
for(unsigned int trial = 0; trial < Trials; trial++)
{
rp::block_discontinuity<T, BlockSize> bdiscontinuity;
Expand Down Expand Up @@ -125,13 +125,13 @@ struct flag_tails
__device__
static void run(const T * d_input, T * d_output)
{
const unsigned int lid = hipThreadIdx_x;
const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize;
const unsigned int lid = threadIdx.x;
const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize;

T input[ItemsPerThread];
rp::block_load_direct_striped<BlockSize>(lid, d_input + block_offset, input);

#pragma nounroll
ROCPRIM_NO_UNROLL
for(unsigned int trial = 0; trial < Trials; trial++)
{
rp::block_discontinuity<T, BlockSize> bdiscontinuity;
Expand Down Expand Up @@ -168,13 +168,13 @@ struct flag_heads_and_tails
__device__
static void run(const T * d_input, T * d_output)
{
const unsigned int lid = hipThreadIdx_x;
const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize;
const unsigned int lid = threadIdx.x;
const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize;

T input[ItemsPerThread];
rp::block_load_direct_striped<BlockSize>(lid, d_input + block_offset, input);

#pragma nounroll
ROCPRIM_NO_UNROLL
for(unsigned int trial = 0; trial < Trials; trial++)
{
rp::block_discontinuity<T, BlockSize> bdiscontinuity;
Expand Down Expand Up @@ -217,8 +217,8 @@ void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N)
std::vector<T> input = get_random_data<T>(size, T(0), T(10));
T * d_input;
T * d_output;
HIP_CHECK(hipMalloc(&d_input, size * sizeof(T)));
HIP_CHECK(hipMalloc(&d_output, size * sizeof(T)));
HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&d_input), size * sizeof(T)));
HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&d_output), size * sizeof(T)));
HIP_CHECK(
hipMemcpy(
d_input, input.data(),
Expand Down
Loading

0 comments on commit 7f2bba5

Please sign in to comment.