Skip to content

Commit

Permalink
Merge pull request #231 from ROCmSoftwarePlatform/staging
Browse files Browse the repository at this point in the history
Update master with staging
  • Loading branch information
stanleytsang-amd authored Jun 7, 2021
2 parents 83a1e14 + c0cd1e3 commit d853622
Show file tree
Hide file tree
Showing 116 changed files with 6,203 additions and 513 deletions.
107 changes: 104 additions & 3 deletions .gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ stages:
- build:cmake_latest # Tests if builds succeed (CMake)
- build:cmake_minimum # Tests if builds succeed (CMake)
- test # Tests if unit tests are passing (CTest)
- benchmark

variables:
# Tested CMake versions
Expand All @@ -40,7 +41,7 @@ variables:
BUILD_DIR: $CI_PROJECT_DIR/build
CMAKE_MINIMUM_URL: "https://cmake.org/files/v3.10/cmake-3.10.2-Linux-x86_64.tar.gz"
CMAKE_MINIMUM_PATH: "$CI_PROJECT_DIR/deps/cmake-3.10.2"
CMAKE_LATEST_URL: "https://cmake.org/files/v3.16/cmake-3.16.0-Linux-x86_64.tar.gz"
CMAKE_LATEST_URL: "https://cmake.org/files/v3.17/cmake-3.17.0-Linux-x86_64.tar.gz"
CMAKE_LATEST_PATH: "$CI_PROJECT_DIR/deps/cmake-3.17.0"
BUILD_MINIMUM_DIR: $CI_PROJECT_DIR/build/cmake-3.10.2
BUILD_LATEST_DIR: $CI_PROJECT_DIR/build/cmake-3.17.0
Expand Down Expand Up @@ -128,7 +129,7 @@ build:cmake-minimum:
-D CMAKE_BUILD_TYPE=Release
-D BUILD_TEST=ON
-D BUILD_EXAMPLE=ON
-D BUILD_BENCHMARK=OFF
-D BUILD_BENCHMARK=ON
-D DISABLE_WERROR=OFF
-D AMDGPU_TEST_TARGETS="gfx803;gfx900;gfx906"
-B $BUILD_MINIMUM_DIR
Expand Down Expand Up @@ -181,8 +182,108 @@ build:package:
- $PACKAGE_DIR/rocprim*.zip
expire_in: 2 weeks

# Job: configure and build the benchmark binaries and publish them as
# artifacts for later jobs.
# NOTE(review): no `stage:` key is visible in this job; presumably it comes
# from the extended template or the GitLab default - confirm.
build:benchmark:
# `.deps:cmake-latest` is a template defined outside this excerpt.
extends:
- .deps:cmake-latest
# Runner selection tags.
tags:
- mi25
- rocm
# Branches on which this job is created.
only:
- ci-benchmark-extend
- internal_benchmark
- develop_stream
- develop
- master
script:
- mkdir build
- cd build
# Build rocPRIM benchmark
- cmake
-G Ninja
-D CMAKE_CXX_COMPILER=hipcc
-D CMAKE_BUILD_TYPE=Release
-D BUILD_TEST=OFF
-D BUILD_EXAMPLE=OFF
-D BUILD_BENCHMARK=ON
-D DISABLE_WERROR=OFF
-D AMDGPU_TARGETS="gfx803;gfx900;gfx906"
..
- cmake
--build .
# Keep the built benchmark directory contents and the googlebenchmark
# directory for two weeks.
artifacts:
paths:
- build/benchmark/*
- build/googlebenchmark/
expire_in: 2 weeks

include: '.gitlab-ci-gputest.yml'

# Benchmark jobs: three jobs that differ only in their runner tags
# (mi25, s9300, vega20). Each one is started manually (`when: manual`), runs in
# the `benchmark` stage, depends on `build:benchmark`, and invokes
# .gitlab/RunBenchmarks.cmake in CMake script mode (-P) against
# build/benchmark, with the output directory set to the current directory.
# `$SUDO_CMD` and `.deps:cmake-latest` are defined outside this excerpt.

# Runs on runners tagged `mi25` + `rocm`.
benchmark_view:rocm_mi25:
extends:
- .deps:cmake-latest
stage: benchmark
when: manual
only:
- ci-benchmark-extend
- internal_benchmark
- develop_stream
- develop
- master
needs:
- build:benchmark
tags:
- mi25
- rocm
script:
- $SUDO_CMD cmake
-D BENCHMARK_BINARY_DIR=build/benchmark
-D BENCHMARK_OUTPUT_DIR=.
-P ${CI_PROJECT_DIR}/.gitlab/RunBenchmarks.cmake

# Runs on runners tagged `s9300` + `rocm`.
benchmark_view:rocm_s9300:
extends:
- .deps:cmake-latest
stage: benchmark
when: manual
only:
- ci-benchmark-extend
- internal_benchmark
- develop_stream
- develop
- master
needs:
- build:benchmark
tags:
- s9300
- rocm
script:
- $SUDO_CMD cmake
-D BENCHMARK_BINARY_DIR=build/benchmark
-D BENCHMARK_OUTPUT_DIR=.
-P ${CI_PROJECT_DIR}/.gitlab/RunBenchmarks.cmake

# Runs on runners tagged `vega20` + `rocm`.
benchmark_view:rocm_vega20:
extends:
- .deps:cmake-latest
stage: benchmark
when: manual
only:
- ci-benchmark-extend
- internal_benchmark
- develop_stream
- develop
- master
needs:
- build:benchmark
tags:
- vega20
- rocm
script:
- $SUDO_CMD cmake
-D BENCHMARK_BINARY_DIR=build/benchmark
-D BENCHMARK_OUTPUT_DIR=.
-P ${CI_PROJECT_DIR}/.gitlab/RunBenchmarks.cmake

test:deb:
stage: test
needs:
Expand All @@ -192,7 +293,7 @@ test:deb:
extends:
- .deps:cmake-minimum
script:
- $SUDO_CMD dpkg -i $PACKAGE_DIR/rocprim-*.deb
- $SUDO_CMD dpkg -i $PACKAGE_DIR/rocprim*.deb
- mkdir -p package_test && cd package_test
- cmake
-G Ninja
Expand Down
125 changes: 125 additions & 0 deletions .gitlab/RunBenchmarks.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
# Command-line argument processing
#
# Inputs (passed with -D before -P on the cmake command line):
#   BENCHMARK_BINARY_DIR  directory containing the benchmark_* executables;
#                         defaults to the directory the script is run from.
#   BENCHMARK_OUTPUT_DIR  directory for result files; defaults to
#                         BENCHMARK_BINARY_DIR.
#   BENCHMARK_QUIET       when true, benchmark console output is not echoed.
if(NOT BENCHMARK_BINARY_DIR)
  message(STATUS "BENCHMARK_BINARY_DIR not provided, defaulting to working directory")
  set(BENCHMARK_BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}")
endif()
if(NOT BENCHMARK_OUTPUT_DIR)
  message(STATUS "BENCHMARK_OUTPUT_DIR not provided, defaulting to BENCHMARK_BINARY_DIR")
  set(BENCHMARK_OUTPUT_DIR "${BENCHMARK_BINARY_DIR}")
endif()

# OUTPUT_QUIET holds the execute_process() keyword of the same name when quiet
# mode is requested and is unset otherwise. The previous condition was
# inverted: it set the keyword when quiet mode was NOT requested.
if(BENCHMARK_QUIET)
  set(OUTPUT_QUIET OUTPUT_QUIET)
else()
  unset(OUTPUT_QUIET)
endif()

# Search for command-line tools
#
# curl is only used to upload results to BENCHMARK_REST_ENDPOINT, so a missing
# curl is fatal only when an endpoint was actually supplied. Console-only runs
# no longer need curl to be installed.
find_program(CURL_EXECUTABLE
  NAMES curl
)
if(BENCHMARK_REST_ENDPOINT AND NOT CURL_EXECUTABLE)
  message(FATAL_ERROR "curl executable not found. Please provide a path to it via CMAKE_PREFIX_PATH")
endif()

# Determine the commit hash that is attached to submitted results.
#
# The CI_COMMIT_SHA environment variable wins when it is defined; otherwise
# the hash is obtained by running `git rev-parse HEAD` in the current
# working directory. The result is left in GIT_HASH with surrounding
# whitespace removed.
if(DEFINED ENV{CI_COMMIT_SHA})
  set(GIT_HASH "$ENV{CI_COMMIT_SHA}")
  message(STATUS "Environment has CI_COMMIT_SHA: $ENV{CI_COMMIT_SHA}")
else()
  # REQUIRED already aborts the script when git cannot be found, so no
  # separate GIT_FOUND check is needed afterwards.
  find_package(Git
    REQUIRED
  )

  execute_process(
    COMMAND "${GIT_EXECUTABLE}"
            rev-parse HEAD
    RESULT_VARIABLE GIT_EXIT_CODE
    OUTPUT_VARIABLE GIT_HASH
    ERROR_VARIABLE GIT_STDERR
  )
  if(NOT GIT_EXIT_CODE EQUAL 0)
    # Include git's own diagnostics so the failure can be acted upon.
    message(FATAL_ERROR "git rev-parse HEAD returned exit code ${GIT_EXIT_CODE}\n${GIT_STDERR}")
  else()
    message(STATUS "git rev-parse HEAD reported hash: ${GIT_HASH}")
  endif()
endif()

# Quoted so that an empty value is still passed as one argument; unquoted,
# an empty GIT_HASH makes string(STRIP) fail with a wrong-argument-count error.
string(STRIP "${GIT_HASH}" GIT_HASH)

# Benchmark processing
#
# Runs every benchmark_* file found in BENCHMARK_BINARY_DIR. Without
# BENCHMARK_REST_ENDPOINT the console output is echoed (unless BENCHMARK_QUIET
# is set) and nothing else happens. With an endpoint, each benchmark also
# writes a JSON report to BENCHMARK_OUTPUT_DIR, the report's "context" object
# is extended with the device name and commit hash, and the file is POSTed to
# the endpoint with curl.
file(GLOB BENCHMARKS "${BENCHMARK_BINARY_DIR}/benchmark_*")
foreach(BENCHMARK IN LISTS BENCHMARKS)
  get_filename_component(BENCHMARK_NAME "${BENCHMARK}" NAME_WE)
  set(BENCHMARK_JSON_FILE "${BENCHMARK_OUTPUT_DIR}/${BENCHMARK_NAME}.json")

  if(BENCHMARK_REST_ENDPOINT) # else() not needed, as default is console dump.
    # Must be a list: execute_process() does not go through a shell, so a
    # single space-separated string would reach the benchmark as ONE argument.
    set(BENCHMARK_ARGS
      "--benchmark_format=json"
      "--benchmark_out=${BENCHMARK_JSON_FILE}"
    )
  endif()

  message(STATUS "Running ${BENCHMARK}")
  execute_process(
    COMMAND "${BENCHMARK}"
            ${BENCHMARK_ARGS}
    RESULT_VARIABLE BENCHMARK_EXIT_CODE
    OUTPUT_VARIABLE BENCHMARK_STDOUT
    ERROR_VARIABLE BENCHMARK_STDERR
  )
  if(NOT BENCHMARK_EXIT_CODE EQUAL 0)
    # message() concatenates its arguments without separators, so the
    # newlines are written explicitly.
    message(FATAL_ERROR
      "${BENCHMARK_NAME} returned exit code ${BENCHMARK_EXIT_CODE}\n"
      "Stdout:\n${BENCHMARK_STDOUT}\n"
      "Stderr:\n${BENCHMARK_STDERR}"
    )
  endif()
  if(NOT BENCHMARK_QUIET)
    message(STATUS "${BENCHMARK_STDOUT}")
  endif()

  if(NOT BENCHMARK_REST_ENDPOINT)
    continue() # If we're not submitting, no need to enrich and send over the wire.
  endif()

  # Enrich data
  #
  # NOTE: regex matching first line instead of entire BENCHMARK_STDOUT
  #       because searching up until the first newline character in any
  #       way is borked.
  # When no newline is present FIRST_NEWLINE is -1, and a length of -1 makes
  # string(SUBSTRING) return the whole string.
  string(FIND "${BENCHMARK_STDOUT}" "\n" FIRST_NEWLINE)
  string(SUBSTRING "${BENCHMARK_STDOUT}" 0 ${FIRST_NEWLINE} FIRST_LINE)
  string(REGEX MATCH
    [[^\[(HIP|CUDA)\] Device name: (.*)$]]
    DEVICE_MATCH
    "${FIRST_LINE}"
  )
  if(CMAKE_MATCH_0)
    set(DEVICE_NAME "${CMAKE_MATCH_2}")
  else()
    message(FATAL_ERROR
      "Device name not found on console output of ${BENCHMARK_NAME}. Output was:\n"
      "${BENCHMARK_STDOUT}"
    )
  endif()

  file(READ
    "${BENCHMARK_JSON_FILE}"
    BENCHMARK_FILEOUT
  )
  # Literal substitution: string(REPLACE) keeps any backslashes in the device
  # name intact, whereas a regex replacement expression would interpret them.
  string(REPLACE
    "\"context\": {"
    "\"context\": {\n \"device\": \"${DEVICE_NAME}\",\n \"hash\": \"${GIT_HASH}\","
    JSON_PAYLOAD
    "${BENCHMARK_FILEOUT}"
  )
  file(WRITE
    "${BENCHMARK_JSON_FILE}"
    "${JSON_PAYLOAD}"
  )

  # Submit data
  #
  # --fail makes curl exit non-zero on HTTP error responses, so a rejected
  # upload is caught by the exit-code check below.
  execute_process(
    COMMAND "${CURL_EXECUTABLE}"
            --fail
            --header "Content-Type: application/json"
            --request POST
            --data-binary "@${BENCHMARK_JSON_FILE}"
            "${BENCHMARK_REST_ENDPOINT}"
    RESULT_VARIABLE CURL_EXIT_CODE
    OUTPUT_VARIABLE CURL_STDOUT
    ERROR_VARIABLE CURL_STDERR
  )
  if(NOT CURL_EXIT_CODE EQUAL 0)
    message(FATAL_ERROR "curl returned exit code ${CURL_EXIT_CODE}\n${CURL_STDERR}")
  endif()
endforeach()
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,10 @@ Full documentation for rocPRIM is available at [https://codedocs.xyz/ROCmSoftwar
## [rocPRIM-2.10.9 for ROCm 4.2.0]
### Fixed
- Size zero inputs are now properly handled with newer ROCm builds that no longer allow zero-size kernel grid/block dimensions
- Device scan unit test failure fixed
### Changed
- Minimum cmake version required is now 3.10.2
### Known issues
- Device scan unit test currently failing due to LLVM bug.

## [rocPRIM-2.10.8 for ROCm 4.1.0]
### Fixed
Expand Down
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ include(cmake/Dependencies.cmake)
# Use target ID syntax if supported for AMDGPU_TARGETS
if(COMMAND rocm_check_target_ids)
rocm_check_target_ids(DEFAULT_AMDGPU_TARGETS
TARGETS "gfx803;gfx900:xnack-;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack-;gfx90a:xnack+"
TARGETS "gfx803;gfx900:xnack-;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack-;gfx90a:xnack+;gfx1030"
)
else()
# Detect compiler support for target ID
Expand Down
2 changes: 1 addition & 1 deletion benchmark/benchmark_block_discontinuity.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ template<
unsigned int Trials
>
__global__
__launch_bounds__(BlockSize, ROCPRIM_DEFAULT_MIN_WARPS_PER_EU)
__launch_bounds__(BlockSize)
void kernel(const T * d_input, T * d_output)
{
Runner::template run<T, BlockSize, ItemsPerThread, WithTile, Trials>(d_input, d_output);
Expand Down
2 changes: 1 addition & 1 deletion benchmark/benchmark_block_exchange.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ template<
unsigned int Trials
>
__global__
__launch_bounds__(BlockSize, ROCPRIM_DEFAULT_MIN_WARPS_PER_EU)
__launch_bounds__(BlockSize)
void kernel(const T * d_input, const unsigned int * d_ranks, T * d_output)
{
Runner::template run<T, BlockSize, ItemsPerThread, Trials>(d_input, d_ranks, d_output);
Expand Down
2 changes: 1 addition & 1 deletion benchmark/benchmark_block_histogram.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ template<
unsigned int Trials
>
__global__
__launch_bounds__(BlockSize, ROCPRIM_DEFAULT_MIN_WARPS_PER_EU)
__launch_bounds__(BlockSize)
void kernel(const T* input, T* output)
{
Runner::template run<T, BlockSize, ItemsPerThread, BinSize, Trials>(input, output);
Expand Down
4 changes: 2 additions & 2 deletions benchmark/benchmark_block_radix_sort.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ template<
unsigned int Trials
>
__global__
__launch_bounds__(BlockSize, ROCPRIM_DEFAULT_MIN_WARPS_PER_EU)
__launch_bounds__(BlockSize)
void sort_keys_kernel(const T * input, T * output)
{
const unsigned int lid = hipThreadIdx_x;
Expand All @@ -94,7 +94,7 @@ template<
unsigned int Trials
>
__global__
__launch_bounds__(BlockSize, ROCPRIM_DEFAULT_MIN_WARPS_PER_EU)
__launch_bounds__(BlockSize)
void sort_pairs_kernel(const T * input, T * output)
{
const unsigned int lid = hipThreadIdx_x;
Expand Down
8 changes: 7 additions & 1 deletion benchmark/benchmark_block_reduce.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ template<
unsigned int Trials
>
__global__
__launch_bounds__(BlockSize, ROCPRIM_DEFAULT_MIN_WARPS_PER_EU)
__launch_bounds__(BlockSize)
void kernel(const T* input, T* output)
{
Runner::template run<T, BlockSize, ItemsPerThread, Trials>(input, output);
Expand Down Expand Up @@ -257,6 +257,12 @@ int main(int argc, char *argv[])
add_benchmarks<reduce_rr_t>(
benchmarks, "reduce", "raking_reduce", stream, size
);
// reduce commutative only
using reduce_rrco_t = reduce<rocprim::block_reduce_algorithm::raking_reduce_commutative_only>;
add_benchmarks<reduce_rrco_t>(
benchmarks, "reduce", "raking_reduce_commutative_only", stream, size
);


// Use manual timing
for(auto& b : benchmarks)
Expand Down
2 changes: 1 addition & 1 deletion benchmark/benchmark_block_scan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ template<
unsigned int Trials
>
__global__
__launch_bounds__(BlockSize, ROCPRIM_DEFAULT_MIN_WARPS_PER_EU)
__launch_bounds__(BlockSize)
void kernel(const T* input, T* output)
{
Runner::template run<T, BlockSize, ItemsPerThread, Trials>(input, output);
Expand Down
4 changes: 2 additions & 2 deletions benchmark/benchmark_block_sort.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ template<
unsigned int Trials
>
__global__
__launch_bounds__(BlockSize, ROCPRIM_DEFAULT_MIN_WARPS_PER_EU)
__launch_bounds__(BlockSize)
void sort_keys_kernel(const T * input, T * output)
{
const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x;
Expand All @@ -90,7 +90,7 @@ template<
unsigned int Trials
>
__global__
__launch_bounds__(BlockSize, ROCPRIM_DEFAULT_MIN_WARPS_PER_EU)
__launch_bounds__(BlockSize)
void sort_pairs_kernel(const T * input, T * output)
{
const unsigned int index = (hipBlockIdx_x * BlockSize) + hipThreadIdx_x;
Expand Down
Loading

0 comments on commit d853622

Please sign in to comment.