diff --git a/include/alpaka/warp/Traits.hpp b/include/alpaka/warp/Traits.hpp index 15b66b14619c..f4cfb4d6bdc6 100644 --- a/include/alpaka/warp/Traits.hpp +++ b/include/alpaka/warp/Traits.hpp @@ -1,4 +1,4 @@ -/* Copyright 2022 Sergei Bastrakov, David M. Rogers, Bernhard Manfred Gruber +/* Copyright 2022 Sergei Bastrakov, David M. Rogers, Bernhard Manfred Gruber, Aurora Perego * SPDX-License-Identifier: MPL-2.0 */ @@ -39,6 +39,18 @@ namespace alpaka::warp template struct Shfl; + //! The shfl up warp swizzling trait. + template + struct ShflUp; + + //! The shfl down warp swizzling trait. + template + struct ShflDown; + + //! The shfl xor warp swizzling trait. + template + struct ShflXor; + //! The active mask trait. template struct Activemask; @@ -162,7 +174,7 @@ namespace alpaka::warp //! __shared__ int32_t values[warpsize]; //! values[threadIdx.x] = value; //! __syncthreads(); - //! return values[(srcLane + width*floor(threadIdx.x/width))%width]; + //! return values[width*(threadIdx.x/width) + srcLane%width]; //! //! However, it does not use shared memory. //! @@ -182,19 +194,124 @@ namespace alpaka::warp //! \param width number of threads receiving a single value //! \return val from the thread index srcLane. ALPAKA_NO_HOST_ACC_WARNING - template - ALPAKA_FN_ACC auto shfl(TWarp const& warp, std::int32_t value, std::int32_t srcLane, std::int32_t width = 0) + template + ALPAKA_FN_ACC auto shfl(TWarp const& warp, T value, std::int32_t srcLane, std::int32_t width = 0) { using ImplementationBase = concepts::ImplementationBase; return trait::Shfl::shfl(warp, value, srcLane, width ? width : getSize(warp)); } - //! shfl for float vals + //! Exchange data between threads within a warp. + //! It copies from a lane with lower ID relative to caller. + //! The lane ID is calculated by subtracting delta from the caller’s lane ID. + //! + //! Effectively executes: + //! + //! __shared__ int32_t values[warpsize]; + //! values[threadIdx.x] = value; + //! __syncthreads(); + //! return (threadIdx.x % width >= delta) ? values[threadIdx.x - delta] : values[threadIdx.x]; + //! + //! However, it does not use shared memory. + //! + //! Notes: + //! * The programmer must ensure that all threads calling this + //! function (and the srcLane) are executing the same line of code. + //! In particular it is not portable to write if(a) {shfl} else {shfl}. + //! + //! * Commonly used with width = warpsize (the default), (returns values[threadIdx.x - delta] if threadIdx.x >= + //! delta) + //! + //! * Width must be a power of 2. + //! + //! \tparam TWarp warp implementation type + //! \tparam T value type + //! \param warp warp implementation + //! \param value value to broadcast + //! \param offset corresponds to the delta used to compute the lane ID + //! \param width size of the group participating in the shuffle operation + //! \return val from the thread index lane ID. ALPAKA_NO_HOST_ACC_WARNING - template - ALPAKA_FN_ACC auto shfl(TWarp const& warp, float value, std::int32_t srcLane, std::int32_t width = 0) + template + ALPAKA_FN_ACC auto shfl_up(TWarp const& warp, T value, std::uint32_t offset, std::int32_t width = 0) { using ImplementationBase = concepts::ImplementationBase; - return trait::Shfl::shfl(warp, value, srcLane, width ? width : getSize(warp)); + return trait::ShflUp::shfl_up(warp, value, offset, width ? width : getSize(warp)); + } + + //! Exchange data between threads within a warp. + //! It copies from a lane with higher ID relative to caller. + //! The lane ID is calculated by adding delta to the caller’s lane ID. + //! + //! Effectively executes: + //! + //! __shared__ int32_t values[warpsize]; + //! values[threadIdx.x] = value; + //! __syncthreads(); + //! return (threadIdx.x % width + delta < width) ? values[threadIdx.x + delta] : values[threadIdx.x]; + //! + //! However, it does not use shared memory. + //! + //! Notes: + //! * The programmer must ensure that all threads calling this + //! function (and the srcLane) are executing the same line of code. + //! In particular it is not portable to write if(a) {shfl} else {shfl}. + //! + //! * Commonly used with width = warpsize (the default), (returns values[threadIdx.x+delta] if threadIdx.x+delta < + //! warpsize) + //! + //! * Width must be a power of 2. + //! + //! \tparam TWarp warp implementation type + //! \tparam T value type + //! \param warp warp implementation + //! \param value value to broadcast + //! \param offset corresponds to the delta used to compute the lane ID + //! \param width size of the group participating in the shuffle operation + //! \return val from the thread index lane ID. + ALPAKA_NO_HOST_ACC_WARNING + template + ALPAKA_FN_ACC auto shfl_down(TWarp const& warp, T value, std::uint32_t offset, std::int32_t width = 0) + { + using ImplementationBase = concepts::ImplementationBase; + return trait::ShflDown::shfl_down(warp, value, offset, width ? width : getSize(warp)); + } + + //! Exchange data between threads within a warp. + //! It copies from a lane based on bitwise XOR of own lane ID. + //! The lane ID is calculated by performing a bitwise XOR of the caller’s lane ID with mask + //! + //! Effectively executes: + //! + //! __shared__ int32_t values[warpsize]; + //! values[threadIdx.x] = value; + //! __syncthreads(); + //! int lane = threadIdx.x ^ mask; + //! return values[lane / width > threadIdx.x / width ? threadIdx.x : lane]; + //! + //! However, it does not use shared memory. + //! + //! Notes: + //! * The programmer must ensure that all threads calling this + //! function (and the srcLane) are executing the same line of code. + //! In particular it is not portable to write if(a) {shfl} else {shfl}. + //! + //! * Commonly used with width = warpsize (the default), (returns values[threadIdx.x^mask]) + //! + //! * Width must be a power of 2. + //! + //! \tparam TWarp warp implementation type + //! \tparam T value type + //! \param warp warp implementation + //! \param value value to broadcast + //! \param mask corresponds to the mask used to compute the lane ID + //! \param width size of the group participating in the shuffle operation + //! \return val from the thread index lane ID. + ALPAKA_NO_HOST_ACC_WARNING + template + ALPAKA_FN_ACC auto shfl_xor(TWarp const& warp, T value, std::int32_t mask, std::int32_t width = 0) + { + using ImplementationBase = concepts::ImplementationBase; + return trait::ShflXor::shfl_xor(warp, value, mask, width ? width : getSize(warp)); } } // namespace alpaka::warp diff --git a/include/alpaka/warp/WarpGenericSycl.hpp b/include/alpaka/warp/WarpGenericSycl.hpp index e420b615585b..425d97a25859 100644 --- a/include/alpaka/warp/WarpGenericSycl.hpp +++ b/include/alpaka/warp/WarpGenericSycl.hpp @@ -1,5 +1,11 @@ /* Copyright 2023 Jan Stephan, Luca Ferragina, Andrea Bocci, Aurora Perego * SPDX-License-Identifier: MPL-2.0 + * + * The implementations of Shfl::shfl(), ShflUp::shfl_up(), ShflDown::shfl_down() and ShflXor::shfl_xor() are derived + * from Intel DPCT. + * Copyright (C) Intel Corporation. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * See https://llvm.org/LICENSE.txt for license information. */ #pragma once @@ -114,7 +120,6 @@ namespace alpaka::warp::trait static auto shfl(warp::WarpGenericSycl const& warp, T value, std::int32_t srcLane, std::int32_t width) { ALPAKA_ASSERT_OFFLOAD(width > 0); - ALPAKA_ASSERT_OFFLOAD(srcLane < width); ALPAKA_ASSERT_OFFLOAD(srcLane >= 0); /* If width < srcLane the sub-group needs to be split into assumed subdivisions. The first item of each @@ -124,12 +129,70 @@ namespace alpaka::warp::trait The first starts at sub-group index 0 and the second at sub-group index 16. For srcLane = 4 the first subdivision will access the value at sub-group index 4 and the second at sub-group index 20. */ auto const actual_group = warp.m_item_warp.get_sub_group(); - auto const actual_item_id = static_cast(actual_group.get_local_linear_id()); - auto const actual_group_id = actual_item_id / width; - auto const actual_src_id = static_cast(srcLane + actual_group_id * width); - auto const src = sycl::id<1>{actual_src_id}; + std::uint32_t const w = static_cast(width); + std::uint32_t const start_index = actual_group.get_local_linear_id() / w * w; + return sycl::select_from_group(actual_group, value, start_index + static_cast(srcLane) % w); + } + }; + + template + struct ShflUp> + { + template + static auto shfl_up( + warp::WarpGenericSycl const& warp, + T value, + std::uint32_t offset, /* must be the same for all work-items in the group */ + std::int32_t width) + { + auto const actual_group = warp.m_item_warp.get_sub_group(); + std::uint32_t const w = static_cast(width); + std::uint32_t const id = actual_group.get_local_linear_id(); + std::uint32_t const start_index = id / w * w; + T result = sycl::shift_group_right(actual_group, value, offset); + if((id - start_index) < offset) + { + result = value; + } + return result; + } + }; - return sycl::select_from_group(actual_group, value, src); + template + struct ShflDown> + { + template + static auto shfl_down( + warp::WarpGenericSycl const& warp, + T value, + std::uint32_t offset, + std::int32_t width) + { + auto const actual_group = warp.m_item_warp.get_sub_group(); + std::uint32_t const w = static_cast(width); + std::uint32_t const id = actual_group.get_local_linear_id(); + std::uint32_t const end_index = (id / w + 1) * w; + T result = sycl::shift_group_left(actual_group, value, offset); + if((id + offset) >= end_index) + { + result = value; + } + return result; + } + }; + + template + struct ShflXor> + { + template + static auto shfl_xor(warp::WarpGenericSycl const& warp, T value, std::int32_t mask, std::int32_t width) + { + auto const actual_group = warp.m_item_warp.get_sub_group(); + std::uint32_t const w = static_cast(width); + std::uint32_t const id = actual_group.get_local_linear_id(); + std::uint32_t const start_index = id / w * w; + std::uint32_t const target_offset = (id % w) ^ static_cast(mask); + return sycl::select_from_group(actual_group, value, target_offset < w ? start_index + target_offset : id); } }; } // namespace alpaka::warp::trait diff --git a/include/alpaka/warp/WarpSingleThread.hpp b/include/alpaka/warp/WarpSingleThread.hpp index 950dd8286161..d2713032551d 100644 --- a/include/alpaka/warp/WarpSingleThread.hpp +++ b/include/alpaka/warp/WarpSingleThread.hpp @@ -1,4 +1,4 @@ -/* Copyright 2022 Sergei Bastrakov, David M. Rogers, Bernhard Manfred Gruber +/* Copyright 2022 Sergei Bastrakov, David M. Rogers, Bernhard Manfred Gruber, Aurora Perego * SPDX-License-Identifier: MPL-2.0 */ @@ -65,18 +65,52 @@ namespace alpaka::warp template<> struct Shfl { + template static auto shfl( warp::WarpSingleThread const& /*warp*/, - std::int32_t val, + T val, std::int32_t /*srcLane*/, std::int32_t /*width*/) { return val; } + }; - static auto shfl( + template<> + struct ShflUp + { + template + static auto shfl_up( + warp::WarpSingleThread const& /*warp*/, + T val, + std::uint32_t /*srcLane*/, + std::int32_t /*width*/) + { + return val; + } + }; + + template<> + struct ShflDown + { + template + static auto shfl_down( + warp::WarpSingleThread const& /*warp*/, + T val, + std::uint32_t /*srcLane*/, + std::int32_t /*width*/) + { + return val; + } + }; + + template<> + struct ShflXor + { + template + static auto shfl_xor( warp::WarpSingleThread const& /*warp*/, - float val, + T val, std::int32_t /*srcLane*/, std::int32_t /*width*/) { diff --git a/include/alpaka/warp/WarpUniformCudaHipBuiltIn.hpp b/include/alpaka/warp/WarpUniformCudaHipBuiltIn.hpp index 1e7d98d82d7d..58b89d5f81d9 100644 --- a/include/alpaka/warp/WarpUniformCudaHipBuiltIn.hpp +++ b/include/alpaka/warp/WarpUniformCudaHipBuiltIn.hpp @@ -1,4 +1,4 @@ -/* Copyright 2022 Sergei Bastrakov, David M. Rogers, Jan Stephan, Andrea Bocci, Bernhard Manfred Gruber +/* Copyright 2022 Sergei Bastrakov, David M. Rogers, Jan Stephan, Andrea Bocci, Bernhard Manfred Gruber, Aurora Perego * SPDX-License-Identifier: MPL-2.0 */ @@ -113,12 +113,12 @@ namespace alpaka::warp template<> struct Shfl { - //------------------------------------------------------------- + template __device__ static auto shfl( [[maybe_unused]] warp::WarpUniformCudaHipBuiltIn const& warp, - float val, + T val, int srcLane, - std::int32_t width) -> float + std::int32_t width) -> T { # if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) return __shfl_sync(activemask(warp), val, srcLane, width); @@ -126,21 +126,62 @@ namespace alpaka::warp return __shfl(val, srcLane, width); # endif } + }; - //------------------------------------------------------------- - __device__ static auto shfl( + template<> + struct ShflUp + { + template + __device__ static auto shfl_up( [[maybe_unused]] warp::WarpUniformCudaHipBuiltIn const& warp, - std::int32_t val, - int srcLane, - std::int32_t width) -> std::int32_t + T val, + std::uint32_t offset, + std::int32_t width) -> T { # if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) - return __shfl_sync(activemask(warp), val, srcLane, width); + return __shfl_up_sync(activemask(warp), val, offset, width); # else - return __shfl(val, srcLane, width); + return __shfl_up(val, offset, width); +# endif + } + }; + + template<> + struct ShflDown + { + template + __device__ static auto shfl_down( + [[maybe_unused]] warp::WarpUniformCudaHipBuiltIn const& warp, + T val, + std::uint32_t offset, + std::int32_t width) -> T + { +# if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) + return __shfl_down_sync(activemask(warp), val, offset, width); +# else + return __shfl_down(val, offset, width); +# endif + } + }; + + template<> + struct ShflXor + { + template + __device__ static auto shfl_xor( + [[maybe_unused]] warp::WarpUniformCudaHipBuiltIn const& warp, + T val, + std::int32_t mask, + std::int32_t width) -> T + { +# if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) + return __shfl_xor_sync(activemask(warp), val, mask, width); +# else + return __shfl_xor(val, mask, width); # endif } }; + } // namespace trait # endif } // namespace alpaka::warp diff --git a/test/unit/warp/src/ShflDown.cpp b/test/unit/warp/src/ShflDown.cpp new file mode 100644 index 000000000000..3663dc239829 --- /dev/null +++ b/test/unit/warp/src/ShflDown.cpp @@ -0,0 +1,175 @@ +/* Copyright 2023 Aurora Perego + * + * This file is part of Alpaka. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#if BOOST_COMP_GNUC +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wstrict-overflow" +#endif + +struct ShflDownSingleThreadWarpTestKernel +{ + ALPAKA_NO_HOST_ACC_WARNING + template + ALPAKA_FN_ACC auto operator()(TAcc const& acc, bool* success) const -> void + { + if constexpr(alpaka::Dim::value > 0) + { + ALPAKA_CHECK(*success, alpaka::warp::getSize(acc) == 1); + ALPAKA_CHECK(*success, alpaka::warp::shfl_down(acc, 42, 0) == 42); + } + else + { + ALPAKA_CHECK(*success, alpaka::warp::shfl_down(acc, 42, 0, 1) == 42); + } + ALPAKA_CHECK(*success, alpaka::warp::shfl_down(acc, 12, 0) == 12); + float ans = alpaka::warp::shfl_down(acc, 3.3f, 0); + ALPAKA_CHECK(*success, alpaka::math::floatEqualExactNoWarning(ans, 3.3f)); + } +}; + +template +struct ShflDownMultipleThreadWarpTestKernel +{ + ALPAKA_NO_HOST_ACC_WARNING + template + ALPAKA_FN_ACC auto operator()(TAcc const& acc, bool* success) const -> void + { + auto const localThreadIdx = alpaka::getIdx(acc); + auto const blockExtent = alpaka::getWorkDiv(acc); + std::int32_t const warpExtent = alpaka::warp::getSize(acc); + // Test relies on having a single warp per thread block + ALPAKA_CHECK(*success, static_cast(blockExtent.prod()) == warpExtent); + auto const threadIdxInWarp = std::int32_t(alpaka::mapIdx<1u>(localThreadIdx, blockExtent)[0]); + + ALPAKA_CHECK(*success, warpExtent > 1); + + ALPAKA_CHECK(*success, alpaka::warp::shfl_down(acc, 42, 0) == 42); + ALPAKA_CHECK(*success, alpaka::warp::shfl_down(acc, threadIdxInWarp, 0) == threadIdxInWarp); + ALPAKA_CHECK( + *success, + alpaka::warp::shfl_down(acc, threadIdxInWarp, 1) + == (threadIdxInWarp + 1 < warpExtent ? threadIdxInWarp + 1 : threadIdxInWarp)); + auto const epsilon = std::numeric_limits::epsilon(); + + // Test various widths + for(int width = 1; width < warpExtent; width *= 2) + { + for(int idx = 0; idx < width; idx++) + { + int const off = width * (threadIdxInWarp / width); + ALPAKA_CHECK( + *success, + alpaka::warp::shfl_down(acc, threadIdxInWarp, static_cast(idx), width) + == ((threadIdxInWarp + idx < (width + off)) ? threadIdxInWarp + idx : threadIdxInWarp)); + float const ans = alpaka::warp::shfl_down( + acc, + 4.0f - float(threadIdxInWarp), + static_cast(idx), + width); + float const expect + = ((threadIdxInWarp + idx < (width + off)) ? (4.0f - float(threadIdxInWarp + idx)) + : (4.0f - float(threadIdxInWarp))); + ALPAKA_CHECK(*success, alpaka::math::abs(acc, ans - expect) < epsilon); + } + } + + // Some threads quit the kernel to test that the warp operations + // properly operate on the active threads only + if(threadIdxInWarp >= warpExtent / 2) + return; + + for(int idx = 0; idx < warpExtent / 2; idx++) + { + auto const shfl = alpaka::warp::shfl_down(acc, threadIdxInWarp, static_cast(idx)); + float const ans + = alpaka::warp::shfl_down(acc, 4.0f - float(threadIdxInWarp), static_cast(idx)); + float const expect + = ((threadIdxInWarp + idx < warpExtent / 2) ? (4.0f - float(threadIdxInWarp + idx)) : 0); + if(threadIdxInWarp + idx < warpExtent / 2) + { + ALPAKA_CHECK(*success, shfl == threadIdxInWarp + idx); + ALPAKA_CHECK(*success, alpaka::math::abs(acc, ans - expect) < epsilon); + } + } + } +}; + +template +struct alpaka::trait::WarpSize, TAcc> + : std::integral_constant +{ +}; + +TEMPLATE_LIST_TEST_CASE("shfl_down", "[warp]", alpaka::test::TestAccs) +{ + using Acc = TestType; + using Dev = alpaka::Dev; + using Dim = alpaka::Dim; + using Idx = alpaka::Idx; + + auto const platform = alpaka::Platform{}; + Dev const dev(alpaka::getDevByIdx(platform, 0u)); + auto const warpExtents = alpaka::getWarpSizes(dev); + for(auto const warpExtent : warpExtents) + { + auto const scalar = Dim::value == 0 || warpExtent == 1; + if(scalar) + { + alpaka::test::KernelExecutionFixture fixture(alpaka::Vec::all(4)); + REQUIRE(fixture(ShflDownSingleThreadWarpTestKernel{})); + } + else + { + using ExecutionFixture = alpaka::test::KernelExecutionFixture; + auto const gridBlockExtent = alpaka::Vec::all(2); + // Enforce one warp per thread block + auto blockThreadExtent = alpaka::Vec::ones(); + blockThreadExtent[0] = static_cast(warpExtent); + auto const threadElementExtent = alpaka::Vec::ones(); + auto workDiv = typename ExecutionFixture::WorkDiv{gridBlockExtent, blockThreadExtent, threadElementExtent}; + auto fixture = ExecutionFixture{workDiv}; + if(warpExtent == 4) + { + REQUIRE(fixture(ShflDownMultipleThreadWarpTestKernel<4>{})); + } + else if(warpExtent == 8) + { + REQUIRE(fixture(ShflDownMultipleThreadWarpTestKernel<8>{})); + } + else if(warpExtent == 16) + { + REQUIRE(fixture(ShflDownMultipleThreadWarpTestKernel<16>{})); + } + else if(warpExtent == 32) + { + REQUIRE(fixture(ShflDownMultipleThreadWarpTestKernel<32>{})); + } + else if(warpExtent == 64) + { + REQUIRE(fixture(ShflDownMultipleThreadWarpTestKernel<64>{})); + } + } + } +} + +#if BOOST_COMP_GNUC +# pragma GCC diagnostic pop +#endif diff --git a/test/unit/warp/src/ShflUp.cpp b/test/unit/warp/src/ShflUp.cpp new file mode 100644 index 000000000000..c0aa8d0832fa --- /dev/null +++ b/test/unit/warp/src/ShflUp.cpp @@ -0,0 +1,167 @@ +/* Copyright 2023 Aurora Perego + * + * This file is part of Alpaka. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +struct ShflUpSingleThreadWarpTestKernel +{ + ALPAKA_NO_HOST_ACC_WARNING + template + ALPAKA_FN_ACC auto operator()(TAcc const& acc, bool* success) const -> void + { + if constexpr(alpaka::Dim::value > 0) + { + ALPAKA_CHECK(*success, alpaka::warp::getSize(acc) == 1); + ALPAKA_CHECK(*success, alpaka::warp::shfl_up(acc, 42, 0) == 42); + } + else + { + ALPAKA_CHECK(*success, alpaka::warp::shfl_up(acc, 42, 0, 1) == 42); + } + ALPAKA_CHECK(*success, alpaka::warp::shfl_up(acc, 12, 0) == 12); + float ans = alpaka::warp::shfl_up(acc, 3.3f, 0); + ALPAKA_CHECK(*success, alpaka::math::floatEqualExactNoWarning(ans, 3.3f)); + } +}; + +template +struct ShflUpMultipleThreadWarpTestKernel +{ + ALPAKA_NO_HOST_ACC_WARNING + template + ALPAKA_FN_ACC auto operator()(TAcc const& acc, bool* success) const -> void + { + auto const localThreadIdx = alpaka::getIdx(acc); + auto const blockExtent = alpaka::getWorkDiv(acc); + std::int32_t const warpExtent = alpaka::warp::getSize(acc); + // Test relies on having a single warp per thread block + ALPAKA_CHECK(*success, static_cast(blockExtent.prod()) == warpExtent); + auto const threadIdxInWarp = std::int32_t(alpaka::mapIdx<1u>(localThreadIdx, blockExtent)[0]); + + ALPAKA_CHECK(*success, warpExtent > 1); + + ALPAKA_CHECK(*success, alpaka::warp::shfl_up(acc, 42, 0) == 42); + ALPAKA_CHECK(*success, alpaka::warp::shfl_up(acc, threadIdxInWarp, 0) == threadIdxInWarp); + ALPAKA_CHECK( + *success, + alpaka::warp::shfl_up(acc, threadIdxInWarp, 1) + == (threadIdxInWarp - 1 >= 0 ? threadIdxInWarp - 1 : threadIdxInWarp)); + + auto const epsilon = std::numeric_limits::epsilon(); + + // Test various widths + for(int width = 1; width < warpExtent; width *= 2) + { + for(int idx = 0; idx < width; idx++) + { + int const off = width * (threadIdxInWarp / width); + ALPAKA_CHECK( + *success, + alpaka::warp::shfl_up(acc, threadIdxInWarp, static_cast(idx), width) + == ((threadIdxInWarp - idx >= off) ? threadIdxInWarp - idx : threadIdxInWarp)); + float const ans = alpaka::warp::shfl_up( + acc, + 4.0f - float(threadIdxInWarp), + static_cast(idx), + width); + float const expect + = ((threadIdxInWarp - idx >= off) ? (4.0f - float(threadIdxInWarp - idx)) + : (4.0f - float(threadIdxInWarp))); + ALPAKA_CHECK(*success, alpaka::math::abs(acc, ans - expect) < epsilon); + } + } + + // Some threads quit the kernel to test that the warp operations + // properly operate on the active threads only + if(threadIdxInWarp >= warpExtent / 2) + return; + + for(int idx = 0; idx < warpExtent / 2; idx++) + { + ALPAKA_CHECK( + *success, + alpaka::warp::shfl_up(acc, threadIdxInWarp, static_cast(idx)) + == ((threadIdxInWarp - idx >= 0) ? (threadIdxInWarp - idx) : threadIdxInWarp)); + float const ans + = alpaka::warp::shfl_up(acc, 4.0f - float(threadIdxInWarp), static_cast(idx)); + float const expect + = ((threadIdxInWarp - idx >= 0) ? (4.0f - float(threadIdxInWarp - idx)) + : (4.0f - float(threadIdxInWarp))); + ALPAKA_CHECK(*success, alpaka::math::abs(acc, ans - expect) < epsilon); + } + } +}; + +template +struct alpaka::trait::WarpSize, TAcc> + : std::integral_constant +{ +}; + +TEMPLATE_LIST_TEST_CASE("shfl_up", "[warp]", alpaka::test::TestAccs) +{ + using Acc = TestType; + using Dev = alpaka::Dev; + using Dim = alpaka::Dim; + using Idx = alpaka::Idx; + + auto const platform = alpaka::Platform{}; + Dev const dev(alpaka::getDevByIdx(platform, 0u)); + auto const warpExtents = alpaka::getWarpSizes(dev); + for(auto const warpExtent : warpExtents) + { + auto const scalar = Dim::value == 0 || warpExtent == 1; + if(scalar) + { + alpaka::test::KernelExecutionFixture fixture(alpaka::Vec::all(4)); + REQUIRE(fixture(ShflUpSingleThreadWarpTestKernel{})); + } + else + { + using ExecutionFixture = alpaka::test::KernelExecutionFixture; + auto const gridBlockExtent = alpaka::Vec::all(2); + // Enforce one warp per thread block + auto blockThreadExtent = alpaka::Vec::ones(); + blockThreadExtent[0] = static_cast(warpExtent); + auto const threadElementExtent = alpaka::Vec::ones(); + auto workDiv = typename ExecutionFixture::WorkDiv{gridBlockExtent, blockThreadExtent, threadElementExtent}; + auto fixture = ExecutionFixture{workDiv}; + if(warpExtent == 4) + { + REQUIRE(fixture(ShflUpMultipleThreadWarpTestKernel<4>{})); + } + else if(warpExtent == 8) + { + REQUIRE(fixture(ShflUpMultipleThreadWarpTestKernel<8>{})); + } + else if(warpExtent == 16) + { + REQUIRE(fixture(ShflUpMultipleThreadWarpTestKernel<16>{})); + } + else if(warpExtent == 32) + { + REQUIRE(fixture(ShflUpMultipleThreadWarpTestKernel<32>{})); + } + else if(warpExtent == 64) + { + REQUIRE(fixture(ShflUpMultipleThreadWarpTestKernel<64>{})); + } + } + } +} diff --git a/test/unit/warp/src/ShflXor.cpp b/test/unit/warp/src/ShflXor.cpp new file mode 100644 index 000000000000..666f0d02a096 --- /dev/null +++ b/test/unit/warp/src/ShflXor.cpp @@ -0,0 +1,152 @@ +/* Copyright 2023 Aurora Perego + * + * This file is part of Alpaka. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +struct ShflXorSingleThreadWarpTestKernel +{ + ALPAKA_NO_HOST_ACC_WARNING + template + ALPAKA_FN_ACC auto operator()(TAcc const& acc, bool* success) const -> void + { + if constexpr(alpaka::Dim::value > 0) + { + ALPAKA_CHECK(*success, alpaka::warp::getSize(acc) == 1); + ALPAKA_CHECK(*success, alpaka::warp::shfl_xor(acc, 42, -1) == 42); + } + else + { + ALPAKA_CHECK(*success, alpaka::warp::shfl_xor(acc, 42, 0, 1) == 42); + } + ALPAKA_CHECK(*success, alpaka::warp::shfl_xor(acc, 12, 0) == 12); + float ans = alpaka::warp::shfl_xor(acc, 3.3f, 0); + ALPAKA_CHECK(*success, alpaka::math::floatEqualExactNoWarning(ans, 3.3f)); + } +}; + +template +struct ShflXorMultipleThreadWarpTestKernel +{ + ALPAKA_NO_HOST_ACC_WARNING + template + ALPAKA_FN_ACC auto operator()(TAcc const& acc, bool* success) const -> void + { + auto const localThreadIdx = alpaka::getIdx(acc); + auto const blockExtent = alpaka::getWorkDiv(acc); + std::int32_t const warpExtent = alpaka::warp::getSize(acc); + // Test relies on having a single warp per thread block + ALPAKA_CHECK(*success, static_cast(blockExtent.prod()) == warpExtent); + auto const threadIdxInWarp = std::int32_t(alpaka::mapIdx<1u>(localThreadIdx, blockExtent)[0]); + + ALPAKA_CHECK(*success, warpExtent > 1); + + ALPAKA_CHECK(*success, alpaka::warp::shfl_xor(acc, 42, 0) == 42); + ALPAKA_CHECK(*success, alpaka::warp::shfl_xor(acc, threadIdxInWarp, 0) == threadIdxInWarp); + ALPAKA_CHECK(*success, alpaka::warp::shfl_xor(acc, threadIdxInWarp, 1) == (threadIdxInWarp ^ 1)); + ALPAKA_CHECK(*success, alpaka::warp::shfl_xor(acc, 5, -1) == 5); + + auto const epsilon = std::numeric_limits::epsilon(); + + // Test various widths + for(int width = 1; width < warpExtent; width *= 2) + { + for(int idx = 0; idx < width; idx++) + { + ALPAKA_CHECK( + *success, + alpaka::warp::shfl_xor(acc, threadIdxInWarp, idx, width) == (threadIdxInWarp ^ idx)); + float const ans = alpaka::warp::shfl_xor(acc, 4.0f - float(threadIdxInWarp), idx, width); + float const expect = 4.0f - float(threadIdxInWarp ^ idx); + ALPAKA_CHECK(*success, alpaka::math::abs(acc, ans - expect) < epsilon); + } + } + + // Some threads quit the kernel to test that the warp operations + // properly operate on the active threads only + if(threadIdxInWarp >= warpExtent / 2) + return; + + for(int idx = 0; idx < warpExtent / 2; idx++) + { + ALPAKA_CHECK(*success, alpaka::warp::shfl_xor(acc, threadIdxInWarp, idx) == (threadIdxInWarp ^ idx)); + float const ans = alpaka::warp::shfl_xor(acc, 4.0f - float(threadIdxInWarp), idx); + float const expect = 4.0f - float(threadIdxInWarp ^ idx); + ALPAKA_CHECK(*success, alpaka::math::abs(acc, ans - expect) < epsilon); + } + } +}; + +template +struct alpaka::trait::WarpSize, TAcc> + : std::integral_constant +{ +}; + +TEMPLATE_LIST_TEST_CASE("shfl_xor", "[warp]", alpaka::test::TestAccs) +{ + using Acc = TestType; + using Dev = alpaka::Dev; + using Dim = alpaka::Dim; + using Idx = alpaka::Idx; + + auto const platform = alpaka::Platform{}; + Dev const dev(alpaka::getDevByIdx(platform, 0u)); + auto const warpExtents = alpaka::getWarpSizes(dev); + for(auto const warpExtent : warpExtents) + { + auto const scalar = Dim::value == 0 || warpExtent == 1; + if(scalar) + { + alpaka::test::KernelExecutionFixture fixture(alpaka::Vec::all(4)); + REQUIRE(fixture(ShflXorSingleThreadWarpTestKernel{})); + } + else + { + using ExecutionFixture = alpaka::test::KernelExecutionFixture; + auto const gridBlockExtent = alpaka::Vec::all(2); + // Enforce one warp per thread block + auto blockThreadExtent = alpaka::Vec::ones(); + blockThreadExtent[0] = static_cast(warpExtent); + auto const threadElementExtent = alpaka::Vec::ones(); + auto workDiv = typename ExecutionFixture::WorkDiv{gridBlockExtent, blockThreadExtent, threadElementExtent}; + auto fixture = ExecutionFixture{workDiv}; + if(warpExtent == 4) + { + REQUIRE(fixture(ShflXorMultipleThreadWarpTestKernel<4>{})); + } + else if(warpExtent == 8) + { + REQUIRE(fixture(ShflXorMultipleThreadWarpTestKernel<8>{})); + } + else if(warpExtent == 16) + { + REQUIRE(fixture(ShflXorMultipleThreadWarpTestKernel<16>{})); + } + else if(warpExtent == 32) + { + REQUIRE(fixture(ShflXorMultipleThreadWarpTestKernel<32>{})); + } + else if(warpExtent == 64) + { + REQUIRE(fixture(ShflXorMultipleThreadWarpTestKernel<64>{})); + } + } + } +}