diff --git a/include/alpaka/warp/Traits.hpp b/include/alpaka/warp/Traits.hpp index a7b8354f2f54..35204c118ecd 100644 --- a/include/alpaka/warp/Traits.hpp +++ b/include/alpaka/warp/Traits.hpp @@ -1,4 +1,4 @@ -/* Copyright 2020 Sergei Bastrakov +/* Copyright 2020-2021 Sergei Bastrakov, David M. Rogers * * This file is part of Alpaka. * @@ -49,6 +49,11 @@ namespace alpaka template struct Ballot; + //############################################################################# + //! The shfl warp swizzling trait. + template + struct Shfl; + //############################################################################# //! The active mask trait. template @@ -150,5 +155,50 @@ namespace alpaka using ImplementationBase = concepts::ImplementationBase; return traits::Ballot::ballot(warp, predicate); } + + //----------------------------------------------------------------------------- + //! Exchange data between threads within a warp. + //! + //! Effectively executes: + //! + //! __shared__ int32_t values[warpsize]; + //! values[threadIdx.x] = value; + //! __syncthreads(); + //! return values[(srcLane + width*floor(threadIdx.x/width))%width]; + //! + //! However, it does not use shared memory. + //! + //! Notes: + //! * The programmer must ensure that all threads calling this + //! function (and the srcLane) are executing the same line of code. + //! In particular it is not portable to write if(a) {shfl} else {shfl}. + //! + //! * Commonly used with width = warpsize (the default), (returns values[srcLane]) + //! + //! * Width must be a power of 2. + //! + //! \tparam TWarp warp implementation type + //! \param warp warp implementation + //! \param value value to broadcast (only meaningful from threadIdx == srcLane) + //! \param srcLane source lane sending value + //! \param width number of threads receiving a single value + //! \return val from the thread index srcLane. + ALPAKA_NO_HOST_ACC_WARNING + template + ALPAKA_FN_ACC auto shfl(TWarp const& warp, std::int32_t value, std::int32_t srcLane, std::int32_t width = 0) + { + using ImplementationBase = concepts::ImplementationBase; + return traits::Shfl::shfl(warp, value, srcLane, width ? width : getSize(warp)); + } + + //----------------------------------------------------------------------------- + //! shfl for float vals + ALPAKA_NO_HOST_ACC_WARNING + template + ALPAKA_FN_ACC auto shfl(TWarp const& warp, float value, std::int32_t srcLane, std::int32_t width = 0) + { + using ImplementationBase = concepts::ImplementationBase; + return traits::Shfl::shfl(warp, value, srcLane, width ? width : getSize(warp)); + } } // namespace warp } // namespace alpaka diff --git a/include/alpaka/warp/WarpSingleThread.hpp b/include/alpaka/warp/WarpSingleThread.hpp index 1451aa0de7f8..0d29b594010c 100644 --- a/include/alpaka/warp/WarpSingleThread.hpp +++ b/include/alpaka/warp/WarpSingleThread.hpp @@ -1,4 +1,4 @@ -/* Copyright 2020 Sergei Bastrakov +/* Copyright 2020-2021 Sergei Bastrakov, David M. Rogers * * This file is part of Alpaka. * @@ -92,6 +92,30 @@ namespace alpaka return predicate ? 1u : 0u; } }; + + //################################################################# + template<> + struct Shfl + { + //------------------------------------------------------------- + static auto shfl( + warp::WarpSingleThread const& /*warp*/, + std::int32_t val, + std::int32_t /*srcLane*/, + std::int32_t /*width*/) + { + return val; + } + //------------------------------------------------------------- + static auto shfl( + warp::WarpSingleThread const& /*warp*/, + float val, + std::int32_t /*srcLane*/, + std::int32_t /*width*/) + { + return val; + } + }; } // namespace traits } // namespace warp } // namespace alpaka diff --git a/include/alpaka/warp/WarpUniformCudaHipBuiltIn.hpp b/include/alpaka/warp/WarpUniformCudaHipBuiltIn.hpp index 09d958444455..7d8709b91c39 100644 --- a/include/alpaka/warp/WarpUniformCudaHipBuiltIn.hpp +++ b/include/alpaka/warp/WarpUniformCudaHipBuiltIn.hpp @@ -1,4 +1,4 @@ -/* Copyright 2020 Sergei Bastrakov +/* Copyright 2020-2021 Sergei Bastrakov, David M. Rogers * * This file is part of Alpaka. * @@ -142,6 +142,38 @@ namespace alpaka # else ignore_unused(warp); return __ballot(predicate); +# endif + } + }; + + //################################################################# + template<> + struct Shfl + { + //------------------------------------------------------------- + __device__ static auto shfl( + warp::WarpUniformCudaHipBuiltIn const& warp, + float val, + int srcLane, + std::int32_t width) -> float + { +# if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) + return __shfl_sync(activemask(warp), val, srcLane, width); +# else + return __shfl(val, srcLane, width); +# endif + } + //------------------------------------------------------------- + __device__ static auto shfl( + warp::WarpUniformCudaHipBuiltIn const& warp, + std::int32_t val, + int srcLane, + std::int32_t width) -> std::int32_t + { +# if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) + return __shfl_sync(activemask(warp), val, srcLane, width); +# else + return __shfl(val, srcLane, width); # endif } }; diff --git a/test/unit/warp/src/Shfl.cpp b/test/unit/warp/src/Shfl.cpp new file mode 100644 index 000000000000..0029dc9cda0b --- /dev/null +++ b/test/unit/warp/src/Shfl.cpp @@ -0,0 +1,130 @@ +/* Copyright 2021 David M. Rogers + * + * This file is part of Alpaka. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#include +#include +#include +#include + +#include + +#include +#include + +//############################################################################# +class ShflSingleThreadWarpTestKernel +{ +public: + //------------------------------------------------------------------------- + ALPAKA_NO_HOST_ACC_WARNING + template + ALPAKA_FN_ACC auto operator()(TAcc const& acc, bool* success) const -> void + { + std::int32_t const warpExtent = alpaka::warp::getSize(acc); + ALPAKA_CHECK(*success, warpExtent == 1); + + ALPAKA_CHECK(*success, alpaka::warp::shfl(acc, 12, 0) == 12); + ALPAKA_CHECK(*success, alpaka::warp::shfl(acc, 42, -1) == 42); + float ans = alpaka::warp::shfl(acc, 3.3f, 0); + ALPAKA_CHECK(*success, alpaka::math::abs(acc, ans - 3.3f) < 1e-8f); + } +}; + +//############################################################################# +class ShflMultipleThreadWarpTestKernel +{ +public: + //----------------------------------------------------------------------------- + ALPAKA_NO_HOST_ACC_WARNING + template + ALPAKA_FN_ACC auto operator()(TAcc const& acc, bool* success) const -> void + { + auto const localThreadIdx = alpaka::getIdx(acc); + auto const blockExtent = alpaka::getWorkDiv(acc); + std::int32_t const warpExtent = alpaka::warp::getSize(acc); + // Test relies on having a single warp per thread block + ALPAKA_CHECK(*success, static_cast(blockExtent.prod()) == warpExtent); + auto const threadIdxInWarp = std::int32_t(alpaka::mapIdx<1u>(localThreadIdx, blockExtent)[0]); + + ALPAKA_CHECK(*success, warpExtent > 1); + + ALPAKA_CHECK(*success, alpaka::warp::shfl(acc, 42, 0) == 42); + ALPAKA_CHECK(*success, alpaka::warp::shfl(acc, threadIdxInWarp, 0) == 0); + ALPAKA_CHECK(*success, alpaka::warp::shfl(acc, threadIdxInWarp, 1) == 1); + // Note the CUDA and HIP API-s differ on lane wrapping, but both agree it should not segfault + // https://github.com/ROCm-Developer-Tools/HIP-CPU/issues/14 + ALPAKA_CHECK(*success, alpaka::warp::shfl(acc, 5, -1) == 5); + + auto const epsilon = std::numeric_limits::epsilon(); + + // Test various widths + for(int width = 1; width < warpExtent; width *= 2) + { + for(int idx = 0; idx < width; idx++) + { + int const off = width * (threadIdxInWarp / width); + ALPAKA_CHECK(*success, alpaka::warp::shfl(acc, threadIdxInWarp, idx, width) == idx + off); + float const ans = alpaka::warp::shfl(acc, 4.0f - float(threadIdxInWarp), idx, width); + float const expect = 4.0f - float(idx + off); + ALPAKA_CHECK(*success, alpaka::math::abs(acc, ans - expect) < epsilon); + } + } + + // Some threads quit the kernel to test that the warp operations + // properly operate on the active threads only + if(threadIdxInWarp >= warpExtent / 2) + return; + + for(int idx = 0; idx < warpExtent / 2; idx++) + { + ALPAKA_CHECK(*success, alpaka::warp::shfl(acc, threadIdxInWarp, idx) == idx); + float const ans = alpaka::warp::shfl(acc, 4.0f - float(threadIdxInWarp), idx); + float const expect = 4.0f - float(idx); + ALPAKA_CHECK(*success, alpaka::math::abs(acc, ans - expect) < epsilon); + } + } +}; + +//----------------------------------------------------------------------------- +TEMPLATE_LIST_TEST_CASE("shfl", "[warp]", alpaka::test::TestAccs) +{ + using Acc = TestType; + using Dev = alpaka::Dev; + using Pltf = alpaka::Pltf; + using Dim = alpaka::Dim; + using Idx = alpaka::Idx; + + Dev const dev(alpaka::getDevByIdx(0u)); + auto const warpExtent = alpaka::getWarpSize(dev); + if(warpExtent == 1) + { + Idx const gridThreadExtentPerDim = 4; + alpaka::test::KernelExecutionFixture fixture(alpaka::Vec::all(gridThreadExtentPerDim)); + ShflSingleThreadWarpTestKernel kernel; + REQUIRE(fixture(kernel)); + } + else + { + // Work around gcc 7.5 trying and failing to offload for OpenMP 4.0 +#if BOOST_COMP_GNUC && (BOOST_COMP_GNUC == BOOST_VERSION_NUMBER(7, 5, 0)) && defined ALPAKA_ACC_ANY_BT_OMP5_ENABLED + return; +#else + using ExecutionFixture = alpaka::test::KernelExecutionFixture; + auto const gridBlockExtent = alpaka::Vec::all(2); + // Enforce one warp per thread block + auto blockThreadExtent = alpaka::Vec::ones(); + blockThreadExtent[0] = static_cast(warpExtent); + auto const threadElementExtent = alpaka::Vec::ones(); + auto workDiv = typename ExecutionFixture::WorkDiv{gridBlockExtent, blockThreadExtent, threadElementExtent}; + auto fixture = ExecutionFixture{workDiv}; + ShflMultipleThreadWarpTestKernel kernel; + REQUIRE(fixture(kernel)); +#endif + } +}