diff --git a/include/alpaka/warp/Traits.hpp b/include/alpaka/warp/Traits.hpp
index 15b66b14619c..0671e89ebeb9 100644
--- a/include/alpaka/warp/Traits.hpp
+++ b/include/alpaka/warp/Traits.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2022 Sergei Bastrakov, David M. Rogers, Bernhard Manfred Gruber
+/* Copyright 2022 Sergei Bastrakov, David M. Rogers, Bernhard Manfred Gruber, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
@@ -39,6 +39,18 @@ namespace alpaka::warp
         template<typename TImpl, typename TSfinae = void>
         struct Shfl;
 
+        //! The shfl up warp swizzling trait.
+        template<typename TImpl, typename TSfinae = void>
+        struct ShflUp;
+
+        //! The shfl down warp swizzling trait.
+        template<typename TImpl, typename TSfinae = void>
+        struct ShflDown;
+
+        //! The shfl xor warp swizzling trait.
+        template<typename TImpl, typename TSfinae = void>
+        struct ShflXor;
+
         //! The active mask trait.
         template<typename TImpl, typename TSfinae = void>
         struct Activemask;
@@ -162,7 +174,7 @@ namespace alpaka::warp
     //!     __shared__ int32_t values[warpsize];
     //!     values[threadIdx.x] = value;
     //!     __syncthreads();
-    //!     return values[(srcLane + width*floor(threadIdx.x/width))%width];
+    //!     return values[width*(threadIdx.x/width) + srcLane%width];
     //!
     //! However, it does not use shared memory.
     //!
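(Reviewer note on the hunk above, not part of the patch: the corrected doc formula can be sanity-checked on the host. The helper below merely restates the index arithmetic; the names shflSrc, lane, srcLane and width are illustrative and mirror the doc comment.)

#include <cstdint>

// Source lane selected by shfl for a caller at index `lane`: the segment
// base (width * (lane / width)) plus srcLane wrapped into the segment.
constexpr std::int32_t shflSrc(std::int32_t lane, std::int32_t srcLane, std::int32_t width)
{
    return width * (lane / width) + srcLane % width;
}

// 32 lanes split into two segments of width 16:
static_assert(shflSrc(3, 4, 16) == 4); // first segment reads lane 4
static_assert(shflSrc(19, 4, 16) == 20); // second segment reads lane 16 + 4
// The old formula, (srcLane + width*floor(threadIdx.x/width)) % width, wrapped
// the result back into [0, width) and so lost the segment base entirely.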
@@ -182,19 +194,123 @@ namespace alpaka::warp
     //! \param width number of threads receiving a single value
     //! \return val from the thread index srcLane.
     ALPAKA_NO_HOST_ACC_WARNING
-    template<typename TWarp>
-    ALPAKA_FN_ACC auto shfl(TWarp const& warp, std::int32_t value, std::int32_t srcLane, std::int32_t width = 0)
+    template<typename TWarp, typename T>
+    ALPAKA_FN_ACC auto shfl(TWarp const& warp, T value, std::int32_t srcLane, std::int32_t width = 0)
     {
         using ImplementationBase = concepts::ImplementationBase<ConceptWarp, TWarp>;
         return trait::Shfl<ImplementationBase>::shfl(warp, value, srcLane, width ? width : getSize(warp));
     }
 
-    //! shfl for float vals
+    //! Exchange data between threads within a warp.
+    //! It copies from a lane with lower ID relative to the caller.
+    //! The lane ID is calculated by subtracting delta from the caller’s lane ID.
+    //!
+    //! Effectively executes:
+    //!
+    //!     __shared__ int32_t values[warpsize];
+    //!     values[threadIdx.x] = value;
+    //!     __syncthreads();
+    //!     return values[width*(threadIdx.x/width) + threadIdx.x%width - delta];
+    //!
+    //! However, it does not use shared memory.
+    //!
+    //! Notes:
+    //!  * The programmer must ensure that all threads calling this
+    //!    function (and the source lane) are executing the same line of code.
+    //!    In particular it is not portable to write if(a) {shfl} else {shfl}.
+    //!
+    //!  * Commonly used with width = warpsize (the default); returns
+    //!    values[threadIdx.x - delta] if threadIdx.x >= delta.
+    //!
+    //!  * Width must be a power of 2.
+    //!
+    //! \tparam TWarp warp implementation type
+    //! \tparam T value type
+    //! \param warp warp implementation
+    //! \param value value to exchange
+    //! \param offset corresponds to the delta used to compute the lane ID
+    //! \param width number of threads receiving a single value
+    //! \return val from the source lane ID.
     ALPAKA_NO_HOST_ACC_WARNING
-    template<typename TWarp>
-    ALPAKA_FN_ACC auto shfl(TWarp const& warp, float value, std::int32_t srcLane, std::int32_t width = 0)
+    template<typename TWarp, typename T>
+    ALPAKA_FN_ACC auto shfl_up(TWarp const& warp, T value, std::uint32_t offset, std::int32_t width = 0)
     {
         using ImplementationBase = concepts::ImplementationBase<ConceptWarp, TWarp>;
-        return trait::Shfl<ImplementationBase>::shfl(warp, value, srcLane, width ? width : getSize(warp));
+        return trait::ShflUp<ImplementationBase>::shfl_up(warp, value, offset, width ? width : getSize(warp));
+    }
+
+    //! Exchange data between threads within a warp.
+    //! It copies from a lane with higher ID relative to the caller.
+    //! The lane ID is calculated by adding delta to the caller’s lane ID.
+    //!
+    //! Effectively executes:
+    //!
+    //!     __shared__ int32_t values[warpsize];
+    //!     values[threadIdx.x] = value;
+    //!     __syncthreads();
+    //!     return values[width*(threadIdx.x/width) + threadIdx.x%width + delta];
+    //!
+    //! However, it does not use shared memory.
+    //!
+    //! Notes:
+    //!  * The programmer must ensure that all threads calling this
+    //!    function (and the source lane) are executing the same line of code.
+    //!    In particular it is not portable to write if(a) {shfl} else {shfl}.
+    //!
+    //!  * Commonly used with width = warpsize (the default); returns
+    //!    values[threadIdx.x + delta] if threadIdx.x + delta < warpsize.
+    //!
+    //!  * Width must be a power of 2.
+    //!
+    //! \tparam TWarp warp implementation type
+    //! \tparam T value type
+    //! \param warp warp implementation
+    //! \param value value to exchange
+    //! \param offset corresponds to the delta used to compute the lane ID
+    //! \param width number of threads receiving a single value
+    //! \return val from the source lane ID.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TWarp, typename T>
+    ALPAKA_FN_ACC auto shfl_down(TWarp const& warp, T value, std::uint32_t offset, std::int32_t width = 0)
+    {
+        using ImplementationBase = concepts::ImplementationBase<ConceptWarp, TWarp>;
+        return trait::ShflDown<ImplementationBase>::shfl_down(warp, value, offset, width ? width : getSize(warp));
+    }
+
+    //! Exchange data between threads within a warp.
+    //! It copies from a lane based on a bitwise XOR of the caller’s own lane ID.
+    //! The lane ID is calculated by performing a bitwise XOR of the caller’s lane ID with delta.
+    //!
+    //! Effectively executes:
+    //!
+    //!     __shared__ int32_t values[warpsize];
+    //!     values[threadIdx.x] = value;
+    //!     __syncthreads();
+    //!     return values[width*(threadIdx.x/width) + (threadIdx.x%width ^ delta)];
+    //!
+    //! However, it does not use shared memory.
+    //!
+    //! Notes:
+    //!  * The programmer must ensure that all threads calling this
+    //!    function (and the source lane) are executing the same line of code.
+    //!    In particular it is not portable to write if(a) {shfl} else {shfl}.
+    //!
+    //!  * Commonly used with width = warpsize (the default); returns values[threadIdx.x ^ delta].
+    //!
+    //!  * Width must be a power of 2.
+    //!
+    //! \tparam TWarp warp implementation type
+    //! \tparam T value type
+    //! \param warp warp implementation
+    //! \param value value to exchange
+    //! \param offset corresponds to the delta used to compute the lane ID
+    //! \param width number of threads receiving a single value
+    //! \return val from the source lane ID.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TWarp, typename T>
+    ALPAKA_FN_ACC auto shfl_xor(TWarp const& warp, T value, std::int32_t offset, std::int32_t width = 0)
+    {
+        using ImplementationBase = concepts::ImplementationBase<ConceptWarp, TWarp>;
+        return trait::ShflXor<ImplementationBase>::shfl_xor(warp, value, offset, width ? width : getSize(warp));
     }
 } // namespace alpaka::warp
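(Reviewer note, not part of the patch: a minimal usage sketch of the new public API. It assumes one warp per block, so the block-local thread index equals the lane ID; the kernel name and the output buffer are illustrative. Because shfl_up hands back the caller's own value when the source lane would leave the warp, see the backend specializations below, the accumulation is guarded with lane >= d.)

#include <alpaka/alpaka.hpp>
#include <cstdint>

struct WarpScanKernel // illustrative name
{
    template<typename TAcc>
    ALPAKA_FN_ACC void operator()(TAcc const& acc, std::int32_t* out) const
    {
        auto const lane = static_cast<std::uint32_t>(alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0]);
        auto const warpExtent = alpaka::warp::getSize(acc);

        // Inclusive prefix sum across the warp: step d pulls the value d lanes below.
        std::int32_t scan = 1; // per-lane contribution, illustrative
        for(std::uint32_t d = 1u; d < static_cast<std::uint32_t>(warpExtent); d *= 2u)
        {
            std::int32_t const partial = alpaka::warp::shfl_up(acc, scan, d);
            if(lane >= d)
                scan += partial;
        }

        // Butterfly reduction: after log2(warpExtent) XOR steps every lane
        // holds the sum over the whole warp.
        std::int32_t sum = scan;
        for(std::int32_t mask = warpExtent / 2; mask > 0; mask /= 2)
            sum += alpaka::warp::shfl_xor(acc, sum, mask);

        out[lane] = sum;
    }
};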
diff --git a/include/alpaka/warp/WarpGenericSycl.hpp b/include/alpaka/warp/WarpGenericSycl.hpp
index e420b615585b..7f7b838edf96 100644
--- a/include/alpaka/warp/WarpGenericSycl.hpp
+++ b/include/alpaka/warp/WarpGenericSycl.hpp
@@ -114,7 +114,6 @@ namespace alpaka::warp::trait
         static auto shfl(warp::WarpGenericSycl<TDim> const& warp, T value, std::int32_t srcLane, std::int32_t width)
         {
             ALPAKA_ASSERT_OFFLOAD(width > 0);
-            ALPAKA_ASSERT_OFFLOAD(srcLane < width);
             ALPAKA_ASSERT_OFFLOAD(srcLane >= 0);
 
             /* If width < srcLane the sub-group needs to be split into assumed subdivisions. The first item of each
@@ -132,6 +131,68 @@ namespace alpaka::warp::trait
             return sycl::select_from_group(actual_group, value, src);
         }
     };
+
+    template<typename TDim>
+    struct ShflUp<warp::WarpGenericSycl<TDim>>
+    {
+        template<typename T>
+        static auto shfl_up(
+            warp::WarpGenericSycl<TDim> const& warp,
+            T value,
+            std::uint32_t offset, /* must be the same for all work-items in the group */
+            std::int32_t width)
+        {
+            std::int32_t offset_int = static_cast<std::int32_t>(offset);
+            auto const actual_group = warp.m_item_warp.get_sub_group();
+            auto actual_item_id = static_cast<std::int32_t>(actual_group.get_local_linear_id());
+            auto const actual_group_id = actual_item_id / width;
+            auto const actual_src_id = actual_item_id - offset_int;
+            auto const src = actual_src_id >= actual_group_id * width
+                                 ? sycl::id<1>{static_cast<std::size_t>(actual_src_id)}
+                                 : sycl::id<1>{static_cast<std::size_t>(actual_item_id)};
+            return sycl::select_from_group(actual_group, value, src);
+        }
+    };
+
+    template<typename TDim>
+    struct ShflDown<warp::WarpGenericSycl<TDim>>
+    {
+        template<typename T>
+        static auto shfl_down(
+            warp::WarpGenericSycl<TDim> const& warp,
+            T value,
+            std::uint32_t offset,
+            std::int32_t width)
+        {
+            std::int32_t offset_int = static_cast<std::int32_t>(offset);
+            auto const actual_group = warp.m_item_warp.get_sub_group();
+            auto actual_item_id = static_cast<std::int32_t>(actual_group.get_local_linear_id());
+            auto const actual_group_id = actual_item_id / width;
+            auto const actual_src_id = actual_item_id + offset_int;
+            auto const src = actual_src_id < (actual_group_id + 1) * width
+                                 ? sycl::id<1>{static_cast<std::size_t>(actual_src_id)}
+                                 : sycl::id<1>{static_cast<std::size_t>(actual_item_id)};
+            return sycl::select_from_group(actual_group, value, src);
+        }
+    };
+
+    template<typename TDim>
+    struct ShflXor<warp::WarpGenericSycl<TDim>>
+    {
+        template<typename T>
+        static auto shfl_xor(
+            warp::WarpGenericSycl<TDim> const& warp,
+            T value,
+            std::int32_t offset,
+            std::int32_t /*width*/)
+        {
+            auto const actual_group = warp.m_item_warp.get_sub_group();
+            auto actual_item_id = static_cast<std::int32_t>(actual_group.get_local_linear_id());
+            auto const actual_src_id = actual_item_id ^ offset;
+            auto const src = sycl::id<1>{static_cast<std::size_t>(actual_src_id)};
+            return sycl::select_from_group(actual_group, value, src);
+        }
+    };
 } // namespace alpaka::warp::trait
 
#endif
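(Reviewer note, not part of the patch: the boundary clamping in the SYCL ShflUp/ShflDown specializations above, restated as host-side helpers so it can be checked at compile time. The helper names are illustrative.)

#include <cstdint>

// Mirrors ShflUp: fall back to the caller's own index when the shifted
// source would leave the caller's width-sized subdivision.
constexpr std::int32_t shflUpSrc(std::int32_t item, std::int32_t offset, std::int32_t width)
{
    std::int32_t const base = (item / width) * width;
    std::int32_t const src = item - offset;
    return src >= base ? src : item;
}

// Mirrors ShflDown: the source must stay below the subdivision's upper bound.
constexpr std::int32_t shflDownSrc(std::int32_t item, std::int32_t offset, std::int32_t width)
{
    std::int32_t const src = item + offset;
    return src < (item / width) * width + width ? src : item;
}

static_assert(shflUpSrc(5, 2, 16) == 3); // in range: read two lanes below
static_assert(shflUpSrc(17, 4, 16) == 17); // lane 13 lies outside segment [16, 32): keep own value
static_assert(shflDownSrc(14, 4, 16) == 14); // lane 18 lies outside segment [0, 16): keep own value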
diff --git a/include/alpaka/warp/WarpSingleThread.hpp b/include/alpaka/warp/WarpSingleThread.hpp
index 950dd8286161..d2713032551d 100644
--- a/include/alpaka/warp/WarpSingleThread.hpp
+++ b/include/alpaka/warp/WarpSingleThread.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2022 Sergei Bastrakov, David M. Rogers, Bernhard Manfred Gruber
+/* Copyright 2022 Sergei Bastrakov, David M. Rogers, Bernhard Manfred Gruber, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
@@ -65,18 +65,52 @@ namespace alpaka::warp
         template<>
         struct Shfl<WarpSingleThread>
         {
+            template<typename T>
             static auto shfl(
                 warp::WarpSingleThread const& /*warp*/,
-                std::int32_t val,
+                T val,
                 std::int32_t /*srcLane*/,
                 std::int32_t /*width*/)
             {
                 return val;
             }
+        };
 
-            static auto shfl(
+        template<>
+        struct ShflUp<WarpSingleThread>
+        {
+            template<typename T>
+            static auto shfl_up(
+                warp::WarpSingleThread const& /*warp*/,
+                T val,
+                std::uint32_t /*offset*/,
+                std::int32_t /*width*/)
+            {
+                return val;
+            }
+        };
+
+        template<>
+        struct ShflDown<WarpSingleThread>
+        {
+            template<typename T>
+            static auto shfl_down(
+                warp::WarpSingleThread const& /*warp*/,
+                T val,
+                std::uint32_t /*offset*/,
+                std::int32_t /*width*/)
+            {
+                return val;
+            }
+        };
+
+        template<>
+        struct ShflXor<WarpSingleThread>
+        {
+            template<typename T>
+            static auto shfl_xor(
                 warp::WarpSingleThread const& /*warp*/,
-                float val,
+                T val,
                 std::int32_t /*srcLane*/,
                 std::int32_t /*width*/)
             {
diff --git a/include/alpaka/warp/WarpUniformCudaHipBuiltIn.hpp b/include/alpaka/warp/WarpUniformCudaHipBuiltIn.hpp
index 1e7d98d82d7d..ec0785c6999b 100644
--- a/include/alpaka/warp/WarpUniformCudaHipBuiltIn.hpp
+++ b/include/alpaka/warp/WarpUniformCudaHipBuiltIn.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2022 Sergei Bastrakov, David M. Rogers, Jan Stephan, Andrea Bocci, Bernhard Manfred Gruber
+/* Copyright 2022 Sergei Bastrakov, David M. Rogers, Jan Stephan, Andrea Bocci, Bernhard Manfred Gruber, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
@@ -113,12 +113,12 @@ namespace alpaka::warp
         template<>
         struct Shfl<WarpUniformCudaHipBuiltIn>
         {
-            //-------------------------------------------------------------
+            template<typename T>
             __device__ static auto shfl(
                 [[maybe_unused]] warp::WarpUniformCudaHipBuiltIn const& warp,
-                float val,
+                T val,
                 int srcLane,
-                std::int32_t width) -> float
+                std::int32_t width) -> T
             {
#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
                 return __shfl_sync(activemask(warp), val, srcLane, width);
@@ -126,21 +126,62 @@ namespace alpaka::warp
#    else
                 return __shfl(val, srcLane, width);
#    endif
             }
+        };
 
-            //-------------------------------------------------------------
-            __device__ static auto shfl(
+        template<>
+        struct ShflUp<WarpUniformCudaHipBuiltIn>
+        {
+            template<typename T>
+            __device__ static auto shfl_up(
                 [[maybe_unused]] warp::WarpUniformCudaHipBuiltIn const& warp,
-                std::int32_t val,
-                int srcLane,
-                std::int32_t width) -> std::int32_t
+                T val,
+                std::uint32_t offset,
+                std::int32_t width) -> T
             {
#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
-                return __shfl_sync(activemask(warp), val, srcLane, width);
+                return __shfl_up_sync(activemask(warp), val, offset, width);
#    else
-                return __shfl(val, srcLane, width);
+                return __shfl_up(val, offset, width);
+#    endif
+            }
+        };
+
+        template<>
+        struct ShflDown<WarpUniformCudaHipBuiltIn>
+        {
+            template<typename T>
+            __device__ static auto shfl_down(
+                [[maybe_unused]] warp::WarpUniformCudaHipBuiltIn const& warp,
+                T val,
+                std::uint32_t offset,
+                std::int32_t width) -> T
+            {
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+                return __shfl_down_sync(activemask(warp), val, offset, width);
+#    else
+                return __shfl_down(val, offset, width);
+#    endif
+            }
+        };
+
+        template<>
+        struct ShflXor<WarpUniformCudaHipBuiltIn>
+        {
+            template<typename T>
+            __device__ static auto shfl_xor(
+                [[maybe_unused]] warp::WarpUniformCudaHipBuiltIn const& warp,
+                T val,
+                std::int32_t offset,
+                std::int32_t width) -> T
+            {
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+                return __shfl_xor_sync(activemask(warp), val, offset, width);
+#    else
+                return __shfl_xor(val, offset, width);
#    endif
             }
         };
+
     } // namespace trait
#    endif
} // namespace alpaka::warp
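(Reviewer note, not part of the patch: on the single-threaded backend every warp has exactly one lane, so all three new shuffles are identities, each specialization simply returns val. A device-side sanity check could look like the fragment below; the function name is illustrative. Note that a float argument now compiles on every backend, because the traits are templated on T instead of being overloaded for int32 and float only.)

#include <alpaka/alpaka.hpp>

// Only meaningful where the warp size is 1, e.g. the serial CPU accelerator.
template<typename TAcc>
ALPAKA_FN_ACC void checkSingleThreadedShuffles(TAcc const& acc)
{
    float const v = 1.5f;
    ALPAKA_ASSERT_OFFLOAD(alpaka::warp::shfl_up(acc, v, 1u) == v);
    ALPAKA_ASSERT_OFFLOAD(alpaka::warp::shfl_down(acc, v, 1u) == v);
    ALPAKA_ASSERT_OFFLOAD(alpaka::warp::shfl_xor(acc, v, 1) == v);
}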