alpaka-group · bernhardmgruber · Mar 29, 2021 · Mar 14, 2021 · Mar 14, 2021 · Mar 15, 2021
diff --git a/include/alpaka/warp/Traits.hpp b/include/alpaka/warp/Traits.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2020 Sergei Bastrakov
+/* Copyright 2020-2021 Sergei Bastrakov, David M. Rogers
  *
  * This file is part of Alpaka.
  *
@@ -49,6 +49,11 @@ namespace alpaka
             template<typename TWarp, typename TSfinae = void>
             struct Ballot;
 
+            //#############################################################################
+            //! The shfl warp swizzling trait.
+            template<typename TWarp, typename TSfinae = void>
+            struct Shfl;
+
             //#############################################################################
             //! The active mask trait.
             template<typename TWarp, typename TSfinae = void>
@@ -150,5 +155,50 @@ namespace alpaka
             using ImplementationBase = concepts::ImplementationBase<ConceptWarp, TWarp>;
             return traits::Ballot<ImplementationBase>::ballot(warp, predicate);
         }
+
+        //-----------------------------------------------------------------------------
+        //! Exchange data between threads within a warp.
+        //!
+        //! Effectively executes:
+        //!
+        //!     __shared__ int32_t values[warpsize];
+        //!     values[threadIdx.x] = value;
+        //!     __syncthreads();
+        //!     return values[(srcLane + width*floor(threadIdx.x/width))%width];
+        //!
+        //! However, it does not use shared memory.
+        //!
+        //! Notes:
+        //! * The programmer must ensure that all threads calling this
+        //!   function (and the srcLane) are executing the same line of code.
+        //!   In particular it is not portable to write if(a) {shfl} else {shfl}.
+        //!
+        //! * Commonly used with width = warpsize (the default), (returns values[srcLane])
+        //!
+        //! * Width must be a power of 2.
+        //!
+        //! \tparam TWarp   warp implementation type
+        //! \param  warp    warp implementation
+        //! \param  value   value to broadcast (only meaningful from threadIdx == srcLane)
+        //! \param  srcLane source lane sending value
+        //! \param  width   number of threads receiving a single value
+        //! \return val from the thread index srcLane.
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename TWarp>
+        ALPAKA_FN_ACC auto shfl(TWarp const& warp, std::int32_t value, std::int32_t srcLane, std::int32_t width = 0)
+        {
+            using ImplementationBase = concepts::ImplementationBase<ConceptWarp, TWarp>;
+            return traits::Shfl<ImplementationBase>::shfl(warp, value, srcLane, width ? width : getSize(warp));
+        }
+
+        //-----------------------------------------------------------------------------
+        //! shfl for float vals
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename TWarp>
+        ALPAKA_FN_ACC auto shfl(TWarp const& warp, float value, std::int32_t srcLane, std::int32_t width = 0)
+        {
+            using ImplementationBase = concepts::ImplementationBase<ConceptWarp, TWarp>;
+            return traits::Shfl<ImplementationBase>::shfl(warp, value, srcLane, width ? width : getSize(warp));
+        }
     } // namespace warp
 } // namespace alpaka
diff --git a/include/alpaka/warp/WarpSingleThread.hpp b/include/alpaka/warp/WarpSingleThread.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2020 Sergei Bastrakov
+/* Copyright 2020-2021 Sergei Bastrakov, David M. Rogers
  *
  * This file is part of Alpaka.
  *
@@ -92,6 +92,30 @@ namespace alpaka
                     return predicate ? 1u : 0u;
                 }
             };
+
+            //#################################################################
+            template<>
+            struct Shfl<WarpSingleThread>
+            {
+                //-------------------------------------------------------------
+                static auto shfl(
+                    warp::WarpSingleThread const& /*warp*/,
+                    std::int32_t val,
+                    std::int32_t /*srcLane*/,
+                    std::int32_t /*width*/)
+                {
+                    return val;
+                }
+                //-------------------------------------------------------------
+                static auto shfl(
+                    warp::WarpSingleThread const& /*warp*/,
+                    float val,
+                    std::int32_t /*srcLane*/,
+                    std::int32_t /*width*/)
+                {
+                    return val;
+                }
+            };
         } // namespace traits
     } // namespace warp
 } // namespace alpaka
diff --git a/include/alpaka/warp/WarpUniformCudaHipBuiltIn.hpp b/include/alpaka/warp/WarpUniformCudaHipBuiltIn.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2020 Sergei Bastrakov
+/* Copyright 2020-2021 Sergei Bastrakov, David M. Rogers
  *
  * This file is part of Alpaka.
  *
@@ -142,6 +142,38 @@ namespace alpaka
 #    else
                     ignore_unused(warp);
                     return __ballot(predicate);
+#    endif
+                }
+            };
+
+            //#################################################################
+            template<>
+            struct Shfl<WarpUniformCudaHipBuiltIn>
+            {
+                //-------------------------------------------------------------
+                __device__ static auto shfl(
+                    warp::WarpUniformCudaHipBuiltIn const& warp,
+                    float val,
+                    int srcLane,
+                    std::int32_t width) -> float
+                {
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+                    return __shfl_sync(activemask(warp), val, srcLane, width);
+#    else
+                    return __shfl(val, srcLane, width);
+#    endif
+                }
+                //-------------------------------------------------------------
+                __device__ static auto shfl(
+                    warp::WarpUniformCudaHipBuiltIn const& warp,
+                    std::int32_t val,
+                    int srcLane,
+                    std::int32_t width) -> std::int32_t
+                {
+#    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+                    return __shfl_sync(activemask(warp), val, srcLane, width);
+#    else
+                    return __shfl(val, srcLane, width);
 #    endif
                 }
             };

diff --git a/test/unit/warp/src/Shfl.cpp b/test/unit/warp/src/Shfl.cpp
@@ -0,0 +1,130 @@
+/* Copyright 2021 David M. Rogers
+ *
+ * This file is part of Alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#include <alpaka/test/KernelExecutionFixture.hpp>
+#include <alpaka/test/acc/TestAccs.hpp>
+#include <alpaka/test/queue/Queue.hpp>
+#include <alpaka/warp/Traits.hpp>
+
+#include <catch2/catch.hpp>
+
+#include <cstdint>
+#include <limits>
+
+//#############################################################################
+class ShflSingleThreadWarpTestKernel
+{
+public:
+    //-------------------------------------------------------------------------
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TAcc>
+    ALPAKA_FN_ACC auto operator()(TAcc const& acc, bool* success) const -> void
+    {
+        std::int32_t const warpExtent = alpaka::warp::getSize(acc);
+        ALPAKA_CHECK(*success, warpExtent == 1);
+
+        ALPAKA_CHECK(*success, alpaka::warp::shfl(acc, 12, 0) == 12);
+        ALPAKA_CHECK(*success, alpaka::warp::shfl(acc, 42, -1) == 42);
+        float ans = alpaka::warp::shfl(acc, 3.3f, 0);
+        ALPAKA_CHECK(*success, alpaka::math::abs(acc, ans - 3.3f) < 1e-8f);
+    }
+};
+
+//#############################################################################
+class ShflMultipleThreadWarpTestKernel
+{
+public:
+    //-----------------------------------------------------------------------------
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TAcc>
+    ALPAKA_FN_ACC auto operator()(TAcc const& acc, bool* success) const -> void
+    {
+        auto const localThreadIdx = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc);
+        auto const blockExtent = alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc);
+        std::int32_t const warpExtent = alpaka::warp::getSize(acc);
+        // Test relies on having a single warp per thread block
+        ALPAKA_CHECK(*success, static_cast<std::int32_t>(blockExtent.prod()) == warpExtent);
+        auto const threadIdxInWarp = std::int32_t(alpaka::mapIdx<1u>(localThreadIdx, blockExtent)[0]);
+
+        ALPAKA_CHECK(*success, warpExtent > 1);
+
+        ALPAKA_CHECK(*success, alpaka::warp::shfl(acc, 42, 0) == 42);
+        ALPAKA_CHECK(*success, alpaka::warp::shfl(acc, threadIdxInWarp, 0) == 0);
+        ALPAKA_CHECK(*success, alpaka::warp::shfl(acc, threadIdxInWarp, 1) == 1);
+        // Note the CUDA and HIP API-s differ on lane wrapping, but both agree it should not segfault
+        // https://github.com/ROCm-Developer-Tools/HIP-CPU/issues/14
+        ALPAKA_CHECK(*success, alpaka::warp::shfl(acc, 5, -1) == 5);
+
+        auto const epsilon = std::numeric_limits<float>::epsilon();
+
+        // Test various widths
+        for(int width = 1; width < warpExtent; width *= 2)
+        {
+            for(int idx = 0; idx < width; idx++)
+            {
+                int const off = width * (threadIdxInWarp / width);
+                ALPAKA_CHECK(*success, alpaka::warp::shfl(acc, threadIdxInWarp, idx, width) == idx + off);
+                float const ans = alpaka::warp::shfl(acc, 4.0f - float(threadIdxInWarp), idx, width);
+                float const expect = 4.0f - float(idx + off);
+                ALPAKA_CHECK(*success, alpaka::math::abs(acc, ans - expect) < epsilon);
+            }
+        }
+
+        // Some threads quit the kernel to test that the warp operations
+        // properly operate on the active threads only
+        if(threadIdxInWarp >= warpExtent / 2)
+            return;
+
+        for(int idx = 0; idx < warpExtent / 2; idx++)
+        {
+            ALPAKA_CHECK(*success, alpaka::warp::shfl(acc, threadIdxInWarp, idx) == idx);
+            float const ans = alpaka::warp::shfl(acc, 4.0f - float(threadIdxInWarp), idx);
+            float const expect = 4.0f - float(idx);
+            ALPAKA_CHECK(*success, alpaka::math::abs(acc, ans - expect) < epsilon);
+        }
+    }
+};
+
+//-----------------------------------------------------------------------------
+TEMPLATE_LIST_TEST_CASE("shfl", "[warp]", alpaka::test::TestAccs)
+{
+    using Acc = TestType;
+    using Dev = alpaka::Dev<Acc>;
+    using Pltf = alpaka::Pltf<Dev>;
+    using Dim = alpaka::Dim<Acc>;
+    using Idx = alpaka::Idx<Acc>;
+
+    Dev const dev(alpaka::getDevByIdx<Pltf>(0u));
+    auto const warpExtent = alpaka::getWarpSize(dev);
+    if(warpExtent == 1)
+    {
+        Idx const gridThreadExtentPerDim = 4;
+        alpaka::test::KernelExecutionFixture<Acc> fixture(alpaka::Vec<Dim, Idx>::all(gridThreadExtentPerDim));
+        ShflSingleThreadWarpTestKernel kernel;
+        REQUIRE(fixture(kernel));
+    }
+    else
+    {
+        // Work around gcc 7.5 trying and failing to offload for OpenMP 4.0
+#if BOOST_COMP_GNUC && (BOOST_COMP_GNUC == BOOST_VERSION_NUMBER(7, 5, 0)) && defined ALPAKA_ACC_ANY_BT_OMP5_ENABLED
+        return;
+#else
+        using ExecutionFixture = alpaka::test::KernelExecutionFixture<Acc>;
+        auto const gridBlockExtent = alpaka::Vec<Dim, Idx>::all(2);
+        // Enforce one warp per thread block
+        auto blockThreadExtent = alpaka::Vec<Dim, Idx>::ones();
+        blockThreadExtent[0] = static_cast<Idx>(warpExtent);
+        auto const threadElementExtent = alpaka::Vec<Dim, Idx>::ones();
+        auto workDiv = typename ExecutionFixture::WorkDiv{gridBlockExtent, blockThreadExtent, threadElementExtent};
+        auto fixture = ExecutionFixture{workDiv};
+        ShflMultipleThreadWarpTestKernel kernel;
+        REQUIRE(fixture(kernel));
+#endif
+    }
+}