diff --git a/include/alpaka/warp/Traits.hpp b/include/alpaka/warp/Traits.hpp
index 15b66b14619c..f4cfb4d6bdc6 100644
--- a/include/alpaka/warp/Traits.hpp
+++ b/include/alpaka/warp/Traits.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2022 Sergei Bastrakov, David M. Rogers, Bernhard Manfred Gruber
+/* Copyright 2022 Sergei Bastrakov, David M. Rogers, Bernhard Manfred Gruber, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -39,6 +39,18 @@ namespace alpaka::warp
         template<typename TWarp, typename TSfinae = void>
         struct Shfl;
 
+        //! The shfl up warp swizzling trait.
+        template<typename TWarp, typename TSfinae = void>
+        struct ShflUp;
+
+        //! The shfl down warp swizzling trait.
+        template<typename TWarp, typename TSfinae = void>
+        struct ShflDown;
+
+        //! The shfl xor warp swizzling trait.
+        template<typename TWarp, typename TSfinae = void>
+        struct ShflXor;
+
         //! The active mask trait.
         template<typename TWarp, typename TSfinae = void>
         struct Activemask;
@@ -162,7 +174,7 @@ namespace alpaka::warp
     //!     __shared__ int32_t values[warpsize];
     //!     values[threadIdx.x] = value;
     //!     __syncthreads();
-    //!     return values[(srcLane + width*floor(threadIdx.x/width))%width];
+    //!     return values[width*(threadIdx.x/width) + srcLane%width];
     //!
     //! However, it does not use shared memory.
     //!
@@ -182,19 +194,124 @@ namespace alpaka::warp
     //! \param  width   number of threads receiving a single value
     //! \return val from the thread index srcLane.
     ALPAKA_NO_HOST_ACC_WARNING
-    template<typename TWarp>
-    ALPAKA_FN_ACC auto shfl(TWarp const& warp, std::int32_t value, std::int32_t srcLane, std::int32_t width = 0)
+    template<typename TWarp, typename T>
+    ALPAKA_FN_ACC auto shfl(TWarp const& warp, T value, std::int32_t srcLane, std::int32_t width = 0)
     {
         using ImplementationBase = concepts::ImplementationBase<ConceptWarp, TWarp>;
         return trait::Shfl<ImplementationBase>::shfl(warp, value, srcLane, width ? width : getSize(warp));
     }
 
-    //! shfl for float vals
+    //! Exchange data between threads within a warp.
+    //! It copies from a lane with lower ID relative to caller.
+    //! The lane ID is calculated by subtracting delta from the caller’s lane ID.
+    //!
+    //! Effectively executes:
+    //!
+    //!     __shared__ int32_t values[warpsize];
+    //!     values[threadIdx.x] = value;
+    //!     __syncthreads();
+    //!     return (threadIdx.x % width >= delta) ? values[threadIdx.x - delta] : values[threadIdx.x];
+    //!
+    //! However, it does not use shared memory.
+    //!
+    //! Notes:
+    //! * The programmer must ensure that all threads calling this
+    //!   function (and the srcLane) are executing the same line of code.
+    //!   In particular it is not portable to write if(a) {shfl} else {shfl}.
+    //!
+    //! * Commonly used with width = warpsize (the default), (returns values[threadIdx.x - delta] if threadIdx.x >=
+    //! delta)
+    //!
+    //! * Width must be a power of 2.
+    //!
+    //! \tparam TWarp   warp implementation type
+    //! \tparam T       value type
+    //! \param  warp    warp implementation
+    //! \param  value   value to broadcast
+    //! \param  offset  corresponds to the delta used to compute the lane ID
+    //! \param  width   size of the group participating in the shuffle operation
+    //! \return val from the thread index lane ID.
     ALPAKA_NO_HOST_ACC_WARNING
-    template<typename TWarp>
-    ALPAKA_FN_ACC auto shfl(TWarp const& warp, float value, std::int32_t srcLane, std::int32_t width = 0)
+    template<typename TWarp, typename T>
+    ALPAKA_FN_ACC auto shfl_up(TWarp const& warp, T value, std::uint32_t offset, std::int32_t width = 0)
     {
         using ImplementationBase = concepts::ImplementationBase<ConceptWarp, TWarp>;
-        return trait::Shfl<ImplementationBase>::shfl(warp, value, srcLane, width ? width : getSize(warp));
+        return trait::ShflUp<ImplementationBase>::shfl_up(warp, value, offset, width ? width : getSize(warp));
+    }
+
+    //! Exchange data between threads within a warp.
+    //! It copies from a lane with higher ID relative to caller.
+    //! The lane ID is calculated by adding delta to the caller’s lane ID.
+    //!
+    //! Effectively executes:
+    //!
+    //!     __shared__ int32_t values[warpsize];
+    //!     values[threadIdx.x] = value;
+    //!     __syncthreads();
+    //!     return (threadIdx.x % width + delta < width) ? values[threadIdx.x + delta] : values[threadIdx.x];
+    //!
+    //! However, it does not use shared memory.
+    //!
+    //! Notes:
+    //! * The programmer must ensure that all threads calling this
+    //!   function (and the srcLane) are executing the same line of code.
+    //!   In particular it is not portable to write if(a) {shfl} else {shfl}.
+    //!
+    //! * Commonly used with width = warpsize (the default), (returns values[threadIdx.x+delta] if threadIdx.x+delta <
+    //! warpsize)
+    //!
+    //! * Width must be a power of 2.
+    //!
+    //! \tparam TWarp   warp implementation type
+    //! \tparam T       value type
+    //! \param  warp    warp implementation
+    //! \param  value   value to broadcast
+    //! \param  offset  corresponds to the delta used to compute the lane ID
+    //! \param  width   size of the group participating in the shuffle operation
+    //! \return val from the thread index lane ID.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TWarp, typename T>
+    ALPAKA_FN_ACC auto shfl_down(TWarp const& warp, T value, std::uint32_t offset, std::int32_t width = 0)
+    {
+        using ImplementationBase = concepts::ImplementationBase<ConceptWarp, TWarp>;
+        return trait::ShflDown<ImplementationBase>::shfl_down(warp, value, offset, width ? width : getSize(warp));
+    }
+
+    //! Exchange data between threads within a warp.
+    //! It copies from a lane based on bitwise XOR of own lane ID.
+    //! The lane ID is calculated by performing a bitwise XOR of the caller’s lane ID with mask
+    //!
+    //! Effectively executes:
+    //!
+    //!     __shared__ int32_t values[warpsize];
+    //!     values[threadIdx.x] = value;
+    //!     __syncthreads();
+    //!     int lane = threadIdx.x ^ mask;
+    //!     return values[lane / width > threadIdx.x / width ? threadIdx.x : lane];
+    //!
+    //! However, it does not use shared memory.
+    //!
+    //! Notes:
+    //! * The programmer must ensure that all threads calling this
+    //!   function (and the srcLane) are executing the same line of code.
+    //!   In particular it is not portable to write if(a) {shfl} else {shfl}.
+    //!
+    //! * Commonly used with width = warpsize (the default), (returns values[threadIdx.x^mask])
+    //!
+    //! * Width must be a power of 2.
+    //!
+    //! \tparam TWarp   warp implementation type
+    //! \tparam T       value type
+    //! \param  warp    warp implementation
+    //! \param  value   value to broadcast
+    //! \param  mask    corresponds to the mask used to compute the lane ID
+    //! \param  width   size of the group participating in the shuffle operation
+    //! \return val from the thread index lane ID.
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TWarp, typename T>
+    ALPAKA_FN_ACC auto shfl_xor(TWarp const& warp, T value, std::int32_t mask, std::int32_t width = 0)
+    {
+        using ImplementationBase = concepts::ImplementationBase<ConceptWarp, TWarp>;
+        return trait::ShflXor<ImplementationBase>::shfl_xor(warp, value, mask, width ? width : getSize(warp));
     }
 } // namespace alpaka::warp
diff --git a/include/alpaka/warp/WarpGenericSycl.hpp b/include/alpaka/warp/WarpGenericSycl.hpp
index e420b615585b..425d97a25859 100644
--- a/include/alpaka/warp/WarpGenericSycl.hpp
+++ b/include/alpaka/warp/WarpGenericSycl.hpp
@@ -1,5 +1,11 @@
 /* Copyright 2023 Jan Stephan, Luca Ferragina, Andrea Bocci, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
+ *
+ * The implementations of Shfl::shfl(), ShflUp::shfl_up(), ShflDown::shfl_down() and ShflXor::shfl_xor() are derived
+ * from Intel DPCT.
+ * Copyright (C) Intel Corporation.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ * See https://llvm.org/LICENSE.txt for license information.
  */
 
 #pragma once
@@ -114,7 +120,6 @@ namespace alpaka::warp::trait
         static auto shfl(warp::WarpGenericSycl<TDim> const& warp, T value, std::int32_t srcLane, std::int32_t width)
         {
             ALPAKA_ASSERT_OFFLOAD(width > 0);
-            ALPAKA_ASSERT_OFFLOAD(srcLane < width);
             ALPAKA_ASSERT_OFFLOAD(srcLane >= 0);
 
             /* If width < srcLane the sub-group needs to be split into assumed subdivisions. The first item of each
@@ -124,12 +129,70 @@ namespace alpaka::warp::trait
                The first starts at sub-group index 0 and the second at sub-group index 16. For srcLane = 4 the
                first subdivision will access the value at sub-group index 4 and the second at sub-group index 20. */
             auto const actual_group = warp.m_item_warp.get_sub_group();
-            auto const actual_item_id = static_cast<std::int32_t>(actual_group.get_local_linear_id());
-            auto const actual_group_id = actual_item_id / width;
-            auto const actual_src_id = static_cast<std::size_t>(srcLane + actual_group_id * width);
-            auto const src = sycl::id<1>{actual_src_id};
+            std::uint32_t const w = static_cast<std::uint32_t>(width);
+            std::uint32_t const start_index = actual_group.get_local_linear_id() / w * w;
+            return sycl::select_from_group(actual_group, value, start_index + static_cast<std::uint32_t>(srcLane) % w);
+        }
+    };
+
+    template<typename TDim>
+    struct ShflUp<warp::WarpGenericSycl<TDim>>
+    {
+        template<typename T>
+        static auto shfl_up(
+            warp::WarpGenericSycl<TDim> const& warp,
+            T value,
+            std::uint32_t offset, /* must be the same for all work-items in the group */
+            std::int32_t width)
+        {
+            auto const actual_group = warp.m_item_warp.get_sub_group();
+            std::uint32_t const w = static_cast<std::uint32_t>(width);
+            std::uint32_t const id = actual_group.get_local_linear_id();
+            std::uint32_t const start_index = id / w * w;
+            T result = sycl::shift_group_right(actual_group, value, offset);
+            if((id - start_index) < offset)
+            {
+                result = value;
+            }
+            return result;
+        }
+    };
 
-            return sycl::select_from_group(actual_group, value, src);
+    template<typename TDim>
+    struct ShflDown<warp::WarpGenericSycl<TDim>>
+    {
+        template<typename T>
+        static auto shfl_down(
+            warp::WarpGenericSycl<TDim> const& warp,
+            T value,
+            std::uint32_t offset,
+            std::int32_t width)
+        {
+            auto const actual_group = warp.m_item_warp.get_sub_group();
+            std::uint32_t const w = static_cast<std::uint32_t>(width);
+            std::uint32_t const id = actual_group.get_local_linear_id();
+            std::uint32_t const end_index = (id / w + 1) * w;
+            T result = sycl::shift_group_left(actual_group, value, offset);
+            if((id + offset) >= end_index)
+            {
+                result = value;
+            }
+            return result;
+        }
+    };
+
+    template<typename TDim>
+    struct ShflXor<warp::WarpGenericSycl<TDim>>
+    {
+        template<typename T>
+        static auto shfl_xor(warp::WarpGenericSycl<TDim> const& warp, T value, std::int32_t mask, std::int32_t width)
+        {
+            auto const actual_group = warp.m_item_warp.get_sub_group();
+            std::uint32_t const w = static_cast<std::uint32_t>(width);
+            std::uint32_t const id = actual_group.get_local_linear_id();
+            std::uint32_t const start_index = id / w * w;
+            std::uint32_t const target_offset = (id % w) ^ static_cast<std::uint32_t>(mask);
+            return sycl::select_from_group(actual_group, value, target_offset < w ? start_index + target_offset : id);
         }
     };
 } // namespace alpaka::warp::trait
diff --git a/include/alpaka/warp/WarpSingleThread.hpp b/include/alpaka/warp/WarpSingleThread.hpp
index 950dd8286161..d2713032551d 100644
--- a/include/alpaka/warp/WarpSingleThread.hpp
+++ b/include/alpaka/warp/WarpSingleThread.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2022 Sergei Bastrakov, David M. Rogers, Bernhard Manfred Gruber
+/* Copyright 2022 Sergei Bastrakov, David M. Rogers, Bernhard Manfred Gruber, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -65,18 +65,52 @@ namespace alpaka::warp
         template<>
         struct Shfl<WarpSingleThread>
         {
+            template<typename T>
             static auto shfl(
                 warp::WarpSingleThread const& /*warp*/,
-                std::int32_t val,
+                T val,
                 std::int32_t /*srcLane*/,
                 std::int32_t /*width*/)
             {
                 return val;
             }
+        };
 
-            static auto shfl(
+        template<>
+        struct ShflUp<WarpSingleThread>
+        {
+            template<typename T>
+            static auto shfl_up(
+                warp::WarpSingleThread const& /*warp*/,
+                T val,
+                std::uint32_t /*srcLane*/,
+                std::int32_t /*width*/)
+            {
+                return val;
+            }
+        };
+
+        template<>
+        struct ShflDown<WarpSingleThread>
+        {
+            template<typename T>
+            static auto shfl_down(
+                warp::WarpSingleThread const& /*warp*/,
+                T val,
+                std::uint32_t /*srcLane*/,
+                std::int32_t /*width*/)
+            {
+                return val;
+            }
+        };
+
+        template<>
+        struct ShflXor<WarpSingleThread>
+        {
+            template<typename T>
+            static auto shfl_xor(
                 warp::WarpSingleThread const& /*warp*/,
-                float val,
+                T val,
                 std::int32_t /*srcLane*/,
                 std::int32_t /*width*/)
             {
diff --git a/include/alpaka/warp/WarpUniformCudaHipBuiltIn.hpp b/include/alpaka/warp/WarpUniformCudaHipBuiltIn.hpp
index 1e7d98d82d7d..58b89d5f81d9 100644
--- a/include/alpaka/warp/WarpUniformCudaHipBuiltIn.hpp
+++ b/include/alpaka/warp/WarpUniformCudaHipBuiltIn.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2022 Sergei Bastrakov, David M. Rogers, Jan Stephan, Andrea Bocci, Bernhard Manfred Gruber
+/* Copyright 2022 Sergei Bastrakov, David M. Rogers, Jan Stephan, Andrea Bocci, Bernhard Manfred Gruber, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -113,12 +113,12 @@ namespace alpaka::warp
         template<>
         struct Shfl<WarpUniformCudaHipBuiltIn>
         {
-            //-------------------------------------------------------------
+            template<typename T>
             __device__ static auto shfl(
                 [[maybe_unused]] warp::WarpUniformCudaHipBuiltIn const& warp,
-                float val,
+                T val,
                 int srcLane,
-                std::int32_t width) -> float
+                std::int32_t width) -> T
             {
 #        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
                 return __shfl_sync(activemask(warp), val, srcLane, width);
@@ -126,21 +126,62 @@ namespace alpaka::warp
                 return __shfl(val, srcLane, width);
 #        endif
             }
+        };
 
-            //-------------------------------------------------------------
-            __device__ static auto shfl(
+        template<>
+        struct ShflUp<WarpUniformCudaHipBuiltIn>
+        {
+            template<typename T>
+            __device__ static auto shfl_up(
                 [[maybe_unused]] warp::WarpUniformCudaHipBuiltIn const& warp,
-                std::int32_t val,
-                int srcLane,
-                std::int32_t width) -> std::int32_t
+                T val,
+                std::uint32_t offset,
+                std::int32_t width) -> T
             {
 #        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
-                return __shfl_sync(activemask(warp), val, srcLane, width);
+                return __shfl_up_sync(activemask(warp), val, offset, width);
 #        else
-                return __shfl(val, srcLane, width);
+                return __shfl_up(val, offset, width);
+#        endif
+            }
+        };
+
+        template<>
+        struct ShflDown<WarpUniformCudaHipBuiltIn>
+        {
+            template<typename T>
+            __device__ static auto shfl_down(
+                [[maybe_unused]] warp::WarpUniformCudaHipBuiltIn const& warp,
+                T val,
+                std::uint32_t offset,
+                std::int32_t width) -> T
+            {
+#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+                return __shfl_down_sync(activemask(warp), val, offset, width);
+#        else
+                return __shfl_down(val, offset, width);
+#        endif
+            }
+        };
+
+        template<>
+        struct ShflXor<WarpUniformCudaHipBuiltIn>
+        {
+            template<typename T>
+            __device__ static auto shfl_xor(
+                [[maybe_unused]] warp::WarpUniformCudaHipBuiltIn const& warp,
+                T val,
+                std::int32_t mask,
+                std::int32_t width) -> T
+            {
+#        if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
+                return __shfl_xor_sync(activemask(warp), val, mask, width);
+#        else
+                return __shfl_xor(val, mask, width);
 #        endif
             }
         };
+
     } // namespace trait
 #    endif
 } // namespace alpaka::warp
diff --git a/test/unit/warp/src/ShflDown.cpp b/test/unit/warp/src/ShflDown.cpp
new file mode 100644
index 000000000000..3663dc239829
--- /dev/null
+++ b/test/unit/warp/src/ShflDown.cpp
@@ -0,0 +1,175 @@
+/* Copyright 2023 Aurora Perego
+ *
+ * This file is part of Alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#include <alpaka/math/FloatEqualExact.hpp>
+#include <alpaka/test/KernelExecutionFixture.hpp>
+#include <alpaka/test/acc/TestAccs.hpp>
+#include <alpaka/test/queue/Queue.hpp>
+#include <alpaka/warp/Traits.hpp>
+
+#include <catch2/catch_template_test_macros.hpp>
+#include <catch2/catch_test_macros.hpp>
+
+#include <cstdint>
+#include <limits>
+
+#if BOOST_COMP_GNUC
+#    pragma GCC diagnostic push
+#    pragma GCC diagnostic ignored "-Wstrict-overflow"
+#endif
+
+struct ShflDownSingleThreadWarpTestKernel
+{
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TAcc>
+    ALPAKA_FN_ACC auto operator()(TAcc const& acc, bool* success) const -> void
+    {
+        if constexpr(alpaka::Dim<TAcc>::value > 0)
+        {
+            ALPAKA_CHECK(*success, alpaka::warp::getSize(acc) == 1);
+            ALPAKA_CHECK(*success, alpaka::warp::shfl_down(acc, 42, 0) == 42);
+        }
+        else
+        {
+            ALPAKA_CHECK(*success, alpaka::warp::shfl_down(acc, 42, 0, 1) == 42);
+        }
+        ALPAKA_CHECK(*success, alpaka::warp::shfl_down(acc, 12, 0) == 12);
+        float ans = alpaka::warp::shfl_down(acc, 3.3f, 0);
+        ALPAKA_CHECK(*success, alpaka::math::floatEqualExactNoWarning(ans, 3.3f));
+    }
+};
+
+template<std::uint32_t TWarpSize>
+struct ShflDownMultipleThreadWarpTestKernel
+{
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TAcc>
+    ALPAKA_FN_ACC auto operator()(TAcc const& acc, bool* success) const -> void
+    {
+        auto const localThreadIdx = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc);
+        auto const blockExtent = alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc);
+        std::int32_t const warpExtent = alpaka::warp::getSize(acc);
+        // Test relies on having a single warp per thread block
+        ALPAKA_CHECK(*success, static_cast<std::int32_t>(blockExtent.prod()) == warpExtent);
+        auto const threadIdxInWarp = std::int32_t(alpaka::mapIdx<1u>(localThreadIdx, blockExtent)[0]);
+
+        ALPAKA_CHECK(*success, warpExtent > 1);
+
+        ALPAKA_CHECK(*success, alpaka::warp::shfl_down(acc, 42, 0) == 42);
+        ALPAKA_CHECK(*success, alpaka::warp::shfl_down(acc, threadIdxInWarp, 0) == threadIdxInWarp);
+        ALPAKA_CHECK(
+            *success,
+            alpaka::warp::shfl_down(acc, threadIdxInWarp, 1)
+                == (threadIdxInWarp + 1 < warpExtent ? threadIdxInWarp + 1 : threadIdxInWarp));
+        auto const epsilon = std::numeric_limits<float>::epsilon();
+
+        // Test various widths
+        for(int width = 1; width < warpExtent; width *= 2)
+        {
+            for(int idx = 0; idx < width; idx++)
+            {
+                int const off = width * (threadIdxInWarp / width);
+                ALPAKA_CHECK(
+                    *success,
+                    alpaka::warp::shfl_down(acc, threadIdxInWarp, static_cast<std::uint32_t>(idx), width)
+                        == ((threadIdxInWarp + idx < (width + off)) ? threadIdxInWarp + idx : threadIdxInWarp));
+                float const ans = alpaka::warp::shfl_down(
+                    acc,
+                    4.0f - float(threadIdxInWarp),
+                    static_cast<std::uint32_t>(idx),
+                    width);
+                float const expect
+                    = ((threadIdxInWarp + idx < (width + off)) ? (4.0f - float(threadIdxInWarp + idx))
+                                                               : (4.0f - float(threadIdxInWarp)));
+                ALPAKA_CHECK(*success, alpaka::math::abs(acc, ans - expect) < epsilon);
+            }
+        }
+
+        // Some threads quit the kernel to test that the warp operations
+        // properly operate on the active threads only
+        if(threadIdxInWarp >= warpExtent / 2)
+            return;
+
+        for(int idx = 0; idx < warpExtent / 2; idx++)
+        {
+            auto const shfl = alpaka::warp::shfl_down(acc, threadIdxInWarp, static_cast<std::uint32_t>(idx));
+            float const ans
+                = alpaka::warp::shfl_down(acc, 4.0f - float(threadIdxInWarp), static_cast<std::uint32_t>(idx));
+            float const expect
+                = ((threadIdxInWarp + idx < warpExtent / 2) ? (4.0f - float(threadIdxInWarp + idx)) : 0);
+            if(threadIdxInWarp + idx < warpExtent / 2)
+            {
+                ALPAKA_CHECK(*success, shfl == threadIdxInWarp + idx);
+                ALPAKA_CHECK(*success, alpaka::math::abs(acc, ans - expect) < epsilon);
+            }
+        }
+    }
+};
+
+template<std::uint32_t TWarpSize, typename TAcc>
+struct alpaka::trait::WarpSize<ShflDownMultipleThreadWarpTestKernel<TWarpSize>, TAcc>
+    : std::integral_constant<std::uint32_t, TWarpSize>
+{
+};
+
+TEMPLATE_LIST_TEST_CASE("shfl_down", "[warp]", alpaka::test::TestAccs)
+{
+    using Acc = TestType;
+    using Dev = alpaka::Dev<Acc>;
+    using Dim = alpaka::Dim<Acc>;
+    using Idx = alpaka::Idx<Acc>;
+
+    auto const platform = alpaka::Platform<Acc>{};
+    Dev const dev(alpaka::getDevByIdx(platform, 0u));
+    auto const warpExtents = alpaka::getWarpSizes(dev);
+    for(auto const warpExtent : warpExtents)
+    {
+        auto const scalar = Dim::value == 0 || warpExtent == 1;
+        if(scalar)
+        {
+            alpaka::test::KernelExecutionFixture<Acc> fixture(alpaka::Vec<Dim, Idx>::all(4));
+            REQUIRE(fixture(ShflDownSingleThreadWarpTestKernel{}));
+        }
+        else
+        {
+            using ExecutionFixture = alpaka::test::KernelExecutionFixture<Acc>;
+            auto const gridBlockExtent = alpaka::Vec<Dim, Idx>::all(2);
+            // Enforce one warp per thread block
+            auto blockThreadExtent = alpaka::Vec<Dim, Idx>::ones();
+            blockThreadExtent[0] = static_cast<Idx>(warpExtent);
+            auto const threadElementExtent = alpaka::Vec<Dim, Idx>::ones();
+            auto workDiv = typename ExecutionFixture::WorkDiv{gridBlockExtent, blockThreadExtent, threadElementExtent};
+            auto fixture = ExecutionFixture{workDiv};
+            if(warpExtent == 4)
+            {
+                REQUIRE(fixture(ShflDownMultipleThreadWarpTestKernel<4>{}));
+            }
+            else if(warpExtent == 8)
+            {
+                REQUIRE(fixture(ShflDownMultipleThreadWarpTestKernel<8>{}));
+            }
+            else if(warpExtent == 16)
+            {
+                REQUIRE(fixture(ShflDownMultipleThreadWarpTestKernel<16>{}));
+            }
+            else if(warpExtent == 32)
+            {
+                REQUIRE(fixture(ShflDownMultipleThreadWarpTestKernel<32>{}));
+            }
+            else if(warpExtent == 64)
+            {
+                REQUIRE(fixture(ShflDownMultipleThreadWarpTestKernel<64>{}));
+            }
+        }
+    }
+}
+
+#if BOOST_COMP_GNUC
+#    pragma GCC diagnostic pop
+#endif
diff --git a/test/unit/warp/src/ShflUp.cpp b/test/unit/warp/src/ShflUp.cpp
new file mode 100644
index 000000000000..c0aa8d0832fa
--- /dev/null
+++ b/test/unit/warp/src/ShflUp.cpp
@@ -0,0 +1,167 @@
+/* Copyright 2023 Aurora Perego
+ *
+ * This file is part of Alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#include <alpaka/math/FloatEqualExact.hpp>
+#include <alpaka/test/KernelExecutionFixture.hpp>
+#include <alpaka/test/acc/TestAccs.hpp>
+#include <alpaka/test/queue/Queue.hpp>
+#include <alpaka/warp/Traits.hpp>
+
+#include <catch2/catch_template_test_macros.hpp>
+#include <catch2/catch_test_macros.hpp>
+
+#include <cstdint>
+#include <limits>
+
+struct ShflUpSingleThreadWarpTestKernel
+{
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TAcc>
+    ALPAKA_FN_ACC auto operator()(TAcc const& acc, bool* success) const -> void
+    {
+        if constexpr(alpaka::Dim<TAcc>::value > 0)
+        {
+            ALPAKA_CHECK(*success, alpaka::warp::getSize(acc) == 1);
+            ALPAKA_CHECK(*success, alpaka::warp::shfl_up(acc, 42, 0) == 42);
+        }
+        else
+        {
+            ALPAKA_CHECK(*success, alpaka::warp::shfl_up(acc, 42, 0, 1) == 42);
+        }
+        ALPAKA_CHECK(*success, alpaka::warp::shfl_up(acc, 12, 0) == 12);
+        float ans = alpaka::warp::shfl_up(acc, 3.3f, 0);
+        ALPAKA_CHECK(*success, alpaka::math::floatEqualExactNoWarning(ans, 3.3f));
+    }
+};
+
+template<std::uint32_t TWarpSize>
+struct ShflUpMultipleThreadWarpTestKernel
+{
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TAcc>
+    ALPAKA_FN_ACC auto operator()(TAcc const& acc, bool* success) const -> void
+    {
+        auto const localThreadIdx = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc);
+        auto const blockExtent = alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc);
+        std::int32_t const warpExtent = alpaka::warp::getSize(acc);
+        // Test relies on having a single warp per thread block
+        ALPAKA_CHECK(*success, static_cast<std::int32_t>(blockExtent.prod()) == warpExtent);
+        auto const threadIdxInWarp = std::int32_t(alpaka::mapIdx<1u>(localThreadIdx, blockExtent)[0]);
+
+        ALPAKA_CHECK(*success, warpExtent > 1);
+
+        ALPAKA_CHECK(*success, alpaka::warp::shfl_up(acc, 42, 0) == 42);
+        ALPAKA_CHECK(*success, alpaka::warp::shfl_up(acc, threadIdxInWarp, 0) == threadIdxInWarp);
+        ALPAKA_CHECK(
+            *success,
+            alpaka::warp::shfl_up(acc, threadIdxInWarp, 1)
+                == (threadIdxInWarp - 1 >= 0 ? threadIdxInWarp - 1 : threadIdxInWarp));
+
+        auto const epsilon = std::numeric_limits<float>::epsilon();
+
+        // Test various widths
+        for(int width = 1; width < warpExtent; width *= 2)
+        {
+            for(int idx = 0; idx < width; idx++)
+            {
+                int const off = width * (threadIdxInWarp / width);
+                ALPAKA_CHECK(
+                    *success,
+                    alpaka::warp::shfl_up(acc, threadIdxInWarp, static_cast<std::uint32_t>(idx), width)
+                        == ((threadIdxInWarp - idx >= off) ? threadIdxInWarp - idx : threadIdxInWarp));
+                float const ans = alpaka::warp::shfl_up(
+                    acc,
+                    4.0f - float(threadIdxInWarp),
+                    static_cast<std::uint32_t>(idx),
+                    width);
+                float const expect
+                    = ((threadIdxInWarp - idx >= off) ? (4.0f - float(threadIdxInWarp - idx))
+                                                      : (4.0f - float(threadIdxInWarp)));
+                ALPAKA_CHECK(*success, alpaka::math::abs(acc, ans - expect) < epsilon);
+            }
+        }
+
+        // Some threads quit the kernel to test that the warp operations
+        // properly operate on the active threads only
+        if(threadIdxInWarp >= warpExtent / 2)
+            return;
+
+        for(int idx = 0; idx < warpExtent / 2; idx++)
+        {
+            ALPAKA_CHECK(
+                *success,
+                alpaka::warp::shfl_up(acc, threadIdxInWarp, static_cast<std::uint32_t>(idx))
+                    == ((threadIdxInWarp - idx >= 0) ? (threadIdxInWarp - idx) : threadIdxInWarp));
+            float const ans
+                = alpaka::warp::shfl_up(acc, 4.0f - float(threadIdxInWarp), static_cast<std::uint32_t>(idx));
+            float const expect
+                = ((threadIdxInWarp - idx >= 0) ? (4.0f - float(threadIdxInWarp - idx))
+                                                : (4.0f - float(threadIdxInWarp)));
+            ALPAKA_CHECK(*success, alpaka::math::abs(acc, ans - expect) < epsilon);
+        }
+    }
+};
+
+template<std::uint32_t TWarpSize, typename TAcc>
+struct alpaka::trait::WarpSize<ShflUpMultipleThreadWarpTestKernel<TWarpSize>, TAcc>
+    : std::integral_constant<std::uint32_t, TWarpSize>
+{
+};
+
+TEMPLATE_LIST_TEST_CASE("shfl_up", "[warp]", alpaka::test::TestAccs)
+{
+    using Acc = TestType;
+    using Dev = alpaka::Dev<Acc>;
+    using Dim = alpaka::Dim<Acc>;
+    using Idx = alpaka::Idx<Acc>;
+
+    auto const platform = alpaka::Platform<Acc>{};
+    Dev const dev(alpaka::getDevByIdx(platform, 0u));
+    auto const warpExtents = alpaka::getWarpSizes(dev);
+    for(auto const warpExtent : warpExtents)
+    {
+        auto const scalar = Dim::value == 0 || warpExtent == 1;
+        if(scalar)
+        {
+            alpaka::test::KernelExecutionFixture<Acc> fixture(alpaka::Vec<Dim, Idx>::all(4));
+            REQUIRE(fixture(ShflUpSingleThreadWarpTestKernel{}));
+        }
+        else
+        {
+            using ExecutionFixture = alpaka::test::KernelExecutionFixture<Acc>;
+            auto const gridBlockExtent = alpaka::Vec<Dim, Idx>::all(2);
+            // Enforce one warp per thread block
+            auto blockThreadExtent = alpaka::Vec<Dim, Idx>::ones();
+            blockThreadExtent[0] = static_cast<Idx>(warpExtent);
+            auto const threadElementExtent = alpaka::Vec<Dim, Idx>::ones();
+            auto workDiv = typename ExecutionFixture::WorkDiv{gridBlockExtent, blockThreadExtent, threadElementExtent};
+            auto fixture = ExecutionFixture{workDiv};
+            if(warpExtent == 4)
+            {
+                REQUIRE(fixture(ShflUpMultipleThreadWarpTestKernel<4>{}));
+            }
+            else if(warpExtent == 8)
+            {
+                REQUIRE(fixture(ShflUpMultipleThreadWarpTestKernel<8>{}));
+            }
+            else if(warpExtent == 16)
+            {
+                REQUIRE(fixture(ShflUpMultipleThreadWarpTestKernel<16>{}));
+            }
+            else if(warpExtent == 32)
+            {
+                REQUIRE(fixture(ShflUpMultipleThreadWarpTestKernel<32>{}));
+            }
+            else if(warpExtent == 64)
+            {
+                REQUIRE(fixture(ShflUpMultipleThreadWarpTestKernel<64>{}));
+            }
+        }
+    }
+}
diff --git a/test/unit/warp/src/ShflXor.cpp b/test/unit/warp/src/ShflXor.cpp
new file mode 100644
index 000000000000..666f0d02a096
--- /dev/null
+++ b/test/unit/warp/src/ShflXor.cpp
@@ -0,0 +1,152 @@
+/* Copyright 2023 Aurora Perego
+ *
+ * This file is part of Alpaka.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#include <alpaka/math/FloatEqualExact.hpp>
+#include <alpaka/test/KernelExecutionFixture.hpp>
+#include <alpaka/test/acc/TestAccs.hpp>
+#include <alpaka/test/queue/Queue.hpp>
+#include <alpaka/warp/Traits.hpp>
+
+#include <catch2/catch_template_test_macros.hpp>
+#include <catch2/catch_test_macros.hpp>
+
+#include <cmath>
+#include <cstdint>
+#include <limits>
+
+struct ShflXorSingleThreadWarpTestKernel
+{
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TAcc>
+    ALPAKA_FN_ACC auto operator()(TAcc const& acc, bool* success) const -> void
+    {
+        if constexpr(alpaka::Dim<TAcc>::value > 0)
+        {
+            ALPAKA_CHECK(*success, alpaka::warp::getSize(acc) == 1);
+            ALPAKA_CHECK(*success, alpaka::warp::shfl_xor(acc, 42, -1) == 42);
+        }
+        else
+        {
+            ALPAKA_CHECK(*success, alpaka::warp::shfl_xor(acc, 42, 0, 1) == 42);
+        }
+        ALPAKA_CHECK(*success, alpaka::warp::shfl_xor(acc, 12, 0) == 12);
+        float ans = alpaka::warp::shfl_xor(acc, 3.3f, 0);
+        ALPAKA_CHECK(*success, alpaka::math::floatEqualExactNoWarning(ans, 3.3f));
+    }
+};
+
+template<std::uint32_t TWarpSize>
+struct ShflXorMultipleThreadWarpTestKernel
+{
+    ALPAKA_NO_HOST_ACC_WARNING
+    template<typename TAcc>
+    ALPAKA_FN_ACC auto operator()(TAcc const& acc, bool* success) const -> void
+    {
+        auto const localThreadIdx = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc);
+        auto const blockExtent = alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc);
+        std::int32_t const warpExtent = alpaka::warp::getSize(acc);
+        // Test relies on having a single warp per thread block
+        ALPAKA_CHECK(*success, static_cast<std::int32_t>(blockExtent.prod()) == warpExtent);
+        auto const threadIdxInWarp = std::int32_t(alpaka::mapIdx<1u>(localThreadIdx, blockExtent)[0]);
+
+        ALPAKA_CHECK(*success, warpExtent > 1);
+
+        ALPAKA_CHECK(*success, alpaka::warp::shfl_xor(acc, 42, 0) == 42);
+        ALPAKA_CHECK(*success, alpaka::warp::shfl_xor(acc, threadIdxInWarp, 0) == threadIdxInWarp);
+        ALPAKA_CHECK(*success, alpaka::warp::shfl_xor(acc, threadIdxInWarp, 1) == (threadIdxInWarp ^ 1));
+        ALPAKA_CHECK(*success, alpaka::warp::shfl_xor(acc, 5, -1) == 5);
+
+        auto const epsilon = std::numeric_limits<float>::epsilon();
+
+        // Test various widths
+        for(int width = 1; width < warpExtent; width *= 2)
+        {
+            for(int idx = 0; idx < width; idx++)
+            {
+                ALPAKA_CHECK(
+                    *success,
+                    alpaka::warp::shfl_xor(acc, threadIdxInWarp, idx, width) == (threadIdxInWarp ^ idx));
+                float const ans = alpaka::warp::shfl_xor(acc, 4.0f - float(threadIdxInWarp), idx, width);
+                float const expect = 4.0f - float(threadIdxInWarp ^ idx);
+                ALPAKA_CHECK(*success, alpaka::math::abs(acc, ans - expect) < epsilon);
+            }
+        }
+
+        // Some threads quit the kernel to test that the warp operations
+        // properly operate on the active threads only
+        if(threadIdxInWarp >= warpExtent / 2)
+            return;
+
+        for(int idx = 0; idx < warpExtent / 2; idx++)
+        {
+            ALPAKA_CHECK(*success, alpaka::warp::shfl_xor(acc, threadIdxInWarp, idx) == (threadIdxInWarp ^ idx));
+            float const ans = alpaka::warp::shfl_xor(acc, 4.0f - float(threadIdxInWarp), idx);
+            float const expect = 4.0f - float(threadIdxInWarp ^ idx);
+            ALPAKA_CHECK(*success, alpaka::math::abs(acc, ans - expect) < epsilon);
+        }
+    }
+};
+
+template<std::uint32_t TWarpSize, typename TAcc>
+struct alpaka::trait::WarpSize<ShflXorMultipleThreadWarpTestKernel<TWarpSize>, TAcc>
+    : std::integral_constant<std::uint32_t, TWarpSize>
+{
+};
+
+TEMPLATE_LIST_TEST_CASE("shfl_xor", "[warp]", alpaka::test::TestAccs)
+{
+    using Acc = TestType;
+    using Dev = alpaka::Dev<Acc>;
+    using Dim = alpaka::Dim<Acc>;
+    using Idx = alpaka::Idx<Acc>;
+
+    auto const platform = alpaka::Platform<Acc>{};
+    Dev const dev(alpaka::getDevByIdx(platform, 0u));
+    auto const warpExtents = alpaka::getWarpSizes(dev);
+    for(auto const warpExtent : warpExtents)
+    {
+        auto const scalar = Dim::value == 0 || warpExtent == 1;
+        if(scalar)
+        {
+            alpaka::test::KernelExecutionFixture<Acc> fixture(alpaka::Vec<Dim, Idx>::all(4));
+            REQUIRE(fixture(ShflXorSingleThreadWarpTestKernel{}));
+        }
+        else
+        {
+            using ExecutionFixture = alpaka::test::KernelExecutionFixture<Acc>;
+            auto const gridBlockExtent = alpaka::Vec<Dim, Idx>::all(2);
+            // Enforce one warp per thread block
+            auto blockThreadExtent = alpaka::Vec<Dim, Idx>::ones();
+            blockThreadExtent[0] = static_cast<Idx>(warpExtent);
+            auto const threadElementExtent = alpaka::Vec<Dim, Idx>::ones();
+            auto workDiv = typename ExecutionFixture::WorkDiv{gridBlockExtent, blockThreadExtent, threadElementExtent};
+            auto fixture = ExecutionFixture{workDiv};
+            if(warpExtent == 4)
+            {
+                REQUIRE(fixture(ShflXorMultipleThreadWarpTestKernel<4>{}));
+            }
+            else if(warpExtent == 8)
+            {
+                REQUIRE(fixture(ShflXorMultipleThreadWarpTestKernel<8>{}));
+            }
+            else if(warpExtent == 16)
+            {
+                REQUIRE(fixture(ShflXorMultipleThreadWarpTestKernel<16>{}));
+            }
+            else if(warpExtent == 32)
+            {
+                REQUIRE(fixture(ShflXorMultipleThreadWarpTestKernel<32>{}));
+            }
+            else if(warpExtent == 64)
+            {
+                REQUIRE(fixture(ShflXorMultipleThreadWarpTestKernel<64>{}));
+            }
+        }
+    }
+}