diff --git a/README_SYCL.md b/README_SYCL.md
index eead8a8a1acc..ccac62293474 100644
--- a/README_SYCL.md
+++ b/README_SYCL.md
@@ -53,9 +53,9 @@ To enable device-side printing add the following compiler flags:
 * `-DALPAKA_SYCL_IOSTREAM_ENABLED`: to enable device-side printing.
 * `-DALPAKA_SYCL_IOSTREAM_KIB=<value>`: `<value>` (without the brackets) defines the kibibytes per block to be reserved for device-side printing. `<value>` cannot exceed the amount of shared memory per block.
 
-### Building for Intel CPUs
+### Building for x86 64-bit CPUs
 
-1. `#include <alpaka/standalone/CpuSyclIntel.hpp>` in your C++ code.
+1. `#include <alpaka/standalone/CpuSycl.hpp>` in your C++ code.
 2. Add the following flags:
   * `-fsycl-targets=spir64_x86_64` (compiler and linker): to enable CPU compilation. Note: If you are using multiple SYCL hardware targets (like CPU and GPU) separate them by comma here.
   * `-Xsycl-target-backend=spir64_x86_64 "-march=<ISA>"` (linker): to choose the Intel ISA to compile for. Check the output of `opencl-aot --help` and look for the possible values of the `--march` flag.
@@ -84,7 +84,7 @@ To enable device-side printing add the following compiler flags:
 
 In contrast to the other back-ends the SYCL back-end comes with multiple different accelerators which should be chosen according to your requirements:
 
-* `alpaka::experimental::AccCpuSyclIntel` for targeting Intel CPUs. In contrast to the other CPU back-ends this will be using Intel's OpenCL implementation for CPUs under the hood.
+* `alpaka::experimental::AccCpuSycl` for targeting Intel and AMD CPUs. In contrast to the other CPU back-ends this will use Intel's OpenCL implementation for CPUs under the hood.
 * `alpaka::experimental::AccFpgaSyclIntel` for targeting Intel FPGAs.
 * `alpaka::experimental::AccGpuSyclIntel` for targeting Intel GPUs. 
 
@@ -92,9 +92,13 @@ These can be used interchangeably (some restrictions apply - see below) with the
 
 ### Restrictions
 
-* The FPGA back-ends (both vendors) cannot be used together with the Intel CPU / GPU back-ends. This is because of the different compilation trajectory required for FPGAs and is unlikely to be fixed anytime soon. See [here](https://www.intel.com/content/www/us/en/develop/documentation/oneapi-programming-guide/top/programming-interface/fpga-flow/why-is-fpga-compilation-different.html) for an explanation.
-* The SYCL back-end currently does not support passing pointers as kernel parameters. Use alpaka's experimental accessors instead.
-* The SYCL back-end does not have device-side random number generation.
+* The Intel FPGA back-end cannot be used together with the Intel CPU / GPU back-ends. This is because of the different compilation trajectory required for FPGAs and is unlikely to be fixed anytime soon. See [here](https://www.intel.com/content/www/us/en/develop/documentation/oneapi-programming-guide/top/programming-interface/fpga-flow/why-is-fpga-compilation-different.html) for an explanation.
 * Similar to the CUDA and HIP back-ends the SYCL back-end only supports up to three kernel dimensions.
 * Some Intel GPUs do not support the `double` type for device code. alpaka will not check this.
-* The FPGA back-end does not support atomics. alpaka will not check this.
\ No newline at end of file
+  You can enable software emulation for `double` precision types with
+  ```bash
+  export IGC_EnableDPEmulation=1
+  export OverrideDefaultFP64Settings=1
+  ```
+  See [Intel's FAQ](https://github.com/intel/compute-runtime/blob/master/opencl/doc/FAQ.md#feature-double-precision-emulation-fp64) for more information.
+* The FPGA back-end does not support atomics. alpaka will not check this.
diff --git a/cmake/alpakaCommon.cmake b/cmake/alpakaCommon.cmake
index e4ec50a8cd2b..27776497138d 100644
--- a/cmake/alpakaCommon.cmake
+++ b/cmake/alpakaCommon.cmake
@@ -1,5 +1,5 @@
 #
-# Copyright 2023 Benjamin Worpitz, Erik Zenker, Axel Hübl, Jan Stephan, René Widera, Jeffrey Kelling, Andrea Bocci, Bernhard Manfred Gruber
+# Copyright 2023 Benjamin Worpitz, Erik Zenker, Axel Hübl, Jan Stephan, René Widera, Jeffrey Kelling, Andrea Bocci, Bernhard Manfred Gruber, Aurora Perego
 # SPDX-License-Identifier: MPL-2.0
 #
 
@@ -547,9 +547,6 @@ if(alpaka_ACC_SYCL_ENABLE)
 
         # Enable device-side printing to stdout
         cmake_dependent_option(alpaka_SYCL_ENABLE_IOSTREAM "Enable device-side printing to stdout" OFF "alpaka_ACC_SYCL_ENABLE" OFF)
-        if(BUILD_TESTING)
-            set(alpaka_SYCL_ENABLE_IOSTREAM ON CACHE BOOL "Enable device-side printing to stdout" FORCE)
-        endif()
 
         alpaka_set_compiler_options(HOST_DEVICE target alpaka "-fsycl")
         target_link_options(alpaka INTERFACE "-fsycl")
@@ -559,7 +556,7 @@ if(alpaka_ACC_SYCL_ENABLE)
         # Determine SYCL targets
         set(alpaka_SYCL_ONEAPI_CPU_TARGET "spir64_x86_64")
         set(alpaka_SYCL_ONEAPI_FPGA_TARGET "spir64_fpga")
-        set(alpaka_SYCL_ONEAPI_GPU_TARGET "spir64_gen")
+        set(alpaka_SYCL_ONEAPI_GPU_TARGET ${alpaka_SYCL_ONEAPI_GPU_DEVICES})
 
         if(alpaka_SYCL_ONEAPI_CPU)
             list(APPEND alpaka_SYCL_TARGETS ${alpaka_SYCL_ONEAPI_CPU_TARGET})
@@ -620,14 +617,13 @@ if(alpaka_ACC_SYCL_ENABLE)
         if(alpaka_SYCL_ONEAPI_GPU)
             # Create a drop-down list (in cmake-gui) of valid Intel GPU targets. On the command line the user can specifiy
             # additional targets, such as ranges: "Gen8-Gen12LP" or lists: "icllp;skl".
-            set(alpaka_SYCL_ONEAPI_GPU_DEVICES "bdw" CACHE STRING "Intel GPU devices / generations to compile for")
+            set(alpaka_SYCL_ONEAPI_GPU_DEVICES "intel_gpu_pvc" CACHE STRING "Intel GPU devices / generations to compile for")
             set_property(CACHE alpaka_SYCL_ONEAPI_GPU_DEVICES
-                        PROPERTY STRINGS "bdw;skl;kbl;cfl;bxt;glk;whl;aml;cml;icllp;lkf;ehl;tgllp;rkl;adl-s;adl-p;dg1;acm-g10;ats-m150;dg2-g10;acm-g11;ats-m75;dg2-g11;acm-g12;dg2-g12;pvc-sdv;pvc;gen11;gen12lp;gen8;gen9;xe;xe-hpc;xe-hpg")
+                        PROPERTY STRINGS "intel_gpu_pvc;intel_gpu_acm_g12;intel_gpu_acm_g11;intel_gpu_acm_g10;intel_gpu_dg1;intel_gpu_adl_n;intel_gpu_adl_p;intel_gpu_rpl_s;intel_gpu_adl_s;intel_gpu_rkl;intel_gpu_tgllp;intel_gpu_icllp;intel_gpu_cml;intel_gpu_aml;intel_gpu_whl;intel_gpu_glk;intel_gpu_apl;intel_gpu_cfl;intel_gpu_kbl;intel_gpu_skl;intel_gpu_bdw")
             # If the user has given us a list turn all ';' into ',' to pacify the Intel OpenCL compiler.
             string(REPLACE ";" "," alpaka_SYCL_ONEAPI_GPU_DEVICES "${alpaka_SYCL_ONEAPI_GPU_DEVICES}")
             
             target_compile_definitions(alpaka INTERFACE "ALPAKA_SYCL_ONEAPI_GPU")
-            target_link_options(alpaka INTERFACE "SHELL:-Xsycl-target-backend=${alpaka_SYCL_ONEAPI_GPU_TARGET} \"-device ${alpaka_SYCL_ONEAPI_GPU_DEVICES}\"")
         endif()
 
         #-----------------------------------------------------------------------------------------------------------------
diff --git a/example/vectorAdd/src/vectorAdd.cpp b/example/vectorAdd/src/vectorAdd.cpp
index e057e359e43d..a67362200189 100644
--- a/example/vectorAdd/src/vectorAdd.cpp
+++ b/example/vectorAdd/src/vectorAdd.cpp
@@ -1,4 +1,5 @@
-/* Copyright 2023 Benjamin Worpitz, Matthias Werner, Bernhard Manfred Gruber, Jan Stephan
+/* Copyright 2023 Benjamin Worpitz, Matthias Werner, Bernhard Manfred Gruber, Jan Stephan, Luca Ferragina,
+ *                Aurora Perego
  * SPDX-License-Identifier: ISC
  */
 
@@ -76,6 +77,8 @@ auto main() -> int
     // - AccCpuSerial
     // using Acc = alpaka::AccCpuSerial<Dim, Idx>;
     using Acc = alpaka::ExampleDefaultAcc<Dim, Idx>;
+    using Pltf = alpaka::Pltf<Acc>;
+    using DevAcc = alpaka::Dev<Acc>;
     std::cout << "Using alpaka accelerator: " << alpaka::getAccName<Acc>() << std::endl;
 
     // Defines the synchronization behavior of a queue
@@ -136,7 +139,7 @@ auto main() -> int
     }
 
     // Allocate 3 buffers on the accelerator
-    using BufAcc = alpaka::Buf<Acc, Data, Dim, Idx>;
+    using BufAcc = alpaka::Buf<DevAcc, Data, Dim, Idx>;
     BufAcc bufAccA(alpaka::allocBuf<Data, Idx>(devAcc, extent));
     BufAcc bufAccB(alpaka::allocBuf<Data, Idx>(devAcc, extent));
     BufAcc bufAccC(alpaka::allocBuf<Data, Idx>(devAcc, extent));
diff --git a/include/alpaka/acc/AccCpuSyclIntel.hpp b/include/alpaka/acc/AccCpuSycl.hpp
similarity index 65%
rename from include/alpaka/acc/AccCpuSyclIntel.hpp
rename to include/alpaka/acc/AccCpuSycl.hpp
index f1f53fad45b6..e9e5504bae67 100644
--- a/include/alpaka/acc/AccCpuSyclIntel.hpp
+++ b/include/alpaka/acc/AccCpuSycl.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2022 Jan Stephan
+/* Copyright 2023 Jan Stephan, Luca Ferragina, Andrea Bocci
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -9,11 +9,11 @@
 #include "alpaka/core/Concepts.hpp"
 #include "alpaka/core/DemangleTypeNames.hpp"
 #include "alpaka/core/Sycl.hpp"
-#include "alpaka/dev/DevCpuSyclIntel.hpp"
+#include "alpaka/dev/DevCpuSycl.hpp"
 #include "alpaka/dev/Traits.hpp"
-#include "alpaka/kernel/TaskKernelCpuSyclIntel.hpp"
+#include "alpaka/kernel/TaskKernelCpuSycl.hpp"
 #include "alpaka/kernel/Traits.hpp"
-#include "alpaka/pltf/PltfCpuSyclIntel.hpp"
+#include "alpaka/pltf/PltfCpuSycl.hpp"
 #include "alpaka/pltf/Traits.hpp"
 #include "alpaka/vec/Vec.hpp"
 
@@ -23,7 +23,7 @@
 
 #if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_BACKEND_ONEAPI) && defined(ALPAKA_SYCL_ONEAPI_CPU)
 
-#    include <CL/sycl.hpp>
+#    include <sycl/sycl.hpp>
 
 namespace alpaka
 {
@@ -31,9 +31,9 @@ namespace alpaka
     //!
     //! This accelerator allows parallel kernel execution on a oneAPI-capable Intel CPU target device.
     template<typename TDim, typename TIdx>
-    class AccCpuSyclIntel final
+    class AccCpuSycl final
         : public AccGenericSycl<TDim, TIdx>
-        , public concepts::Implements<ConceptAcc, AccCpuSyclIntel<TDim, TIdx>>
+        , public concepts::Implements<ConceptAcc, AccCpuSycl<TDim, TIdx>>
     {
     public:
         using AccGenericSycl<TDim, TIdx>::AccGenericSycl;
@@ -44,28 +44,28 @@ namespace alpaka::trait
 {
     //! The Intel CPU SYCL accelerator name trait specialization.
     template<typename TDim, typename TIdx>
-    struct GetAccName<AccCpuSyclIntel<TDim, TIdx>>
+    struct GetAccName<AccCpuSycl<TDim, TIdx>>
     {
         static auto getAccName() -> std::string
         {
-            return "AccCpuSyclIntel<" + std::to_string(TDim::value) + "," + core::demangled<TIdx> + ">";
+            return "AccCpuSycl<" + std::to_string(TDim::value) + "," + core::demangled<TIdx> + ">";
         }
     };
 
     //! The Intel CPU SYCL accelerator device type trait specialization.
     template<typename TDim, typename TIdx>
-    struct DevType<AccCpuSyclIntel<TDim, TIdx>>
+    struct DevType<AccCpuSycl<TDim, TIdx>>
     {
-        using type = DevCpuSyclIntel;
+        using type = DevCpuSycl;
     };
 
     //! The Intel CPU SYCL accelerator execution task type trait specialization.
     template<typename TDim, typename TIdx, typename TWorkDiv, typename TKernelFnObj, typename... TArgs>
-    struct CreateTaskKernel<AccCpuSyclIntel<TDim, TIdx>, TWorkDiv, TKernelFnObj, TArgs...>
+    struct CreateTaskKernel<AccCpuSycl<TDim, TIdx>, TWorkDiv, TKernelFnObj, TArgs...>
     {
         static auto createTaskKernel(TWorkDiv const& workDiv, TKernelFnObj const& kernelFnObj, TArgs&&... args)
         {
-            return TaskKernelCpuSyclIntel<TDim, TIdx, TKernelFnObj, TArgs...>{
+            return TaskKernelCpuSycl<TDim, TIdx, TKernelFnObj, TArgs...>{
                 workDiv,
                 kernelFnObj,
                 std::forward<TArgs>(args)...};
@@ -74,21 +74,21 @@ namespace alpaka::trait
 
     //! The Intel CPU SYCL execution task platform type trait specialization.
     template<typename TDim, typename TIdx>
-    struct PltfType<AccCpuSyclIntel<TDim, TIdx>>
+    struct PltfType<AccCpuSycl<TDim, TIdx>>
     {
-        using type = PltfCpuSyclIntel;
+        using type = PltfCpuSycl;
     };
 
     template<typename TDim, typename TIdx>
-    struct AccToTag<alpaka::AccCpuSyclIntel<TDim, TIdx>>
+    struct AccToTag<alpaka::AccCpuSycl<TDim, TIdx>>
     {
-        using type = alpaka::TagCpuSyclIntel;
+        using type = alpaka::TagCpuSycl;
     };
 
     template<typename TDim, typename TIdx>
-    struct TagToAcc<alpaka::TagCpuSyclIntel, TDim, TIdx>
+    struct TagToAcc<alpaka::TagCpuSycl, TDim, TIdx>
     {
-        using type = alpaka::AccCpuSyclIntel<TDim, TIdx>;
+        using type = alpaka::AccCpuSycl<TDim, TIdx>;
     };
 } // namespace alpaka::trait
 
diff --git a/include/alpaka/acc/AccFpgaSyclIntel.hpp b/include/alpaka/acc/AccFpgaSyclIntel.hpp
index 7ddc7b2b65de..7e84ce6d5e39 100644
--- a/include/alpaka/acc/AccFpgaSyclIntel.hpp
+++ b/include/alpaka/acc/AccFpgaSyclIntel.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2022 Jan Stephan
+/* Copyright 2023 Jan Stephan, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -22,7 +22,7 @@
 
 #if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_BACKEND_ONEAPI) && defined(ALPAKA_SYCL_ONEAPI_FPGA)
 
-#    include <CL/sycl.hpp>
+#    include <sycl/sycl.hpp>
 
 namespace alpaka
 {
@@ -45,7 +45,7 @@ namespace alpaka::trait
     template<typename TDim, typename TIdx>
     struct GetAccName<AccFpgaSyclIntel<TDim, TIdx>>
     {
-        ALPAKA_FN_HOST static auto getAccName() -> std::string
+        static auto getAccName() -> std::string
         {
             return "AccFpgaSyclIntel<" + std::to_string(TDim::value) + "," + core::demangled<TIdx> + ">";
         }
diff --git a/include/alpaka/acc/AccGenericSycl.hpp b/include/alpaka/acc/AccGenericSycl.hpp
index 211d89a7740f..34400442c31c 100644
--- a/include/alpaka/acc/AccGenericSycl.hpp
+++ b/include/alpaka/acc/AccGenericSycl.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2023 Jan Stephan, Antonio Di Pilato, Andrea Bocci
+/* Copyright 2023 Jan Stephan, Antonio Di Pilato, Andrea Bocci, Luca Ferragina, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -15,6 +15,7 @@
 #include "alpaka/intrinsic/IntrinsicGenericSycl.hpp"
 #include "alpaka/math/MathGenericSycl.hpp"
 #include "alpaka/mem/fence/MemFenceGenericSycl.hpp"
+#include "alpaka/rand/RandGenericSycl.hpp"
 #include "alpaka/warp/WarpGenericSycl.hpp"
 #include "alpaka/workdiv/WorkDivGenericSycl.hpp"
 
@@ -37,7 +38,7 @@
 
 #ifdef ALPAKA_ACC_SYCL_ENABLED
 
-#    include <CL/sycl.hpp>
+#    include <sycl/sycl.hpp>
 
 namespace alpaka
 {
@@ -56,40 +57,17 @@ namespace alpaka
         , public BlockSyncGenericSycl<TDim>
         , public IntrinsicGenericSycl
         , public MemFenceGenericSycl
+        , public rand::RandGenericSycl<TDim>
         , public warp::WarpGenericSycl<TDim>
     {
+        static_assert(TDim::value > 0, "The SYCL accelerator must have a dimension greater than zero.");
+
     public:
         AccGenericSycl(AccGenericSycl const&) = delete;
         AccGenericSycl(AccGenericSycl&&) = delete;
         auto operator=(AccGenericSycl const&) -> AccGenericSycl& = delete;
         auto operator=(AccGenericSycl&&) -> AccGenericSycl& = delete;
 
-#    ifdef ALPAKA_SYCL_IOSTREAM_ENABLED
-        AccGenericSycl(
-            Vec<TDim, TIdx> const& threadElemExtent,
-            sycl::nd_item<TDim::value> work_item,
-            sycl::local_accessor<std::byte> dyn_shared_acc,
-            sycl::local_accessor<std::byte> st_shared_acc,
-            sycl::accessor<int, 1, sycl::access_mode::read_write, sycl::target::device> global_fence_dummy,
-            sycl::local_accessor<int> local_fence_dummy,
-            sycl::stream output_stream)
-            : WorkDivGenericSycl<TDim, TIdx>{threadElemExtent, work_item}
-            , gb::IdxGbGenericSycl<TDim, TIdx>{work_item}
-            , bt::IdxBtGenericSycl<TDim, TIdx>{work_item}
-            , AtomicHierarchy<AtomicGenericSycl, AtomicGenericSycl, AtomicGenericSycl>{}
-            , math::MathGenericSycl{}
-            , BlockSharedMemDynGenericSycl{dyn_shared_acc}
-            , BlockSharedMemStGenericSycl{st_shared_acc}
-            , BlockSyncGenericSycl<TDim>{work_item}
-            , IntrinsicGenericSycl{}
-            , MemFenceGenericSycl{global_fence_dummy, local_fence_dummy}
-            , warp::WarpGenericSycl<TDim>{work_item}
-            , cout{output_stream}
-        {
-        }
-
-        sycl::stream cout;
-#    else
         AccGenericSycl(
             Vec<TDim, TIdx> const& threadElemExtent,
             sycl::nd_item<TDim::value> work_item,
@@ -107,10 +85,10 @@ namespace alpaka
             , BlockSyncGenericSycl<TDim>{work_item}
             , IntrinsicGenericSycl{}
             , MemFenceGenericSycl{global_fence_dummy, local_fence_dummy}
+            , rand::RandGenericSycl<TDim>{work_item}
             , warp::WarpGenericSycl<TDim>{work_item}
         {
         }
-#    endif
     };
 } // namespace alpaka
 
diff --git a/include/alpaka/acc/Tag.hpp b/include/alpaka/acc/Tag.hpp
index f950bb03b56e..03a6449a75a9 100644
--- a/include/alpaka/acc/Tag.hpp
+++ b/include/alpaka/acc/Tag.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2023 Simeon Ehrig, Jan Stephan
+/* Copyright 2023 Simeon Ehrig, Jan Stephan, Andrea Bocci
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -23,7 +23,7 @@ namespace alpaka
     CREATE_ACC_TAG(TagCpuOmp2Blocks);
     CREATE_ACC_TAG(TagCpuOmp2Threads);
     CREATE_ACC_TAG(TagCpuSerial);
-    CREATE_ACC_TAG(TagCpuSyclIntel);
+    CREATE_ACC_TAG(TagCpuSycl);
     CREATE_ACC_TAG(TagCpuTbbBlocks);
     CREATE_ACC_TAG(TagCpuThreads);
     CREATE_ACC_TAG(TagFpgaSyclIntel);
diff --git a/include/alpaka/alpaka.hpp b/include/alpaka/alpaka.hpp
index 43829f951fde..327e53fab9f0 100644
--- a/include/alpaka/alpaka.hpp
+++ b/include/alpaka/alpaka.hpp
@@ -1,5 +1,5 @@
 /* Copyright 2023 Axel Hübl, Benjamin Worpitz, Erik Zenker, Matthias Werner, René Widera, Bernhard Manfred Gruber,
- *                Jan Stephan, Antonio Di Pilato
+ *                Jan Stephan, Antonio Di Pilato, Luca Ferragina, Aurora Perego, Andrea Bocci
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -13,7 +13,7 @@
 #include "alpaka/acc/AccCpuOmp2Blocks.hpp"
 #include "alpaka/acc/AccCpuOmp2Threads.hpp"
 #include "alpaka/acc/AccCpuSerial.hpp"
-#include "alpaka/acc/AccCpuSyclIntel.hpp"
+#include "alpaka/acc/AccCpuSycl.hpp"
 #include "alpaka/acc/AccCpuTbbBlocks.hpp"
 #include "alpaka/acc/AccCpuThreads.hpp"
 #include "alpaka/acc/AccDevProps.hpp"
@@ -75,7 +75,7 @@
 #include "alpaka/core/Vectorize.hpp"
 // dev
 #include "alpaka/dev/DevCpu.hpp"
-#include "alpaka/dev/DevCpuSyclIntel.hpp"
+#include "alpaka/dev/DevCpuSycl.hpp"
 #include "alpaka/dev/DevCudaRt.hpp"
 #include "alpaka/dev/DevFpgaSyclIntel.hpp"
 #include "alpaka/dev/DevGenericSycl.hpp"
@@ -89,7 +89,7 @@
 #include "alpaka/dim/Traits.hpp"
 // event
 #include "alpaka/event/EventCpu.hpp"
-#include "alpaka/event/EventCpuSyclIntel.hpp"
+#include "alpaka/event/EventCpuSycl.hpp"
 #include "alpaka/event/EventCudaRt.hpp"
 #include "alpaka/event/EventFpgaSyclIntel.hpp"
 #include "alpaka/event/EventGenericSycl.hpp"
@@ -114,7 +114,7 @@
 #include "alpaka/kernel/TaskKernelCpuOmp2Blocks.hpp"
 #include "alpaka/kernel/TaskKernelCpuOmp2Threads.hpp"
 #include "alpaka/kernel/TaskKernelCpuSerial.hpp"
-#include "alpaka/kernel/TaskKernelCpuSyclIntel.hpp"
+#include "alpaka/kernel/TaskKernelCpuSycl.hpp"
 #include "alpaka/kernel/TaskKernelCpuTbbBlocks.hpp"
 #include "alpaka/kernel/TaskKernelCpuThreads.hpp"
 #include "alpaka/kernel/TaskKernelFpgaSyclIntel.hpp"
@@ -133,7 +133,7 @@
 #include "alpaka/mem/alloc/AllocCpuNew.hpp"
 #include "alpaka/mem/alloc/Traits.hpp"
 #include "alpaka/mem/buf/BufCpu.hpp"
-#include "alpaka/mem/buf/BufCpuSyclIntel.hpp"
+#include "alpaka/mem/buf/BufCpuSycl.hpp"
 #include "alpaka/mem/buf/BufCudaRt.hpp"
 #include "alpaka/mem/buf/BufFpgaSyclIntel.hpp"
 #include "alpaka/mem/buf/BufGenericSycl.hpp"
@@ -178,7 +178,7 @@
 #include "alpaka/offset/Traits.hpp"
 // platform
 #include "alpaka/pltf/PltfCpu.hpp"
-#include "alpaka/pltf/PltfCpuSyclIntel.hpp"
+#include "alpaka/pltf/PltfCpuSycl.hpp"
 #include "alpaka/pltf/PltfCudaRt.hpp"
 #include "alpaka/pltf/PltfFpgaSyclIntel.hpp"
 #include "alpaka/pltf/PltfGpuSyclIntel.hpp"
@@ -186,6 +186,7 @@
 #include "alpaka/pltf/Traits.hpp"
 // rand
 #include "alpaka/rand/RandDefault.hpp"
+#include "alpaka/rand/RandGenericSycl.hpp"
 #include "alpaka/rand/RandPhilox.hpp"
 #include "alpaka/rand/RandStdLib.hpp"
 #include "alpaka/rand/RandUniformCudaHipRand.hpp"
@@ -196,8 +197,8 @@
 #include "alpaka/queue/Properties.hpp"
 #include "alpaka/queue/QueueCpuBlocking.hpp"
 #include "alpaka/queue/QueueCpuNonBlocking.hpp"
-#include "alpaka/queue/QueueCpuSyclIntelBlocking.hpp"
-#include "alpaka/queue/QueueCpuSyclIntelNonBlocking.hpp"
+#include "alpaka/queue/QueueCpuSyclBlocking.hpp"
+#include "alpaka/queue/QueueCpuSyclNonBlocking.hpp"
 #include "alpaka/queue/QueueCudaRtBlocking.hpp"
 #include "alpaka/queue/QueueCudaRtNonBlocking.hpp"
 #include "alpaka/queue/QueueFpgaSyclIntelBlocking.hpp"
diff --git a/include/alpaka/atomic/AtomicGenericSycl.hpp b/include/alpaka/atomic/AtomicGenericSycl.hpp
index 742adcdfa3e7..8ebf608dc570 100644
--- a/include/alpaka/atomic/AtomicGenericSycl.hpp
+++ b/include/alpaka/atomic/AtomicGenericSycl.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2023 Jan Stephan, Andrea Bocci
+/* Copyright 2023 Jan Stephan, Andrea Bocci, Luca Ferragina
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -14,7 +14,7 @@
 
 #ifdef ALPAKA_ACC_SYCL_ENABLED
 
-#    include <CL/sycl.hpp>
+#    include <sycl/sycl.hpp>
 
 namespace alpaka
 {
@@ -98,17 +98,12 @@ namespace alpaka
         inline auto casWithCondition(T* const addr, TEval&& eval)
         {
             auto ref = TRef{*addr};
-
             auto old_val = ref.load();
-            auto assumed = T{};
 
-            do
+            // prefer compare_exchange_weak when in a loop, assuming that eval is not expensive
+            while(!ref.compare_exchange_weak(old_val, eval(old_val)))
             {
-                assumed = old_val;
-                auto const new_val = eval(old_val);
-                old_val = ref.compare_exchange_strong(assumed, new_val);
-            } while(assumed != old_val);
-
+            }
 
             return old_val;
         }
@@ -182,7 +177,9 @@ namespace alpaka::trait
     template<typename T, typename THierarchy>
     struct AtomicOp<AtomicExch, AtomicGenericSycl, T, THierarchy>
     {
-        static_assert(std::is_integral_v<T> || std::is_floating_point_v<T>, "SYCL atomics do not support this type");
+        static_assert(
+            (std::is_integral_v<T> || std::is_floating_point_v<T>) &&(sizeof(T) == 4 || sizeof(T) == 8),
+            "SYCL atomics do not support this type");
 
         static auto atomicOp(AtomicGenericSycl const&, T* const addr, T const& value) -> T
         {
@@ -195,11 +192,14 @@ namespace alpaka::trait
     template<typename T, typename THierarchy>
     struct AtomicOp<AtomicInc, AtomicGenericSycl, T, THierarchy>
     {
-        static_assert(std::is_unsigned_v<T>, "atomicInc only supported for unsigned types");
+        static_assert(
+            std::is_unsigned_v<T> && (sizeof(T) == 4 || sizeof(T) == 8),
+            "SYCL atomics support only 32- and 64-bits unsigned integral types");
 
         static auto atomicOp(AtomicGenericSycl const&, T* const addr, T const& value) -> T
         {
-            auto inc = [&value](auto old_val) { return (old_val >= value) ? static_cast<T>(0) : (old_val + 1u); };
+            auto inc = [&value](auto old_val)
+            { return (old_val >= value) ? static_cast<T>(0) : (old_val + static_cast<T>(1)); };
             if(auto ptr = alpaka::detail::get_global_ptr(addr); ptr != nullptr)
                 return alpaka::detail::casWithCondition<alpaka::detail::global_ref<T, THierarchy>>(addr, inc);
             else
@@ -212,12 +212,14 @@ namespace alpaka::trait
     template<typename T, typename THierarchy>
     struct AtomicOp<AtomicDec, AtomicGenericSycl, T, THierarchy>
     {
-        static_assert(std::is_unsigned_v<T>, "atomicDec only supported for unsigned types");
+        static_assert(
+            std::is_unsigned_v<T> && (sizeof(T) == 4 || sizeof(T) == 8),
+            "SYCL atomics support only 32- and 64-bits unsigned integral types");
 
         static auto atomicOp(AtomicGenericSycl const&, T* const addr, T const& value) -> T
         {
-            auto dec
-                = [&value](auto& old_val) { return ((old_val == 0) || (old_val > value)) ? value : (old_val - 1u); };
+            auto dec = [&value](auto& old_val)
+            { return ((old_val == 0) || (old_val > value)) ? value : (old_val - static_cast<T>(1)); };
             if(auto ptr = alpaka::detail::get_global_ptr(addr); ptr != nullptr)
                 return alpaka::detail::casWithCondition<alpaka::detail::global_ref<T, THierarchy>>(addr, dec);
             else
@@ -275,22 +277,21 @@ namespace alpaka::trait
     {
         static_assert(std::is_integral_v<T> || std::is_floating_point_v<T>, "SYCL atomics do not support this type");
 
-        static auto atomicOp(AtomicGenericSycl const&, T* const addr, T const& compare, T const& value) -> T
+        static auto atomicOp(AtomicGenericSycl const&, T* const addr, T const& expected, T const& desired) -> T
         {
-            auto cas = [&compare, &value](auto& ref)
+            auto cas = [&expected, &desired](auto& ref)
             {
-                // SYCL stores the value in *addr to the "compare" parameter if the values are not equal. Since
-                // alpaka's interface does not expect this we need to copy "compare" to this function and forget it
-                // afterwards.
-                auto tmp = compare;
-
-                // We always want to return the old value at the end.
-                const auto old = ref.load();
-
-                // This returns a bool telling us if the exchange happened or not. Useless in this case.
-                ref.compare_exchange_strong(tmp, value);
-
-                return old;
+                auto expected_ = expected;
+                // Atomically compares the value of `ref` with the value of `expected`.
+                // If the values are equal, replaces the value of `ref` with `desired`.
+                // Otherwise updates `expected` with the value of `ref`.
+                // Returns a bool telling us if the exchange happened or not, but the Alpaka API does not make use of
+                // it.
+                ref.compare_exchange_strong(expected_, desired);
+
+                // If the update succeded, return the previous value of `ref`.
+                // Otherwise, return the current value of `ref`.
+                return expected_;
             };
 
             if(auto ptr = alpaka::detail::get_global_ptr(addr); ptr != nullptr)
diff --git a/include/alpaka/block/shared/dyn/BlockSharedMemDynGenericSycl.hpp b/include/alpaka/block/shared/dyn/BlockSharedMemDynGenericSycl.hpp
index d13b78249867..4e2af194ddcd 100644
--- a/include/alpaka/block/shared/dyn/BlockSharedMemDynGenericSycl.hpp
+++ b/include/alpaka/block/shared/dyn/BlockSharedMemDynGenericSycl.hpp
@@ -9,7 +9,7 @@
 #include <cstddef>
 
 #ifdef ALPAKA_ACC_SYCL_ENABLED
-#    include <CL/sycl.hpp>
+#    include <sycl/sycl.hpp>
 
 namespace alpaka
 {
@@ -35,9 +35,7 @@ namespace alpaka::trait
     {
         static auto getMem(BlockSharedMemDynGenericSycl const& shared) -> T*
         {
-            auto void_ptr = sycl::multi_ptr<void, sycl::access::address_space::local_space>{shared.m_accessor};
-            auto t_ptr = static_cast<sycl::multi_ptr<T, sycl::access::address_space::local_space>>(void_ptr);
-            return t_ptr.get();
+            return reinterpret_cast<T*>(shared.m_accessor.get_pointer().get());
         }
     };
 } // namespace alpaka::trait
diff --git a/include/alpaka/block/shared/st/BlockSharedMemStGenericSycl.hpp b/include/alpaka/block/shared/st/BlockSharedMemStGenericSycl.hpp
index baae7b0c8825..f92df9c051f0 100644
--- a/include/alpaka/block/shared/st/BlockSharedMemStGenericSycl.hpp
+++ b/include/alpaka/block/shared/st/BlockSharedMemStGenericSycl.hpp
@@ -7,11 +7,12 @@
 #include "alpaka/block/shared/st/Traits.hpp"
 #include "alpaka/block/shared/st/detail/BlockSharedMemStMemberImpl.hpp"
 
+#include <cstddef>
 #include <cstdint>
 
 #ifdef ALPAKA_ACC_SYCL_ENABLED
 
-#    include <CL/sycl.hpp>
+#    include <sycl/sycl.hpp>
 
 namespace alpaka
 {
diff --git a/include/alpaka/block/sync/BlockSyncGenericSycl.hpp b/include/alpaka/block/sync/BlockSyncGenericSycl.hpp
index 87d478eb9c19..67e97493fee4 100644
--- a/include/alpaka/block/sync/BlockSyncGenericSycl.hpp
+++ b/include/alpaka/block/sync/BlockSyncGenericSycl.hpp
@@ -8,7 +8,7 @@
 
 #ifdef ALPAKA_ACC_SYCL_ENABLED
 
-#    include <CL/sycl.hpp>
+#    include <sycl/sycl.hpp>
 
 namespace alpaka
 {
diff --git a/include/alpaka/core/BoostPredef.hpp b/include/alpaka/core/BoostPredef.hpp
index 39621e964b0f..9a54f33200f8 100644
--- a/include/alpaka/core/BoostPredef.hpp
+++ b/include/alpaka/core/BoostPredef.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2022 Benjamin Worpitz, Matthias Werner, Jan Stephan
+/* Copyright 2023 Benjamin Worpitz, Matthias Werner, Jan Stephan
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -68,3 +68,11 @@
 #    undef BOOST_COMP_PGI
 #    define BOOST_COMP_PGI BOOST_COMP_PGI_EMULATED
 #endif
+
+// Intel LLVM compiler detection
+#if !defined(BOOST_COMP_ICPX)
+#    if defined(SYCL_LANGUAGE_VERSION) && defined(__INTEL_LLVM_COMPILER)
+// The version string for icpx 2023.1.0 is 20230100. In Boost.Predef this becomes (53,1,0).
+#        define BOOST_COMP_ICPX BOOST_PREDEF_MAKE_YYYYMMDD(__INTEL_LLVM_COMPILER)
+#    endif
+#endif
diff --git a/include/alpaka/core/Common.hpp b/include/alpaka/core/Common.hpp
index 50b74df690be..6da0b9fbd620 100644
--- a/include/alpaka/core/Common.hpp
+++ b/include/alpaka/core/Common.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner
+/* Copyright 2023 Axel Hübl, Benjamin Worpitz, Matthias Werner, Jan Stephan, René Widera, Andrea Bocci
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -76,13 +76,12 @@
 //! Macro defining the inline function attribute.
 #if BOOST_LANG_CUDA || BOOST_LANG_HIP
 #    define ALPAKA_FN_INLINE __forceinline__
-#else
-#    if BOOST_COMP_MSVC || defined(BOOST_COMP_MSVC_EMULATED)
+#elif BOOST_COMP_MSVC || defined(BOOST_COMP_MSVC_EMULATED)
 // TODO: With C++20 [[msvc::forceinline]] can be used.
-#        define ALPAKA_FN_INLINE __forceinline
-#    else
-#        define ALPAKA_FN_INLINE [[gnu::always_inline]] inline
-#    endif
+#    define ALPAKA_FN_INLINE __forceinline
+#else
+// For gcc, clang, and clang-based compilers like Intel icpx
+#    define ALPAKA_FN_INLINE [[gnu::always_inline]] inline
 #endif
 
 //! This macro defines a variable lying in global accelerator device memory.
@@ -117,6 +116,8 @@
 #if((BOOST_LANG_CUDA && BOOST_COMP_CLANG_CUDA) || (BOOST_LANG_CUDA && BOOST_COMP_NVCC && BOOST_ARCH_PTX)              \
     || BOOST_LANG_HIP)
 #    define ALPAKA_STATIC_ACC_MEM_GLOBAL __device__
+#elif defined(ALPAKA_ACC_SYCL_ENABLED)
+#    define ALPAKA_STATIC_ACC_MEM_GLOBAL _Pragma("GCC error \"The SYCL backend does not support global device variables.\""))
 #else
 #    define ALPAKA_STATIC_ACC_MEM_GLOBAL
 #endif
@@ -153,6 +154,8 @@
 #if((BOOST_LANG_CUDA && BOOST_COMP_CLANG_CUDA) || (BOOST_LANG_CUDA && BOOST_COMP_NVCC && BOOST_ARCH_PTX)              \
     || BOOST_LANG_HIP)
 #    define ALPAKA_STATIC_ACC_MEM_CONSTANT __constant__
+#elif defined(ALPAKA_ACC_SYCL_ENABLED)
+#    define ALPAKA_STATIC_ACC_MEM_CONSTANT _Pragma("GCC error \"The SYCL backend does not support global device constants.\""))
 #else
 #    define ALPAKA_STATIC_ACC_MEM_CONSTANT
 #endif
diff --git a/include/alpaka/core/Sycl.hpp b/include/alpaka/core/Sycl.hpp
index cab73fe17f00..dbc8c1e0f748 100644
--- a/include/alpaka/core/Sycl.hpp
+++ b/include/alpaka/core/Sycl.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2022 Jan Stephan
+/* Copyright 2023 Jan Stephan, Luca Ferragina, Aurora Perego, Andrea Bocci
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -13,6 +13,7 @@
 
 #include <array>
 #include <cstddef>
+#include <cstdio> // the #define printf(...) breaks <cstdio> if it is included afterwards
 #include <iostream>
 #include <stdexcept>
 #include <string>
@@ -21,7 +22,36 @@
 
 #ifdef ALPAKA_ACC_SYCL_ENABLED
 
-#    include <CL/sycl.hpp>
+#    include <sycl/sycl.hpp>
+
+// if SYCL is enabled with the AMD backend the printf will be killed because of missing compiler support
+#    ifdef __AMDGCN__
+#        define printf(...)
+#    else
+
+#        ifdef __SYCL_DEVICE_ONLY__
+using AlpakaFormat = char const* [[clang::opencl_constant]];
+#        else
+using AlpakaFormat = char const*;
+#        endif
+
+#        if BOOST_COMP_CLANG
+#            pragma clang diagnostic push
+#            pragma clang diagnostic ignored "-Wgnu-zero-variadic-macro-arguments"
+#        endif
+
+#        define printf(FORMAT, ...)                                                                                   \
+            do                                                                                                        \
+            {                                                                                                         \
+                static auto const format = AlpakaFormat{FORMAT};                                                      \
+                sycl::ext::oneapi::experimental::printf(format, ##__VA_ARGS__);                                       \
+            } while(false)
+
+#        if BOOST_COMP_CLANG
+#            pragma clang diagnostic pop
+#        endif
+
+#    endif
 
 // SYCL vector types trait specializations.
 namespace alpaka
@@ -50,7 +80,6 @@ namespace alpaka
 
               // 2 component vector types
               sycl::char2,
-              sycl::schar2,
               sycl::uchar2,
               sycl::short2,
               sycl::ushort2,
@@ -58,15 +87,12 @@ namespace alpaka
               sycl::uint2,
               sycl::long2,
               sycl::ulong2,
-              sycl::longlong2,
-              sycl::ulonglong2,
               sycl::float2,
               sycl::double2,
               sycl::half2,
 
               // 3 component vector types
               sycl::char3,
-              sycl::schar3,
               sycl::uchar3,
               sycl::short3,
               sycl::ushort3,
@@ -74,15 +100,12 @@ namespace alpaka
               sycl::uint3,
               sycl::long3,
               sycl::ulong3,
-              sycl::longlong3,
-              sycl::ulonglong3,
               sycl::float3,
               sycl::double3,
               sycl::half3,
 
               // 4 component vector types
               sycl::char4,
-              sycl::schar4,
               sycl::uchar4,
               sycl::short4,
               sycl::ushort4,
@@ -90,15 +113,12 @@ namespace alpaka
               sycl::uint4,
               sycl::long4,
               sycl::ulong4,
-              sycl::longlong4,
-              sycl::ulonglong4,
               sycl::float4,
               sycl::double4,
               sycl::half4,
 
               // 8 component vector types
               sycl::char8,
-              sycl::schar8,
               sycl::uchar8,
               sycl::short8,
               sycl::ushort8,
@@ -106,15 +126,12 @@ namespace alpaka
               sycl::uint8,
               sycl::long8,
               sycl::ulong8,
-              sycl::longlong8,
-              sycl::ulonglong8,
               sycl::float8,
               sycl::double8,
               sycl::half8,
 
               // 16 component vector types
               sycl::char16,
-              sycl::schar16,
               sycl::uchar16,
               sycl::short16,
               sycl::ushort16,
@@ -122,8 +139,6 @@ namespace alpaka
               sycl::uint16,
               sycl::long16,
               sycl::ulong16,
-              sycl::longlong16,
-              sycl::ulonglong16,
               sycl::float16,
               sycl::double16,
               sycl::half16>
@@ -146,10 +161,7 @@ namespace alpaka::trait
     {
         using type = std::conditional_t<std::is_scalar_v<T>, T, typename T::element_type>;
     };
-} // namespace alpaka::trait
 
-namespace alpaka::trait
-{
     //! The SYCL vectors' extent get trait specialization.
     template<typename TExtent>
     struct GetExtent<DimInt<Dim<TExtent>::value>, TExtent, std::enable_if_t<IsSyclBuiltInType<TExtent>::value>>
diff --git a/include/alpaka/dev/DevCpuSyclIntel.hpp b/include/alpaka/dev/DevCpuSycl.hpp
similarity index 65%
rename from include/alpaka/dev/DevCpuSyclIntel.hpp
rename to include/alpaka/dev/DevCpuSycl.hpp
index 8add6869e3da..1a9909d31cd0 100644
--- a/include/alpaka/dev/DevCpuSyclIntel.hpp
+++ b/include/alpaka/dev/DevCpuSycl.hpp
@@ -1,17 +1,17 @@
-/* Copyright 2022 Jan Stephan
+/* Copyright 2023 Jan Stephan, Andrea Bocci
  * SPDX-License-Identifier: MPL-2.0
  */
 
 #pragma once
 
 #include "alpaka/dev/DevGenericSycl.hpp"
-#include "alpaka/pltf/PltfCpuSyclIntel.hpp"
+#include "alpaka/pltf/PltfCpuSycl.hpp"
 
 #if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_BACKEND_ONEAPI) && defined(ALPAKA_SYCL_ONEAPI_CPU)
 
 namespace alpaka
 {
-    using DevCpuSyclIntel = DevGenericSycl<PltfCpuSyclIntel>;
+    using DevCpuSycl = DevGenericSycl<PltfCpuSycl>;
 } // namespace alpaka
 
 #endif
diff --git a/include/alpaka/dev/DevGenericSycl.hpp b/include/alpaka/dev/DevGenericSycl.hpp
index 5892df450df3..85c218365f3d 100644
--- a/include/alpaka/dev/DevGenericSycl.hpp
+++ b/include/alpaka/dev/DevGenericSycl.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2022 Jan Stephan, Antonio Di Pilato
+/* Copyright 2023 Jan Stephan, Antonio Di Pilato, Luca Ferragina, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -27,7 +27,7 @@
 
 #ifdef ALPAKA_ACC_SYCL_ENABLED
 
-#    include <CL/sycl.hpp>
+#    include <sycl/sycl.hpp>
 
 namespace alpaka
 {
@@ -176,7 +176,12 @@ namespace alpaka::trait
         static auto getWarpSizes(DevGenericSycl<TPltf> const& dev) -> std::vector<std::size_t>
         {
             auto const device = dev.getNativeHandle().first;
-            return device.template get_info<sycl::info::device::sub_group_sizes>();
+            std::vector<std::size_t> warp_sizes = device.template get_info<sycl::info::device::sub_group_sizes>();
+            // The CPU runtime supports a sub-group size of 64, but the SYCL implementation currently does not
+            auto find64 = std::find(warp_sizes.begin(), warp_sizes.end(), 64);
+            if(find64 != warp_sizes.end())
+                warp_sizes.erase(find64);
+            return warp_sizes;
         }
     };
 
@@ -204,7 +209,7 @@ namespace alpaka::trait
     template<typename TElem, typename TDim, typename TIdx, typename TPltf>
     struct BufType<DevGenericSycl<TPltf>, TElem, TDim, TIdx>
     {
-        using type = BufGenericSycl<TElem, TDim, TIdx, DevGenericSycl<TPltf>>;
+        using type = BufGenericSycl<TElem, TDim, TIdx, TPltf>;
     };
 
     //! The SYCL device platform type trait specialization.
diff --git a/include/alpaka/event/EventCpuSyclIntel.hpp b/include/alpaka/event/EventCpuSycl.hpp
similarity index 65%
rename from include/alpaka/event/EventCpuSyclIntel.hpp
rename to include/alpaka/event/EventCpuSycl.hpp
index 21a752e28991..37e521484e93 100644
--- a/include/alpaka/event/EventCpuSyclIntel.hpp
+++ b/include/alpaka/event/EventCpuSycl.hpp
@@ -1,17 +1,17 @@
-/* Copyright 2022 Jan Stephan
+/* Copyright 2023 Jan Stephan, Andrea Bocci
  * SPDX-License-Identifier: MPL-2.0
  */
 
 #pragma once
 
-#include "alpaka/dev/DevCpuSyclIntel.hpp"
+#include "alpaka/dev/DevCpuSycl.hpp"
 #include "alpaka/event/EventGenericSycl.hpp"
 
 #if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_BACKEND_ONEAPI) && defined(ALPAKA_SYCL_ONEAPI_CPU)
 
 namespace alpaka
 {
-    using EventCpuSyclIntel = EventGenericSycl<DevCpuSyclIntel>;
+    using EventCpuSycl = EventGenericSycl<DevCpuSycl>;
 } // namespace alpaka
 
 #endif
diff --git a/include/alpaka/event/EventGenericSycl.hpp b/include/alpaka/event/EventGenericSycl.hpp
index 3e84baec3810..68011a0247cd 100644
--- a/include/alpaka/event/EventGenericSycl.hpp
+++ b/include/alpaka/event/EventGenericSycl.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2022 Jan Stephan, Antonio Di Pilato
+/* Copyright 2023 Jan Stephan, Antonio Di Pilato, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -17,7 +17,7 @@
 
 #ifdef ALPAKA_ACC_SYCL_ENABLED
 
-#    include <CL/sycl.hpp>
+#    include <sycl/sycl.hpp>
 
 namespace alpaka
 {
@@ -87,7 +87,7 @@ namespace alpaka::trait
     {
         static auto enqueue(QueueGenericSyclNonBlocking<TDev>& queue, EventGenericSycl<TDev>& event)
         {
-            event.setEvent(queue.m_impl->get_last_event());
+            event.setEvent(queue.m_spQueueImpl->get_last_event());
         }
     };
 
@@ -97,7 +97,7 @@ namespace alpaka::trait
     {
         static auto enqueue(QueueGenericSyclBlocking<TDev>& queue, EventGenericSycl<TDev>& event)
         {
-            event.setEvent(queue.m_impl->get_last_event());
+            event.setEvent(queue.m_spQueueImpl->get_last_event());
         }
     };
 
@@ -120,7 +120,7 @@ namespace alpaka::trait
     {
         static auto waiterWaitFor(QueueGenericSyclNonBlocking<TDev>& queue, EventGenericSycl<TDev> const& event)
         {
-            queue.m_impl->register_dependency(event.getNativeHandle());
+            queue.m_spQueueImpl->register_dependency(event.getNativeHandle());
         }
     };
 
@@ -130,7 +130,7 @@ namespace alpaka::trait
     {
         static auto waiterWaitFor(QueueGenericSyclBlocking<TDev>& queue, EventGenericSycl<TDev> const& event)
         {
-            queue.m_impl->register_dependency(event.getNativeHandle());
+            queue.m_spQueueImpl->register_dependency(event.getNativeHandle());
         }
     };
 
diff --git a/include/alpaka/example/ExampleDefaultAcc.hpp b/include/alpaka/example/ExampleDefaultAcc.hpp
index f90b3653c9f4..6ceb874dd708 100644
--- a/include/alpaka/example/ExampleDefaultAcc.hpp
+++ b/include/alpaka/example/ExampleDefaultAcc.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2023 Jeffrey Kelling, Bernhard Manfred Gruber, Jan Stephan
+/* Copyright 2023 Jeffrey Kelling, Bernhard Manfred Gruber, Jan Stephan, Aurora Perego, Andrea Bocci
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -24,16 +24,16 @@ namespace alpaka
     using ExampleDefaultAcc = alpaka::AccCpuOmp2Threads<TDim, TIdx>;
 #elif defined(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED)
     using ExampleDefaultAcc = alpaka::AccCpuThreads<TDim, TIdx>;
-#elif defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED)
-    using ExampleDefaultAcc = alpaka::AccCpuSerial<TDim, TIdx>;
 #elif defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_BACKEND_ONEAPI)
 #    if defined(ALPAKA_SYCL_ONEAPI_CPU)
-    using ExampleDefaultAcc = alpaka::AccCpuSyclIntel<TDim, TIdx>;
+    using ExampleDefaultAcc = alpaka::AccCpuSycl<TDim, TIdx>;
 #    elif defined(ALPAKA_SYCL_ONEAPI_FPGA)
     using ExampleDefaultAcc = alpaka::AccFpgaSyclIntel<TDim, TIdx>;
 #    elif defined(ALPAKA_SYCL_ONEAPI_GPU)
     using ExampleDefaultAcc = alpaka::AccGpuSyclIntel<TDim, TIdx>;
 #    endif
+#elif defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED)
+    using ExampleDefaultAcc = alpaka::AccCpuSerial<TDim, TIdx>;
 #else
     class ExampleDefaultAcc;
 #    warning "No supported backend selected."
diff --git a/include/alpaka/idx/bt/IdxBtGenericSycl.hpp b/include/alpaka/idx/bt/IdxBtGenericSycl.hpp
index 88628d343c18..54ef78014f1f 100644
--- a/include/alpaka/idx/bt/IdxBtGenericSycl.hpp
+++ b/include/alpaka/idx/bt/IdxBtGenericSycl.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2022 Jan Stephan
+/* Copyright 2023 Jan Stephan, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -12,7 +12,7 @@
 
 #ifdef ALPAKA_ACC_SYCL_ENABLED
 
-#    include <CL/sycl.hpp>
+#    include <sycl/sycl.hpp>
 
 namespace alpaka::bt
 {
@@ -23,11 +23,11 @@ namespace alpaka::bt
     public:
         using IdxBtBase = IdxBtGenericSycl;
 
-        explicit IdxBtGenericSycl(sycl::nd_item<TDim::value> work_item) : my_item{work_item}
+        explicit IdxBtGenericSycl(sycl::nd_item<TDim::value> work_item) : m_item_bt{work_item}
         {
         }
 
-        sycl::nd_item<TDim::value> my_item;
+        sycl::nd_item<TDim::value> m_item_bt;
     };
 } // namespace alpaka::bt
 
@@ -49,19 +49,19 @@ namespace alpaka::trait
         static auto getIdx(bt::IdxBtGenericSycl<TDim, TIdx> const& idx, TWorkDiv const&) -> Vec<TDim, TIdx>
         {
             if constexpr(TDim::value == 1)
-                return Vec<TDim, TIdx>{static_cast<TIdx>(idx.my_item.get_local_id(0))};
+                return Vec<TDim, TIdx>{static_cast<TIdx>(idx.m_item_bt.get_local_id(0))};
             else if constexpr(TDim::value == 2)
             {
                 return Vec<TDim, TIdx>{
-                    static_cast<TIdx>(idx.my_item.get_local_id(1)),
-                    static_cast<TIdx>(idx.my_item.get_local_id(0))};
+                    static_cast<TIdx>(idx.m_item_bt.get_local_id(1)),
+                    static_cast<TIdx>(idx.m_item_bt.get_local_id(0))};
             }
             else
             {
                 return Vec<TDim, TIdx>{
-                    static_cast<TIdx>(idx.my_item.get_local_id(2)),
-                    static_cast<TIdx>(idx.my_item.get_local_id(1)),
-                    static_cast<TIdx>(idx.my_item.get_local_id(0))};
+                    static_cast<TIdx>(idx.m_item_bt.get_local_id(2)),
+                    static_cast<TIdx>(idx.m_item_bt.get_local_id(1)),
+                    static_cast<TIdx>(idx.m_item_bt.get_local_id(0))};
             }
         }
     };
diff --git a/include/alpaka/idx/gb/IdxGbGenericSycl.hpp b/include/alpaka/idx/gb/IdxGbGenericSycl.hpp
index 8fa8a5bb0f35..42547effd6c3 100644
--- a/include/alpaka/idx/gb/IdxGbGenericSycl.hpp
+++ b/include/alpaka/idx/gb/IdxGbGenericSycl.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2022 Jan Stephan
+/* Copyright 2023 Jan Stephan, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -12,7 +12,7 @@
 
 #ifdef ALPAKA_ACC_SYCL_ENABLED
 
-#    include <CL/sycl.hpp>
+#    include <sycl/sycl.hpp>
 
 namespace alpaka::gb
 {
@@ -23,11 +23,11 @@ namespace alpaka::gb
     public:
         using IdxGbBase = IdxGbGenericSycl;
 
-        explicit IdxGbGenericSycl(sycl::nd_item<TDim::value> work_item) : my_item{work_item}
+        explicit IdxGbGenericSycl(sycl::nd_item<TDim::value> work_item) : m_item_gb{work_item}
         {
         }
 
-        sycl::nd_item<TDim::value> my_item;
+        sycl::nd_item<TDim::value> m_item_gb;
     };
 } // namespace alpaka::gb
 
@@ -49,19 +49,19 @@ namespace alpaka::trait
         static auto getIdx(gb::IdxGbGenericSycl<TDim, TIdx> const& idx, TWorkDiv const&)
         {
             if constexpr(TDim::value == 1)
-                return Vec<TDim, TIdx>(static_cast<TIdx>(idx.my_item.get_group(0)));
+                return Vec<TDim, TIdx>(static_cast<TIdx>(idx.m_item_gb.get_group(0)));
             else if constexpr(TDim::value == 2)
             {
                 return Vec<TDim, TIdx>(
-                    static_cast<TIdx>(idx.my_item.get_group(1)),
-                    static_cast<TIdx>(idx.my_item.get_group(0)));
+                    static_cast<TIdx>(idx.m_item_gb.get_group(1)),
+                    static_cast<TIdx>(idx.m_item_gb.get_group(0)));
             }
             else
             {
                 return Vec<TDim, TIdx>(
-                    static_cast<TIdx>(idx.my_item.get_group(2)),
-                    static_cast<TIdx>(idx.my_item.get_group(1)),
-                    static_cast<TIdx>(idx.my_item.get_group(0)));
+                    static_cast<TIdx>(idx.m_item_gb.get_group(2)),
+                    static_cast<TIdx>(idx.m_item_gb.get_group(1)),
+                    static_cast<TIdx>(idx.m_item_gb.get_group(0)));
             }
         }
     };
diff --git a/include/alpaka/intrinsic/IntrinsicGenericSycl.hpp b/include/alpaka/intrinsic/IntrinsicGenericSycl.hpp
index 700442fb4520..395043a9cd95 100644
--- a/include/alpaka/intrinsic/IntrinsicGenericSycl.hpp
+++ b/include/alpaka/intrinsic/IntrinsicGenericSycl.hpp
@@ -11,7 +11,7 @@
 
 #ifdef ALPAKA_ACC_SYCL_ENABLED
 
-#    include <CL/sycl.hpp>
+#    include <sycl/sycl.hpp>
 
 namespace alpaka
 {
diff --git a/include/alpaka/kernel/SyclSubgroupSize.hpp b/include/alpaka/kernel/SyclSubgroupSize.hpp
new file mode 100644
index 000000000000..b56b652c7b9e
--- /dev/null
+++ b/include/alpaka/kernel/SyclSubgroupSize.hpp
@@ -0,0 +1,97 @@
+/* Copyright 2023 Andrea Bocci, Aurora Perego
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#ifdef ALPAKA_ACC_SYCL_ENABLED
+
+#    ifdef __SYCL_DEVICE_ONLY__
+
+#        if defined(__SYCL_TARGET_INTEL_GPU_BDW__) || /* Broadwell Intel graphics architecture */                     \
+            defined(__SYCL_TARGET_INTEL_GPU_SKL__) || /* Skylake Intel graphics architecture */                       \
+            defined(__SYCL_TARGET_INTEL_GPU_KBL__) || /* Kaby Lake Intel graphics architecture */                     \
+            defined(__SYCL_TARGET_INTEL_GPU_CFL__) || /* Coffee Lake Intel graphics architecture */                   \
+            defined(__SYCL_TARGET_INTEL_GPU_APL__) || /* Apollo Lake Intel graphics architecture */                   \
+            defined(__SYCL_TARGET_INTEL_GPU_GLK__) || /* Gemini Lake Intel graphics architecture */                   \
+            defined(__SYCL_TARGET_INTEL_GPU_WHL__) || /* Whiskey Lake Intel graphics architecture */                  \
+            defined(__SYCL_TARGET_INTEL_GPU_AML__) || /* Amber Lake Intel graphics architecture */                    \
+            defined(__SYCL_TARGET_INTEL_GPU_CML__) || /* Comet Lake Intel graphics architecture */                    \
+            defined(__SYCL_TARGET_INTEL_GPU_ICLLP__) || /* Ice Lake Intel graphics architecture */                    \
+            defined(__SYCL_TARGET_INTEL_GPU_TGLLP__) || /* Tiger Lake Intel graphics architecture */                  \
+            defined(__SYCL_TARGET_INTEL_GPU_RKL__) || /* Rocket Lake Intel graphics architecture */                   \
+            defined(__SYCL_TARGET_INTEL_GPU_ADL_S__) || /* Alder Lake S Intel graphics architecture */                \
+            defined(__SYCL_TARGET_INTEL_GPU_RPL_S__) || /* Raptor Lake Intel graphics architecture */                 \
+            defined(__SYCL_TARGET_INTEL_GPU_ADL_P__) || /* Alder Lake P Intel graphics architecture */                \
+            defined(__SYCL_TARGET_INTEL_GPU_ADL_N__) || /* Alder Lake N Intel graphics architecture */                \
+            defined(__SYCL_TARGET_INTEL_GPU_DG1__) || /* DG1 Intel graphics architecture */                           \
+            defined(__SYCL_TARGET_INTEL_GPU_ACM_G10__) || /* Alchemist G10 Intel graphics architecture */             \
+            defined(__SYCL_TARGET_INTEL_GPU_ACM_G11__) || /* Alchemist G11 Intel graphics architecture */             \
+            defined(__SYCL_TARGET_INTEL_GPU_ACM_G12__) /* Alchemist G12 Intel graphics architecture */
+
+#            define SYCL_SUBGROUP_SIZE (8 | 16 | 32)
+
+#        elif defined(__SYCL_TARGET_INTEL_GPU_PVC__) /* Ponte Vecchio Intel graphics architecture */
+
+#            define SYCL_SUBGROUP_SIZE (16 | 32)
+
+#        elif defined(__SYCL_TARGET_INTEL_X86_64__) /* generate code ahead of time for x86_64 CPUs */
+
+#            define SYCL_SUBGROUP_SIZE (4 | 8 | 16 | 32 | 64)
+
+#        elif defined(__SYCL_TARGET_NVIDIA_GPU_SM_50__) || /* NVIDIA Maxwell architecture (compute capability 5.0) */ \
+            defined(__SYCL_TARGET_NVIDIA_GPU_SM_52__) || /* NVIDIA Maxwell architecture (compute capability 5.2) */   \
+            defined(__SYCL_TARGET_NVIDIA_GPU_SM_53__) || /* NVIDIA Jetson TX1 / Nano (compute capability 5.3) */      \
+            defined(__SYCL_TARGET_NVIDIA_GPU_SM_60__) || /* NVIDIA Pascal architecture (compute capability 6.0) */    \
+            defined(__SYCL_TARGET_NVIDIA_GPU_SM_61__) || /* NVIDIA Pascal architecture (compute capability 6.1) */    \
+            defined(__SYCL_TARGET_NVIDIA_GPU_SM_62__) || /* NVIDIA Jetson TX2 (compute capability 6.2) */             \
+            defined(__SYCL_TARGET_NVIDIA_GPU_SM_70__) || /* NVIDIA Volta architecture (compute capability 7.0) */     \
+            defined(__SYCL_TARGET_NVIDIA_GPU_SM_72__) || /* NVIDIA Jetson AGX (compute capability 7.2) */             \
+            defined(__SYCL_TARGET_NVIDIA_GPU_SM_75__) || /* NVIDIA Turing architecture (compute capability 7.5) */    \
+            defined(__SYCL_TARGET_NVIDIA_GPU_SM_80__) || /* NVIDIA Ampere architecture (compute capability 8.0) */    \
+            defined(__SYCL_TARGET_NVIDIA_GPU_SM_86__) || /* NVIDIA Ampere architecture (compute capability 8.6) */    \
+            defined(__SYCL_TARGET_NVIDIA_GPU_SM_87__) || /* NVIDIA Jetson/Drive AGX Orin (compute capability 8.7) */  \
+            defined(__SYCL_TARGET_NVIDIA_GPU_SM_89__) || /* NVIDIA Ada Lovelace arch. (compute capability 8.9) */     \
+            defined(__SYCL_TARGET_NVIDIA_GPU_SM_90__) /* NVIDIA Hopper architecture (compute capability 9.0) */
+
+#            define SYCL_SUBGROUP_SIZE (32)
+
+#        elif defined(__SYCL_TARGET_AMD_GPU_GFX700__) || /* AMD GCN 2.0 Sea Islands architecture (gfx 7.0) */         \
+            defined(__SYCL_TARGET_AMD_GPU_GFX701__) || /* AMD GCN 2.0 Sea Islands architecture (gfx 7.0) */           \
+            defined(__SYCL_TARGET_AMD_GPU_GFX702__) || /* AMD GCN 2.0 Sea Islands architecture (gfx 7.0) */           \
+            defined(__SYCL_TARGET_AMD_GPU_GFX801__) || /* AMD GCN 3.0 Volcanic Islands architecture (gfx 8.0) */      \
+            defined(__SYCL_TARGET_AMD_GPU_GFX802__) || /* AMD GCN 3.0 Volcanic Islands architecture (gfx 8.0) */      \
+            defined(__SYCL_TARGET_AMD_GPU_GFX803__) || /* AMD GCN 4.0 Arctic Islands architecture (gfx 8.0) */        \
+            defined(__SYCL_TARGET_AMD_GPU_GFX805__) || /* AMD GCN 3.0 Volcanic Islands architecture (gfx 8.0) */      \
+            defined(__SYCL_TARGET_AMD_GPU_GFX810__) || /* AMD GCN 3.0 Volcanic Islands architecture (gfx 8.1) */      \
+            defined(__SYCL_TARGET_AMD_GPU_GFX900__) || /* AMD GCN 5.0 Vega architecture (gfx 9.0) */                  \
+            defined(__SYCL_TARGET_AMD_GPU_GFX902__) || /* AMD GCN 5.0 Vega architecture (gfx 9.0) */                  \
+            defined(__SYCL_TARGET_AMD_GPU_GFX904__) || /* AMD GCN 5.0 Vega architecture (gfx 9.0) */                  \
+            defined(__SYCL_TARGET_AMD_GPU_GFX906__) || /* AMD GCN 5.1 Vega II architecture (gfx 9.0) */               \
+            defined(__SYCL_TARGET_AMD_GPU_GFX908__) || /* AMD CDNA 1.0 Arcturus architecture (gfx 9.0) */             \
+            defined(__SYCL_TARGET_AMD_GPU_GFX90A__) /* AMD CDNA 2.0 Aldebaran architecture (gfx 9.0) */
+
+#            define SYCL_SUBGROUP_SIZE (64)
+
+#        elif defined(__SYCL_TARGET_AMD_GPU_GFX1010__) || /* AMD RDNA 1.0 Navi 10 architecture (gfx 10.1) */          \
+            defined(__SYCL_TARGET_AMD_GPU_GFX1011__) || /* AMD RDNA 1.0 Navi 12 architecture (gfx 10.1) */            \
+            defined(__SYCL_TARGET_AMD_GPU_GFX1012__) || /* AMD RDNA 1.0 Navi 14 architecture (gfx 10.1) */            \
+            defined(__SYCL_TARGET_AMD_GPU_GFX1013__) || /* AMD RDNA 2.0 Oberon architecture (gfx 10.1) */             \
+            defined(__SYCL_TARGET_AMD_GPU_GFX1030__) || /* AMD RDNA 2.0 Navi 21 architecture (gfx 10.3) */            \
+            defined(__SYCL_TARGET_AMD_GPU_GFX1031__) || /* AMD RDNA 2.0 Navi 22 architecture (gfx 10.3) */            \
+            defined(__SYCL_TARGET_AMD_GPU_GFX1032__) || /* AMD RDNA 2.0 Navi 23 architecture (gfx 10.3) */            \
+            defined(__SYCL_TARGET_AMD_GPU_GFX1034__) /* AMD RDNA 2.0 Navi 24 architecture (gfx 10.3) */
+
+#            define SYCL_SUBGROUP_SIZE (32 | 64)
+
+#        else // __SYCL_TARGET_*
+
+#            define SYCL_SUBGROUP_SIZE (0) /* unknown target */
+
+#        endif // __SYCL_TARGET_*
+
+#    else
+
+#        define SYCL_SUBGROUP_SIZE (0) /* host compilation */
+
+#    endif // __SYCL_DEVICE_ONLY__
+
+#endif // ALPAKA_ACC_SYCL_ENABLED
diff --git a/include/alpaka/kernel/TaskKernelCpuSyclIntel.hpp b/include/alpaka/kernel/TaskKernelCpuSycl.hpp
similarity index 67%
rename from include/alpaka/kernel/TaskKernelCpuSyclIntel.hpp
rename to include/alpaka/kernel/TaskKernelCpuSycl.hpp
index 18b67d51a8e1..abb8c9a81d03 100644
--- a/include/alpaka/kernel/TaskKernelCpuSyclIntel.hpp
+++ b/include/alpaka/kernel/TaskKernelCpuSycl.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2022 Jan Stephan
+/* Copyright 2023 Jan Stephan, Luca Ferragina, Andrea Bocci
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -11,11 +11,10 @@
 namespace alpaka
 {
     template<typename TDim, typename TIdx>
-    class AccCpuSyclIntel;
+    class AccCpuSycl;
 
     template<typename TDim, typename TIdx, typename TKernelFnObj, typename... TArgs>
-    using TaskKernelCpuSyclIntel
-        = TaskKernelGenericSycl<AccCpuSyclIntel<TDim, TIdx>, TDim, TIdx, TKernelFnObj, TArgs...>;
+    using TaskKernelCpuSycl = TaskKernelGenericSycl<AccCpuSycl<TDim, TIdx>, TDim, TIdx, TKernelFnObj, TArgs...>;
 } // namespace alpaka
 
 #endif
diff --git a/include/alpaka/kernel/TaskKernelGenericSycl.hpp b/include/alpaka/kernel/TaskKernelGenericSycl.hpp
index 8dfb20fc6d92..92eab6fa5221 100644
--- a/include/alpaka/kernel/TaskKernelGenericSycl.hpp
+++ b/include/alpaka/kernel/TaskKernelGenericSycl.hpp
@@ -1,10 +1,11 @@
-/* Copyright 2023 Jan Stephan, Andrea Bocci
+/* Copyright 2023 Jan Stephan, Andrea Bocci, Luca Ferragina, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
 #pragma once
 
 #include "alpaka/acc/Traits.hpp"
+#include "alpaka/block/shared/dyn/BlockSharedDynMemberAllocKiB.hpp"
 #include "alpaka/core/BoostPredef.hpp"
 #include "alpaka/core/STLTuple/STLTuple.hpp"
 #include "alpaka/core/Sycl.hpp"
@@ -12,6 +13,7 @@
 #include "alpaka/dev/Traits.hpp"
 #include "alpaka/dim/Traits.hpp"
 #include "alpaka/idx/Traits.hpp"
+#include "alpaka/kernel/SyclSubgroupSize.hpp"
 #include "alpaka/kernel/Traits.hpp"
 #include "alpaka/mem/buf/sycl/Accessor.hpp"
 #include "alpaka/pltf/Traits.hpp"
@@ -28,7 +30,71 @@
 
 #ifdef ALPAKA_ACC_SYCL_ENABLED
 
-#    include <CL/sycl.hpp>
+#    if BOOST_COMP_CLANG
+#        pragma clang diagnostic push
+#        pragma clang diagnostic ignored "-Wunused-lambda-capture"
+#        pragma clang diagnostic ignored "-Wunused-parameter"
+#    endif
+
+#    include <sycl/sycl.hpp>
+
+#    define LAUNCH_SYCL_KERNEL_IF_SUBGROUP_SIZE_IS(sub_group_size)                                                    \
+        cgh.parallel_for(                                                                                             \
+            sycl::nd_range<TDim::value>{global_size, local_size},                                                     \
+            [item_elements,                                                                                           \
+             dyn_shared_accessor,                                                                                     \
+             st_shared_accessor,                                                                                      \
+             global_fence_dummy,                                                                                      \
+             local_fence_dummy,                                                                                       \
+             k_func,                                                                                                  \
+             k_args](sycl::nd_item<TDim::value> work_item) [[intel::reqd_sub_group_size(sub_group_size)]]             \
+            {                                                                                                         \
+                auto acc = TAcc{                                                                                      \
+                    item_elements,                                                                                    \
+                    work_item,                                                                                        \
+                    dyn_shared_accessor,                                                                              \
+                    st_shared_accessor,                                                                               \
+                    global_fence_dummy,                                                                               \
+                    local_fence_dummy};                                                                               \
+                core::apply(                                                                                          \
+                    [k_func, &acc](typename std::decay_t<TArgs> const&... args) { k_func(acc, args...); },            \
+                    k_args);                                                                                          \
+            });
+
+#    define LAUNCH_SYCL_KERNEL_WITH_DEFAULT_SUBGROUP_SIZE                                                             \
+        cgh.parallel_for(                                                                                             \
+            sycl::nd_range<TDim::value>{global_size, local_size},                                                     \
+            [item_elements,                                                                                           \
+             dyn_shared_accessor,                                                                                     \
+             st_shared_accessor,                                                                                      \
+             global_fence_dummy,                                                                                      \
+             local_fence_dummy,                                                                                       \
+             k_func,                                                                                                  \
+             k_args](sycl::nd_item<TDim::value> work_item)                                                            \
+            {                                                                                                         \
+                auto acc = TAcc{                                                                                      \
+                    item_elements,                                                                                    \
+                    work_item,                                                                                        \
+                    dyn_shared_accessor,                                                                              \
+                    st_shared_accessor,                                                                               \
+                    global_fence_dummy,                                                                               \
+                    local_fence_dummy};                                                                               \
+                core::apply(                                                                                          \
+                    [k_func, &acc](typename std::decay_t<TArgs> const&... args) { k_func(acc, args...); },            \
+                    k_args);                                                                                          \
+            });
+
+#    define THROW_AND_LAUNCH_EMPTY_SYCL_KERNEL                                                                        \
+        throw sycl::exception(sycl::make_error_code(sycl::errc::kernel_not_supported));                               \
+        cgh.parallel_for(                                                                                             \
+            sycl::nd_range<TDim::value>{global_size, local_size},                                                     \
+            [item_elements,                                                                                           \
+             dyn_shared_accessor,                                                                                     \
+             st_shared_accessor,                                                                                      \
+             global_fence_dummy,                                                                                      \
+             local_fence_dummy,                                                                                       \
+             k_func,                                                                                                  \
+             k_args](sycl::nd_item<TDim::value> work_item) {});
 
 namespace alpaka::detail
 {
@@ -48,9 +114,8 @@ namespace alpaka::detail
     template<typename TElem, typename TIdx, std::size_t TDim, typename TAccessModes>
     inline auto require(
         sycl::handler& cgh,
-        experimental::
-            Accessor<detail::SyclAccessor<TElem, DimInt<TDim>::value, TAccessModes>, TElem, TIdx, TDim, TAccessModes>
-                acc,
+        experimental::Accessor<SyclAccessor<TElem, DimInt<TDim>::value, TAccessModes>, TElem, TIdx, TDim, TAccessModes>
+            acc,
         special)
     {
         cgh.require(acc.m_accessor);
@@ -120,48 +185,87 @@ namespace alpaka
             auto k_func = m_kernelFnObj;
             auto k_args = m_args;
 
-#    ifdef ALPAKA_SYCL_IOSTREAM_ENABLED
-            // Set up device-side printing with (user-chosen value) KiB per block for the output buffer.
-            constexpr auto buf_size = std::size_t{ALPAKA_SYCL_IOSTREAM_KIB * 1024};
-            auto buf_per_work_item = std::size_t{};
-            if constexpr(TDim::value == 1)
-                buf_per_work_item = buf_size / static_cast<std::size_t>(group_items[0]);
-            else if constexpr(TDim::value == 2)
-                buf_per_work_item = buf_size / static_cast<std::size_t>(group_items[0] * group_items[1]);
-            else
-                buf_per_work_item
-                    = buf_size / static_cast<std::size_t>(group_items[0] * group_items[1] * group_items[2]);
-
-            assert(buf_per_work_item > 0);
+            constexpr std::size_t sub_group_size = trait::warpSize<TKernelFnObj, TAcc>;
+            bool supported = false;
 
-            auto output_stream = sycl::stream{buf_size, buf_per_work_item, cgh};
-#    endif
-            cgh.parallel_for<detail::kernel<TAcc, TKernelFnObj, TArgs...>>(
-                sycl::nd_range<TDim::value>{global_size, local_size},
-                [=](sycl::nd_item<TDim::value> work_item)
-                {
-#    ifdef ALPAKA_SYCL_IOSTREAM_ENABLED
-                    auto acc = TAcc{
-                        item_elements,
-                        work_item,
-                        dyn_shared_accessor,
-                        st_shared_accessor,
-                        global_fence_dummy,
-                        local_fence_dummy,
-                        output_stream};
+            if constexpr(sub_group_size == 0)
+            {
+                // no explicit subgroup size requirement
+                LAUNCH_SYCL_KERNEL_WITH_DEFAULT_SUBGROUP_SIZE
+                supported = true;
+            }
+            else
+            {
+#    if(SYCL_SUBGROUP_SIZE == 0)
+                // no explicit SYCL target, assume JIT compilation
+                LAUNCH_SYCL_KERNEL_IF_SUBGROUP_SIZE_IS(sub_group_size)
+                supported = true;
 #    else
-                    auto acc = TAcc{
-                        item_elements,
-                        work_item,
-                        dyn_shared_accessor,
-                        st_shared_accessor,
-                        global_fence_dummy,
-                        local_fence_dummy};
+                // check if the kernel should be launched with a subgroup size of 4
+                if constexpr(sub_group_size == 4)
+                {
+#        if(SYCL_SUBGROUP_SIZE & 4)
+                    LAUNCH_SYCL_KERNEL_IF_SUBGROUP_SIZE_IS(4)
+                    supported = true;
+#        else
+                    // empty kernel, required to keep SYCL happy
+                    THROW_AND_LAUNCH_EMPTY_SYCL_KERNEL
+#        endif
+                }
+
+                // check if the kernel should be launched with a subgroup size of 8
+                if constexpr(sub_group_size == 8)
+                {
+#        if(SYCL_SUBGROUP_SIZE & 8)
+                    LAUNCH_SYCL_KERNEL_IF_SUBGROUP_SIZE_IS(8)
+                    supported = true;
+#        else
+                    // empty kernel, required to keep SYCL happy
+                    THROW_AND_LAUNCH_EMPTY_SYCL_KERNEL
+#        endif
+                }
+
+                // check if the kernel should be launched with a subgroup size of 16
+                if constexpr(sub_group_size == 16)
+                {
+#        if(SYCL_SUBGROUP_SIZE & 16)
+                    LAUNCH_SYCL_KERNEL_IF_SUBGROUP_SIZE_IS(16)
+                    supported = true;
+#        else
+                    // empty kernel, required to keep SYCL happy
+                    THROW_AND_LAUNCH_EMPTY_SYCL_KERNEL
+#        endif
+                }
+
+                // check if the kernel should be launched with a subgroup size of 32
+                if constexpr(sub_group_size == 32)
+                {
+#        if(SYCL_SUBGROUP_SIZE & 32)
+                    LAUNCH_SYCL_KERNEL_IF_SUBGROUP_SIZE_IS(32)
+                    supported = true;
+#        else
+                    // empty kernel, required to keep SYCL happy
+                    THROW_AND_LAUNCH_EMPTY_SYCL_KERNEL
+#        endif
+                }
+
+                // check if the kernel should be launched with a subgroup size of 64
+                if constexpr(sub_group_size == 64)
+                {
+#        if(SYCL_SUBGROUP_SIZE & 64)
+                    LAUNCH_SYCL_KERNEL_IF_SUBGROUP_SIZE_IS(64)
+                    supported = true;
+#        else
+                    // empty kernel, required to keep SYCL happy
+                    THROW_AND_LAUNCH_EMPTY_SYCL_KERNEL
+#        endif
+                }
 #    endif
-                    core::apply(
-                        [k_func, &acc](typename std::decay_t<TArgs> const&... args) { k_func(acc, args...); },
-                        k_args);
-                });
+
+                // this subgroup size is not supported, raise an exception
+                if(not supported)
+                    throw sycl::exception(sycl::make_error_code(sycl::errc::kernel_not_supported));
+            }
         }
 
         static constexpr auto is_sycl_task = true;
@@ -203,8 +307,13 @@ namespace alpaka
         TKernelFnObj m_kernelFnObj;
         core::Tuple<std::decay_t<TArgs>...> m_args;
     };
+
 } // namespace alpaka
 
+#    if BOOST_COMP_CLANG
+#        pragma clang diagnostic pop
+#    endif
+
 namespace alpaka::trait
 {
     //! The SYCL execution task accelerator type trait specialization.
@@ -243,4 +352,6 @@ namespace alpaka::trait
     };
 } // namespace alpaka::trait
 
+#    undef LAUNCH_SYCL_KERNEL_IF_SUBGROUP_SIZE_IS
+
 #endif
diff --git a/include/alpaka/kernel/Traits.hpp b/include/alpaka/kernel/Traits.hpp
index 33032cdb5c57..384b82873759 100644
--- a/include/alpaka/kernel/Traits.hpp
+++ b/include/alpaka/kernel/Traits.hpp
@@ -1,4 +1,5 @@
-/* Copyright 2022 Axel Huebl, Benjamin Worpitz, René Widera, Sergei Bastrakov, Jan Stephan, Bernhard Manfred Gruber
+/* Copyright 2023 Axel Huebl, Benjamin Worpitz, René Widera, Sergei Bastrakov, Jan Stephan, Bernhard Manfred Gruber,
+ *                Andrea Bocci, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -68,6 +69,21 @@ namespace alpaka
             }
         };
 
+        //! The trait for getting the warp size required by a kernel.
+        //!
+        //! \tparam TKernelFnObj The kernel function object.
+        //! \tparam TAcc The accelerator.
+        //!
+        //! The default implementation returns 0, which lets the accelerator compiler and runtime choose the warp size.
+        template<typename TKernelFnObj, typename TAcc, typename TSfinae = void>
+        struct WarpSize : std::integral_constant<std::uint32_t, 0>
+        {
+        };
+
+        //! This is a shortcut for the trait defined above
+        template<typename TKernelFnObj, typename TAcc>
+        inline constexpr std::uint32_t warpSize = WarpSize<TKernelFnObj, TAcc>::value;
+
         //! The trait for getting the schedule to use when a kernel is run using the CpuOmp2Blocks accelerator.
         //!
         //! Has no effect on other accelerators.
diff --git a/include/alpaka/math/MathGenericSycl.hpp b/include/alpaka/math/MathGenericSycl.hpp
index cf7233826741..53898f210680 100644
--- a/include/alpaka/math/MathGenericSycl.hpp
+++ b/include/alpaka/math/MathGenericSycl.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2022 Jan Stephan
+/* Copyright 2023 Jan Stephan, Sergei Bastrakov, René Widera, Luca Ferragina, Andrea Bocci
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -12,7 +12,7 @@
 
 #ifdef ALPAKA_ACC_SYCL_ENABLED
 
-#    include <CL/sycl.hpp>
+#    include <sycl/sycl.hpp>
 
 //! The mathematical operation specifics.
 namespace alpaka::math
@@ -279,7 +279,7 @@ namespace alpaka::math::trait
             if constexpr(std::is_integral_v<TArgument>)
                 return sycl::atan2(0.0, static_cast<double>(argument));
             else if constexpr(std::is_floating_point_v<TArgument>)
-                return sycl::atan2(TArgument{0.0}, argument);
+                return sycl::atan2(static_cast<TArgument>(0.0), argument);
             else
                 static_assert(!sizeof(TArgument), "Unsupported data type");
         }
@@ -333,9 +333,11 @@ namespace alpaka::math::trait
         Tx,
         std::enable_if_t<std::is_floating_point_v<Ty> && std::is_floating_point_v<Tx>>>
     {
+        using TCommon = std::common_type_t<Ty, Tx>;
+
         auto operator()(math::Atan2GenericSycl const&, Ty const& y, Tx const& x)
         {
-            return sycl::atan2(y, x);
+            return sycl::atan2(static_cast<TCommon>(y), static_cast<TCommon>(x));
         }
     };
 
@@ -432,9 +434,11 @@ namespace alpaka::math::trait
         Ty,
         std::enable_if_t<std::is_floating_point_v<Tx> && std::is_floating_point_v<Ty>>>
     {
+        using TCommon = std::common_type_t<Tx, Ty>;
+
         auto operator()(math::FmodGenericSycl const&, Tx const& x, Ty const& y)
         {
-            return sycl::fmod(x, y);
+            return sycl::fmod(static_cast<TCommon>(x), static_cast<TCommon>(y));
         }
     };
 
@@ -444,7 +448,7 @@ namespace alpaka::math::trait
     {
         auto operator()(math::IsfiniteGenericSycl const&, TArg const& arg)
         {
-            return sycl::isfinite(arg);
+            return static_cast<bool>(sycl::isfinite(arg));
         }
     };
 
@@ -454,7 +458,7 @@ namespace alpaka::math::trait
     {
         auto operator()(math::IsinfGenericSycl const&, TArg const& arg)
         {
-            return sycl::isinf(arg);
+            return static_cast<bool>(sycl::isinf(arg));
         }
     };
 
@@ -464,7 +468,7 @@ namespace alpaka::math::trait
     {
         auto operator()(math::IsnanGenericSycl const&, TArg const& arg)
         {
-            return sycl::isnan(arg);
+            return static_cast<bool>(sycl::isnan(arg));
         }
     };
 
@@ -482,18 +486,20 @@ namespace alpaka::math::trait
     template<typename Tx, typename Ty>
     struct Max<math::MaxGenericSycl, Tx, Ty, std::enable_if_t<std::is_arithmetic_v<Tx> && std::is_arithmetic_v<Ty>>>
     {
+        using TCommon = std::common_type_t<Tx, Ty>;
+
         auto operator()(math::MaxGenericSycl const&, Tx const& x, Ty const& y)
         {
             if constexpr(std::is_integral_v<Tx> && std::is_integral_v<Ty>)
-                return sycl::max(x, y);
+                return sycl::max(static_cast<TCommon>(x), static_cast<TCommon>(y));
             else if constexpr(std::is_floating_point_v<Tx> && std::is_floating_point_v<Ty>)
-                return sycl::fmax(x, y);
+                return sycl::fmax(static_cast<TCommon>(x), static_cast<TCommon>(y));
             else if constexpr(
                 (std::is_floating_point_v<Tx> && std::is_integral_v<Ty>)
                 || (std::is_integral_v<Tx> && std::is_floating_point_v<Ty>) )
                 return sycl::fmax(static_cast<double>(x), static_cast<double>(y)); // mirror CUDA back-end
             else
-                static_assert(!sizeof(Tx), "Unsupported data type");
+                static_assert(!sizeof(Tx), "Unsupported data types");
         }
     };
 
@@ -512,7 +518,7 @@ namespace alpaka::math::trait
                 || (std::is_integral_v<Tx> && std::is_floating_point_v<Ty>) )
                 return sycl::fmin(static_cast<double>(x), static_cast<double>(y)); // mirror CUDA back-end
             else
-                static_assert(!sizeof(Tx), "Unsupported data type");
+                static_assert(!sizeof(Tx), "Unsupported data types");
         }
     };
 
@@ -524,9 +530,11 @@ namespace alpaka::math::trait
         TExp,
         std::enable_if_t<std::is_floating_point_v<TBase> && std::is_floating_point_v<TExp>>>
     {
+        using TCommon = std::common_type_t<TBase, TExp>;
+
         auto operator()(math::PowGenericSycl const&, TBase const& base, TExp const& exp)
         {
-            return sycl::pow(base, exp);
+            return sycl::pow(static_cast<TCommon>(base), static_cast<TCommon>(exp));
         }
     };
 
@@ -538,9 +546,11 @@ namespace alpaka::math::trait
         Ty,
         std::enable_if_t<std::is_floating_point_v<Tx> && std::is_floating_point_v<Ty>>>
     {
+        using TCommon = std::common_type_t<Tx, Ty>;
+
         auto operator()(math::RemainderGenericSycl const&, Tx const& x, Ty const& y)
         {
-            return sycl::remainder(x, y);
+            return sycl::remainder(static_cast<TCommon>(x), static_cast<TCommon>(y));
         }
     };
 
@@ -580,9 +590,9 @@ namespace alpaka::math::trait
     {
         auto operator()(math::RsqrtGenericSycl const&, TArg const& arg)
         {
-            if(std::is_floating_point_v<TArg>)
+            if constexpr(std::is_floating_point_v<TArg>)
                 return sycl::rsqrt(arg);
-            else if(std::is_integral_v<TArg>)
+            else if constexpr(std::is_integral_v<TArg>)
                 return sycl::rsqrt(static_cast<double>(arg)); // mirror CUDA back-end and use double for ints
             else
                 static_assert(!sizeof(TArg), "Unsupported data type");
diff --git a/include/alpaka/mem/buf/BufCpuSyclIntel.hpp b/include/alpaka/mem/buf/BufCpuSycl.hpp
similarity index 64%
rename from include/alpaka/mem/buf/BufCpuSyclIntel.hpp
rename to include/alpaka/mem/buf/BufCpuSycl.hpp
index 967dbc8141a4..1dbdc9f3a2ad 100644
--- a/include/alpaka/mem/buf/BufCpuSyclIntel.hpp
+++ b/include/alpaka/mem/buf/BufCpuSycl.hpp
@@ -1,10 +1,10 @@
-/* Copyright 2022 Jan Stephan
+/* Copyright 2023 Jan Stephan, Luca Ferragina, Andrea Bocci
  * SPDX-License-Identifier: MPL-2.0
  */
 
 #pragma once
 
-#include "alpaka/dev/DevCpuSyclIntel.hpp"
+#include "alpaka/dev/DevCpuSycl.hpp"
 #include "alpaka/mem/buf/BufGenericSycl.hpp"
 
 #if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_BACKEND_ONEAPI) && defined(ALPAKA_SYCL_ONEAPI_CPU)
@@ -12,7 +12,7 @@
 namespace alpaka
 {
     template<typename TElem, typename TDim, typename TIdx>
-    using BufCpuSyclIntel = BufGenericSycl<TElem, TDim, TIdx, DevCpuSyclIntel>;
+    using BufCpuSycl = BufGenericSycl<TElem, TDim, TIdx, PltfCpuSycl>;
 }
 
 #endif
diff --git a/include/alpaka/mem/buf/BufGenericSycl.hpp b/include/alpaka/mem/buf/BufGenericSycl.hpp
index 825fad4e37f1..c8849cedf7dd 100644
--- a/include/alpaka/mem/buf/BufGenericSycl.hpp
+++ b/include/alpaka/mem/buf/BufGenericSycl.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2022 Jan Stephan
+/* Copyright 2023 Jan Stephan, Luca Ferragina, Aurora Perego, Andrea Bocci
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -12,6 +12,7 @@
 #include "alpaka/mem/buf/BufCpu.hpp"
 #include "alpaka/mem/buf/Traits.hpp"
 #include "alpaka/mem/view/Accessor.hpp"
+#include "alpaka/mem/view/ViewAccessOps.hpp"
 #include "alpaka/vec/Vec.hpp"
 
 #include <memory>
@@ -19,27 +20,27 @@
 
 #ifdef ALPAKA_ACC_SYCL_ENABLED
 
-#    include <CL/sycl.hpp>
+#    include <sycl/sycl.hpp>
 
 namespace alpaka
 {
     //! The SYCL memory buffer.
-    template<typename TElem, typename TDim, typename TIdx, typename TDev>
-    class BufGenericSycl
+    template<typename TElem, typename TDim, typename TIdx, typename TPltf>
+    class BufGenericSycl : public internal::ViewAccessOps<BufGenericSycl<TElem, TDim, TIdx, TPltf>>
     {
+    public:
         static_assert(
             !std::is_const_v<TElem>,
             "The elem type of the buffer can not be const because the C++ Standard forbids containers of const "
             "elements!");
         static_assert(!std::is_const_v<TIdx>, "The idx type of the buffer can not be const!");
 
-    public:
         //! Constructor
-        template<typename TExtent>
-        BufGenericSycl(TDev const& dev, sycl::buffer<TElem, TDim::value> buffer, TExtent const& extent)
+        template<typename TExtent, typename Deleter>
+        BufGenericSycl(DevGenericSycl<TPltf> const& dev, TElem* const pMem, Deleter deleter, TExtent const& extent)
             : m_dev{dev}
             , m_extentElements{getExtentVecEnd<TDim>(extent)}
-            , m_buffer{buffer}
+            , m_spMem(pMem, std::move(deleter))
         {
             ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
 
@@ -53,92 +54,100 @@ namespace alpaka
                 "The idx type of TExtent and the TIdx template parameter have to be identical!");
         }
 
-        TDev m_dev;
+        DevGenericSycl<TPltf> m_dev;
         Vec<TDim, TIdx> m_extentElements;
-        sycl::buffer<TElem, TDim::value> m_buffer;
+        std::shared_ptr<TElem> m_spMem;
     };
 } // namespace alpaka
 
 namespace alpaka::trait
 {
     //! The BufGenericSycl device type trait specialization.
-    template<typename TElem, typename TDim, typename TIdx, typename TDev>
-    struct DevType<BufGenericSycl<TElem, TDim, TIdx, TDev>>
+    template<typename TElem, typename TDim, typename TIdx, typename TPltf>
+    struct DevType<BufGenericSycl<TElem, TDim, TIdx, TPltf>>
     {
-        using type = TDev;
+        using type = DevGenericSycl<TPltf>;
     };
 
     //! The BufGenericSycl device get trait specialization.
-    template<typename TElem, typename TDim, typename TIdx, typename TDev>
-    struct GetDev<BufGenericSycl<TElem, TDim, TIdx, TDev>>
+    template<typename TElem, typename TDim, typename TIdx, typename TPltf>
+    struct GetDev<BufGenericSycl<TElem, TDim, TIdx, TPltf>>
     {
-        static auto getDev(BufGenericSycl<TElem, TDim, TIdx, TDev> const& buf)
+        static auto getDev(BufGenericSycl<TElem, TDim, TIdx, TPltf> const& buf)
         {
             return buf.m_dev;
         }
     };
 
     //! The BufGenericSycl dimension getter trait specialization.
-    template<typename TElem, typename TDim, typename TIdx, typename TDev>
-    struct DimType<BufGenericSycl<TElem, TDim, TIdx, TDev>>
+    template<typename TElem, typename TDim, typename TIdx, typename TPltf>
+    struct DimType<BufGenericSycl<TElem, TDim, TIdx, TPltf>>
     {
         using type = TDim;
     };
 
     //! The BufGenericSycl memory element type get trait specialization.
-    template<typename TElem, typename TDim, typename TIdx, typename TDev>
-    struct ElemType<BufGenericSycl<TElem, TDim, TIdx, TDev>>
+    template<typename TElem, typename TDim, typename TIdx, typename TPltf>
+    struct ElemType<BufGenericSycl<TElem, TDim, TIdx, TPltf>>
     {
         using type = TElem;
     };
 
     //! The BufGenericSycl extent get trait specialization.
-    template<typename TIdxIntegralConst, typename TElem, typename TDim, typename TIdx, typename TDev>
-    struct GetExtent<TIdxIntegralConst, BufGenericSycl<TElem, TDim, TIdx, TDev>>
+    template<typename TIdxIntegralConst, typename TElem, typename TDim, typename TIdx, typename TPltf>
+    struct GetExtent<TIdxIntegralConst, BufGenericSycl<TElem, TDim, TIdx, TPltf>>
     {
         static_assert(TDim::value > TIdxIntegralConst::value, "Requested dimension out of bounds");
 
-        static auto getExtent(BufGenericSycl<TElem, TDim, TIdx, TDev> const& buf) -> TIdx
+        static auto getExtent(BufGenericSycl<TElem, TDim, TIdx, TPltf> const& buf) -> TIdx
         {
             return buf.m_extentElements[TIdxIntegralConst::value];
         }
     };
 
     //! The BufGenericSycl native pointer get trait specialization.
-    template<typename TElem, typename TDim, typename TIdx, typename TDev>
-    struct GetPtrNative<BufGenericSycl<TElem, TDim, TIdx, TDev>>
+    template<typename TElem, typename TDim, typename TIdx, typename TPltf>
+    struct GetPtrNative<BufGenericSycl<TElem, TDim, TIdx, TPltf>>
     {
-        static_assert(
-            !sizeof(TElem),
-            "Accessing device-side pointers on the host is not supported by the SYCL back-end");
-
-        static auto getPtrNative(BufGenericSycl<TElem, TDim, TIdx, TDev> const&) -> TElem const*
+        static auto getPtrNative(BufGenericSycl<TElem, TDim, TIdx, TPltf> const& buf) -> TElem const*
         {
-            return nullptr;
+            return buf.m_spMem.get();
         }
 
-        static auto getPtrNative(BufGenericSycl<TElem, TDim, TIdx, TDev>&) -> TElem*
+        static auto getPtrNative(BufGenericSycl<TElem, TDim, TIdx, TPltf>& buf) -> TElem*
         {
-            return nullptr;
+            return buf.m_spMem.get();
         }
     };
 
     //! The BufGenericSycl pointer on device get trait specialization.
-    template<typename TElem, typename TDim, typename TIdx, typename TDev>
-    struct GetPtrDev<BufGenericSycl<TElem, TDim, TIdx, TDev>, TDev>
+    template<typename TElem, typename TDim, typename TIdx, typename TPltf>
+    struct GetPtrDev<BufGenericSycl<TElem, TDim, TIdx, TPltf>, DevGenericSycl<TPltf>>
     {
-        static_assert(
-            !sizeof(TElem),
-            "Accessing device-side pointers on the host is not supported by the SYCL back-end");
-
-        static auto getPtrDev(BufGenericSycl<TElem, TDim, TIdx, TDev> const&, TDev const&) -> TElem const*
+        static auto getPtrDev(BufGenericSycl<TElem, TDim, TIdx, TPltf> const& buf, DevGenericSycl<TPltf> const& dev)
+            -> TElem const*
         {
-            return nullptr;
+            if(dev == getDev(buf))
+            {
+                return buf.m_spMem.get();
+            }
+            else
+            {
+                throw std::runtime_error("The buffer is not accessible from the given device!");
+            }
         }
 
-        static auto getPtrDev(BufGenericSycl<TElem, TDim, TIdx, TDev>&, TDev const&) -> TElem*
+        static auto getPtrDev(BufGenericSycl<TElem, TDim, TIdx, TPltf>& buf, DevGenericSycl<TPltf> const& dev)
+            -> TElem*
         {
-            return nullptr;
+            if(dev == getDev(buf))
+            {
+                return buf.m_spMem.get();
+            }
+            else
+            {
+                throw std::runtime_error("The buffer is not accessible from the given device!");
+            }
         }
     };
 
@@ -147,68 +156,91 @@ namespace alpaka::trait
     struct BufAlloc<TElem, TDim, TIdx, DevGenericSycl<TPltf>>
     {
         template<typename TExtent>
-        static auto allocBuf(DevGenericSycl<TPltf> const& dev, TExtent const& ext)
-            -> BufGenericSycl<TElem, TDim, TIdx, DevGenericSycl<TPltf>>
+        static auto allocBuf(DevGenericSycl<TPltf> const& dev, TExtent const& extent)
+            -> BufGenericSycl<TElem, TDim, TIdx, TPltf>
         {
             ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
 
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
             if constexpr(TDim::value == 0 || TDim::value == 1)
             {
-                auto const width = getWidth(ext);
+                auto const width = getWidth(extent);
 
-#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
                 auto const widthBytes = width * static_cast<TIdx>(sizeof(TElem));
                 std::cout << __func__ << " ew: " << width << " ewb: " << widthBytes << '\n';
-#    endif
-
-                auto const range = sycl::range<1>{width};
-                return {dev, sycl::buffer<TElem, 1>{range}, ext};
             }
             else if constexpr(TDim::value == 2)
             {
-                auto const width = getWidth(ext);
-                auto const height = getHeight(ext);
+                auto const width = getWidth(extent);
+                auto const height = getHeight(extent);
 
-#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
                 auto const widthBytes = width * static_cast<TIdx>(sizeof(TElem));
                 std::cout << __func__ << " ew: " << width << " eh: " << height << " ewb: " << widthBytes
                           << " pitch: " << widthBytes << '\n';
-#    endif
-
-                auto const range = sycl::range<2>{width, height};
-                return {dev, sycl::buffer<TElem, 2>{range}, ext};
             }
             else if constexpr(TDim::value == 3)
             {
-                auto const width = getWidth(ext);
-                auto const height = getHeight(ext);
-                auto const depth = getDepth(ext);
+                auto const width = getWidth(extent);
+                auto const height = getHeight(extent);
+                auto const depth = getDepth(extent);
 
-#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
                 auto const widthBytes = width * static_cast<TIdx>(sizeof(TElem));
                 std::cout << __func__ << " ew: " << width << " eh: " << height << " ed: " << depth
                           << " ewb: " << widthBytes << " pitch: " << widthBytes << '\n';
+            }
 #    endif
 
-                auto const range = sycl::range<3>{width, height, depth};
-                return {dev, sycl::buffer<TElem, 3>{range}, ext};
-            }
+            auto const& [nativeDev, nativeContext] = dev.getNativeHandle();
+            TElem* memPtr = sycl::malloc_device<TElem>(
+                static_cast<std::size_t>(getExtentProduct(extent)),
+                nativeDev,
+                nativeContext);
+            // captured structured bindings are a C++20 extension
+            // auto deleter = [nativeContext](TElem* ptr) { sycl::free(ptr, nativeContext); };
+            auto deleter = [&dev](TElem* ptr) { sycl::free(ptr, dev.getNativeHandle().second); };
+
+            return BufGenericSycl<TElem, TDim, TIdx, TPltf>(dev, memPtr, std::move(deleter), extent);
         }
     };
 
+    //! The BufGenericSycl stream-ordered memory allocation capability trait specialization.
+    template<typename TDim, typename TPltf>
+    struct HasAsyncBufSupport<TDim, DevGenericSycl<TPltf>> : std::false_type
+    {
+    };
+
     //! The BufGenericSycl offset get trait specialization.
-    template<typename TIdxIntegralConst, typename TElem, typename TDim, typename TIdx, typename TDev>
-    struct GetOffset<TIdxIntegralConst, BufGenericSycl<TElem, TDim, TIdx, TDev>>
+    template<typename TIdxIntegralConst, typename TElem, typename TDim, typename TIdx, typename TPltf>
+    struct GetOffset<TIdxIntegralConst, BufGenericSycl<TElem, TDim, TIdx, TPltf>>
     {
-        static auto getOffset(BufGenericSycl<TElem, TDim, TIdx, TDev> const&) -> TIdx
+        static auto getOffset(BufGenericSycl<TElem, TDim, TIdx, TPltf> const&) -> TIdx
         {
             return 0u;
         }
     };
 
+    //! The pinned/mapped memory allocation trait specialization for the SYCL devices.
+    template<typename TPltf, typename TElem, typename TDim, typename TIdx>
+    struct BufAllocMapped
+    {
+        template<typename TExtent>
+        static auto allocMappedBuf(DevCpu const& host, TExtent const& extent) -> BufCpu<TElem, TDim, TIdx>
+        {
+            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+            // Allocate SYCL page-locked memory on the host, mapped into the TPltf address space and
+            // accessible to all devices in the TPltf.
+            auto ctx = TPltf::syclContext();
+            TElem* memPtr = sycl::malloc_host<TElem>(static_cast<std::size_t>(getExtentProduct(extent)), ctx);
+            auto deleter = [ctx](TElem* ptr) { sycl::free(ptr, ctx); };
+
+            return BufCpu<TElem, TDim, TIdx>(host, memPtr, std::move(deleter), extent);
+        }
+    };
+
     //! The BufGenericSycl idx type trait specialization.
-    template<typename TElem, typename TDim, typename TIdx, typename TDev>
-    struct IdxType<BufGenericSycl<TElem, TDim, TIdx, TDev>>
+    template<typename TElem, typename TDim, typename TIdx, typename TPltf>
+    struct IdxType<BufGenericSycl<TElem, TDim, TIdx, TPltf>>
     {
         using type = TIdx;
     };
@@ -217,16 +249,13 @@ namespace alpaka::trait
     template<typename TElem, typename TDim, typename TIdx, typename TPltf>
     struct GetPtrDev<BufCpu<TElem, TDim, TIdx>, DevGenericSycl<TPltf>>
     {
-        static_assert(!sizeof(TElem), "Accessing host pointers on the device is not supported by the SYCL back-end");
-
-        static auto getPtrDev(BufCpu<TElem, TDim, TIdx> const&, DevGenericSycl<TPltf> const&) -> TElem const*
+        static auto getPtrDev(BufCpu<TElem, TDim, TIdx> const& buf, DevGenericSycl<TPltf> const&) -> TElem const*
         {
-            return nullptr;
+            return getPtrNative(buf);
         }
-
-        static auto getPtrDev(BufCpu<TElem, TDim, TIdx>&, DevGenericSycl<TPltf> const&) -> TElem*
+        static auto getPtrDev(BufCpu<TElem, TDim, TIdx>& buf, DevGenericSycl<TPltf> const&) -> TElem*
         {
-            return nullptr;
+            return getPtrNative(buf);
         }
     };
 } // namespace alpaka::trait
diff --git a/include/alpaka/mem/buf/BufGpuSyclIntel.hpp b/include/alpaka/mem/buf/BufGpuSyclIntel.hpp
index 0b27ab2ae1f9..cd12f975c8d4 100644
--- a/include/alpaka/mem/buf/BufGpuSyclIntel.hpp
+++ b/include/alpaka/mem/buf/BufGpuSyclIntel.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2022 Jan Stephan
+/* Copyright 2022 Jan Stephan, Luca Ferragina
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -12,7 +12,7 @@
 namespace alpaka
 {
     template<typename TElem, typename TDim, typename TIdx>
-    using BufGpuSyclIntel = BufGenericSycl<TElem, TDim, TIdx, DevGpuSyclIntel>;
+    using BufGpuSyclIntel = BufGenericSycl<TElem, TDim, TIdx, PltfGpuSyclIntel>;
 }
 
 #endif
diff --git a/include/alpaka/mem/buf/sycl/Accessor.hpp b/include/alpaka/mem/buf/sycl/Accessor.hpp
index 5b7aeba11dac..a83cd3d7d8e9 100644
--- a/include/alpaka/mem/buf/sycl/Accessor.hpp
+++ b/include/alpaka/mem/buf/sycl/Accessor.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2023 Jan Stephan, Andrea Bocci
+/* Copyright 2023 Jan Stephan, Andrea Bocci, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -13,7 +13,7 @@
 
 #ifdef ALPAKA_ACC_SYCL_ENABLED
 
-#    include <CL/sycl.hpp>
+#    include <sycl/sycl.hpp>
 
 namespace alpaka
 {
@@ -112,30 +112,6 @@ namespace alpaka
         SyclAccessor m_accessor;
         VecType extents;
     };
-
-    namespace experimental::trait
-    {
-        namespace internal
-        {
-            template<typename TElem, typename TDim, typename TIdx, typename TDev>
-            struct IsView<BufGenericSycl<TElem, TDim, TIdx, TDev>> : std::false_type
-            {
-            };
-        } // namespace internal
-
-        template<typename TElem, typename TDim, typename TIdx, typename TDev>
-        struct BuildAccessor<BufGenericSycl<TElem, TDim, TIdx, TDev>>
-        {
-            template<typename... TAccessModes>
-            static auto buildAccessor(BufGenericSycl<TElem, TDim, TIdx, TDev>& buffer)
-            {
-                using SyclAccessor = detail::SyclAccessor<TElem, TDim::value, TAccessModes...>;
-                return Accessor<SyclAccessor, TElem, TIdx, TDim::value, TAccessModes...>{
-                    SyclAccessor{buffer.m_buffer},
-                    buffer.m_extentElements};
-            }
-        };
-    } // namespace experimental::trait
 } // namespace alpaka
 
 #endif
diff --git a/include/alpaka/mem/buf/sycl/Common.hpp b/include/alpaka/mem/buf/sycl/Common.hpp
index 80e9763ebf75..87058bc6ca92 100644
--- a/include/alpaka/mem/buf/sycl/Common.hpp
+++ b/include/alpaka/mem/buf/sycl/Common.hpp
@@ -12,7 +12,7 @@
 
 #ifdef ALPAKA_ACC_SYCL_ENABLED
 
-#    include <CL/sycl.hpp>
+#    include <sycl/sycl.hpp>
 
 namespace alpaka::detail
 {
diff --git a/include/alpaka/mem/buf/sycl/Copy.hpp b/include/alpaka/mem/buf/sycl/Copy.hpp
index e6b1ae03836e..8e8cf533f747 100644
--- a/include/alpaka/mem/buf/sycl/Copy.hpp
+++ b/include/alpaka/mem/buf/sycl/Copy.hpp
@@ -1,8 +1,7 @@
-/* Copyright 2022 Jan Stephan
+/* Copyright 2023 Jan Stephan, Bernhard Manfred Gruber, Luca Ferragina, Aurora Perego, Andrea Bocci
  * SPDX-License-Identifier: MPL-2.0
  */
 
-
 #pragma once
 
 #include "alpaka/core/Debug.hpp"
@@ -14,51 +13,187 @@
 #include "alpaka/extent/Traits.hpp"
 #include "alpaka/mem/buf/sycl/Common.hpp"
 #include "alpaka/mem/view/Traits.hpp"
+#include "alpaka/meta/NdLoop.hpp"
+#include "alpaka/queue/QueueGenericSyclBlocking.hpp"
+#include "alpaka/queue/QueueGenericSyclNonBlocking.hpp"
 
 #include <memory>
 #include <type_traits>
 
 #ifdef ALPAKA_ACC_SYCL_ENABLED
 
-#    include <CL/sycl.hpp>
+#    include <sycl/sycl.hpp>
 
 namespace alpaka::detail
 {
-    template<typename TElem, std::size_t TDim>
-    using SrcAccessor = sycl::
-        accessor<TElem, TDim, sycl::access_mode::read, sycl::target::global_buffer, sycl::access::placeholder::true_t>;
-
-    template<typename TElem, std::size_t TDim>
-    using DstAccessor = sycl::accessor<
-        TElem,
-        TDim,
-        sycl::access_mode::write,
-        sycl::target::global_buffer,
-        sycl::access::placeholder::true_t>;
-
-    enum class Direction
+    //!  The SYCL device memory copy task base.
+    template<typename TDim, typename TViewDst, typename TViewSrc, typename TExtent>
+    struct TaskCopySyclBase
     {
-        h2d,
-        d2h,
-        d2d
+        static_assert(
+            std::is_same_v<std::remove_const_t<alpaka::Elem<TViewSrc>>, std::remove_const_t<alpaka::Elem<TViewDst>>>,
+            "The source and the destination view are required to have the same element type!");
+        using ExtentSize = Idx<TExtent>;
+        using DstSize = Idx<TViewDst>;
+        using SrcSize = Idx<TViewSrc>;
+        using Elem = alpaka::Elem<TViewSrc>;
+
+        template<typename TViewFwd>
+        TaskCopySyclBase(TViewFwd&& viewDst, TViewSrc const& viewSrc, TExtent const& extent)
+            : m_extent(getExtentVec(extent))
+#    if(!defined(NDEBUG)) || (ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL)
+            , m_extentWidthBytes(m_extent[TDim::value - 1u] * static_cast<ExtentSize>(sizeof(Elem)))
+            , m_dstExtent(getExtentVec(viewDst))
+            , m_srcExtent(getExtentVec(viewSrc))
+#    endif
+            , m_dstPitchBytes(getPitchBytesVec(viewDst))
+            , m_srcPitchBytes(getPitchBytesVec(viewSrc))
+            , m_dstMemNative(reinterpret_cast<std::uint8_t*>(getPtrNative(viewDst)))
+            , m_srcMemNative(reinterpret_cast<std::uint8_t const*>(getPtrNative(viewSrc)))
+        {
+            if constexpr(TDim::value > 0)
+            {
+                ALPAKA_ASSERT((castVec<DstSize>(m_extent) <= m_dstExtent).foldrAll(std::logical_or<bool>()));
+                ALPAKA_ASSERT((castVec<SrcSize>(m_extent) <= m_srcExtent).foldrAll(std::logical_or<bool>()));
+            }
+        }
+
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+        auto printDebug() const -> void
+        {
+            std::cout << __func__ << " e: " << m_extent << " ewb: " << this->m_extentWidthBytes
+                      << " de: " << m_dstExtent << " dptr: " << reinterpret_cast<void*>(m_dstMemNative)
+                      << " se: " << m_srcExtent << " sptr: " << reinterpret_cast<void const*>(m_srcMemNative)
+                      << std::endl;
+        }
+#    endif
+
+        Vec<TDim, ExtentSize> const m_extent;
+#    if(!defined(NDEBUG)) || (ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL)
+        ExtentSize const m_extentWidthBytes;
+        Vec<TDim, DstSize> const m_dstExtent;
+        Vec<TDim, SrcSize> const m_srcExtent;
+#    endif
+
+        Vec<TDim, DstSize> const m_dstPitchBytes;
+        Vec<TDim, SrcSize> const m_srcPitchBytes;
+        std::uint8_t* const m_dstMemNative;
+        std::uint8_t const* const m_srcMemNative;
+        static constexpr auto is_sycl_task = true;
     };
 
-    template<typename TSrc, typename TDst, Direction TDirection>
-    struct TaskCopySycl
+    //! The SYCL device ND memory copy task.
+    template<typename TDim, typename TViewDst, typename TViewSrc, typename TExtent>
+    struct TaskCopySycl : public TaskCopySyclBase<TDim, TViewDst, TViewSrc, TExtent>
     {
-        auto operator()(sycl::handler& cgh) const -> void
+        using DimMin1 = DimInt<TDim::value - 1u>;
+        using typename TaskCopySyclBase<TDim, TViewDst, TViewSrc, TExtent>::ExtentSize;
+        using typename TaskCopySyclBase<TDim, TViewDst, TViewSrc, TExtent>::DstSize;
+        using typename TaskCopySyclBase<TDim, TViewDst, TViewSrc, TExtent>::SrcSize;
+
+        using TaskCopySyclBase<TDim, TViewDst, TViewSrc, TExtent>::TaskCopySyclBase;
+
+        auto operator()(sycl::queue& queue, std::vector<sycl::event> const& requirements) const -> sycl::event
         {
-            if constexpr(TDirection == Direction::d2h || TDirection == Direction::d2d)
-                cgh.require(m_src);
+            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+            this->printDebug();
+#    endif
+            // [z, y, x] -> [z, y] because all elements with the innermost x dimension are handled within one
+            // iteration.
+            Vec<DimMin1, ExtentSize> const extentWithoutInnermost(subVecBegin<DimMin1>(this->m_extent));
+            // [z, y, x] -> [y, x] because the z pitch (the full size of the buffer) is not required.
+            Vec<DimMin1, DstSize> const dstPitchBytesWithoutOutmost(subVecEnd<DimMin1>(this->m_dstPitchBytes));
+            Vec<DimMin1, SrcSize> const srcPitchBytesWithoutOutmost(subVecEnd<DimMin1>(this->m_srcPitchBytes));
+
+            // Record an event for each memcpy call
+            std::vector<sycl::event> events;
+            events.reserve(static_cast<std::size_t>(extentWithoutInnermost.prod()));
+
+            if(static_cast<std::size_t>(this->m_extent.prod()) != 0u)
+            {
+                meta::ndLoopIncIdx(
+                    extentWithoutInnermost,
+                    [&](Vec<DimMin1, ExtentSize> const& idx)
+                    {
+                        events.push_back(queue.memcpy(
+                            reinterpret_cast<void*>(
+                                this->m_dstMemNative
+                                + (castVec<DstSize>(idx) * dstPitchBytesWithoutOutmost)
+                                      .foldrAll(std::plus<DstSize>())),
+                            reinterpret_cast<void const*>(
+                                this->m_srcMemNative
+                                + (castVec<SrcSize>(idx) * srcPitchBytesWithoutOutmost)
+                                      .foldrAll(std::plus<SrcSize>())),
+                            static_cast<std::size_t>(this->m_extentWidthBytes),
+                            requirements));
+                    });
+            }
+
+            // Return an event that depends on all the events assciated to the memcpy calls
+            return queue.ext_oneapi_submit_barrier(events);
+        }
+    };
 
-            if constexpr(TDirection == Direction::h2d || TDirection == Direction::d2d)
-                cgh.require(m_dst);
+    //! The SYCL device 1D memory copy task.
+    template<typename TViewDst, typename TViewSrc, typename TExtent>
+    struct TaskCopySycl<DimInt<1u>, TViewDst, TViewSrc, TExtent>
+        : TaskCopySyclBase<DimInt<1u>, TViewDst, TViewSrc, TExtent>
+    {
+        using TaskCopySyclBase<DimInt<1u>, TViewDst, TViewSrc, TExtent>::TaskCopySyclBase;
+        using Elem = alpaka::Elem<TViewSrc>;
 
-            cgh.copy(m_src, m_dst);
+        auto operator()(sycl::queue& queue, std::vector<sycl::event> const& requirements) const -> sycl::event
+        {
+            ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+            this->printDebug();
+#    endif
+            if(static_cast<std::size_t>(this->m_extent.prod()) != 0u)
+            {
+                return queue.memcpy(
+                    reinterpret_cast<void*>(this->m_dstMemNative),
+                    reinterpret_cast<void const*>(this->m_srcMemNative),
+                    sizeof(Elem) * static_cast<std::size_t>(this->m_extent.prod()),
+                    requirements);
+            }
+            else
+            {
+                return queue.ext_oneapi_submit_barrier();
+            }
         }
+    };
+
+    //! The scalar SYCL memory copy trait.
+    template<typename TViewDst, typename TViewSrc, typename TExtent>
+    struct TaskCopySycl<DimInt<0u>, TViewDst, TViewSrc, TExtent>
+    {
+        static_assert(
+            std::is_same_v<std::remove_const_t<alpaka::Elem<TViewSrc>>, std::remove_const_t<alpaka::Elem<TViewDst>>>,
+            "The source and the destination view are required to have the same element type!");
 
-        TSrc m_src;
-        TDst m_dst;
+        using Elem = alpaka::Elem<TViewSrc>;
+
+        template<typename TViewDstFwd>
+        TaskCopySycl(TViewDstFwd&& viewDst, TViewSrc const& viewSrc, [[maybe_unused]] TExtent const& extent)
+            : m_dstMemNative(reinterpret_cast<void*>(getPtrNative(viewDst)))
+            , m_srcMemNative(reinterpret_cast<void const*>(getPtrNative(viewSrc)))
+        {
+            // all zero-sized extents are equivalent
+            ALPAKA_ASSERT(getExtentVec(extent).prod() == 1u);
+            ALPAKA_ASSERT(getExtentVec(viewDst).prod() == 1u);
+            ALPAKA_ASSERT(getExtentVec(viewSrc).prod() == 1u);
+        }
+
+        auto operator()(sycl::queue& queue, std::vector<sycl::event> const& requirements) const -> sycl::event
+        {
+            return queue.memcpy(m_dstMemNative, m_srcMemNative, sizeof(Elem), requirements);
+        }
+
+        void* m_dstMemNative;
+        void const* m_srcMemNative;
         static constexpr auto is_sycl_task = true;
     };
 } // namespace alpaka::detail
@@ -67,76 +202,44 @@ namespace alpaka::detail
 namespace alpaka::trait
 {
     //! The SYCL host-to-device memory copy trait specialization.
-    template<typename TDim, typename TPltf>
+    template<typename TPltf, typename TDim>
     struct CreateTaskMemcpy<TDim, DevGenericSycl<TPltf>, DevCpu>
     {
         template<typename TExtent, typename TViewSrc, typename TViewDstFwd>
-        static auto createTaskMemcpy(TViewDstFwd&& viewDst, TViewSrc const& viewSrc, TExtent const& ext)
+        static auto createTaskMemcpy(TViewDstFwd&& viewDst, TViewSrc const& viewSrc, TExtent const& extent)
+            -> alpaka::detail::TaskCopySycl<TDim, std::remove_reference_t<TViewDstFwd>, TViewSrc, TExtent>
         {
             ALPAKA_DEBUG_FULL_LOG_SCOPE;
 
-            constexpr auto copy_dim = static_cast<int>(Dim<TExtent>::value);
-            using ElemType = Elem<std::remove_const_t<TViewSrc>>;
-            using SrcType = ElemType const*;
-            using DstType = alpaka::detail::DstAccessor<ElemType, copy_dim>;
-
-            auto const range = detail::make_sycl_range(ext);
-            auto const offset = detail::make_sycl_offset(viewDst);
-
-            return detail::TaskCopySycl<SrcType, DstType, detail::Direction::h2d>{
-                getPtrNative(viewSrc),
-                DstType{viewDst.m_buffer, range, offset}};
+            return {std::forward<TViewDstFwd>(viewDst), viewSrc, extent};
         }
     };
 
     //! The SYCL device-to-host memory copy trait specialization.
-    template<typename TDim, typename TPltf>
+    template<typename TPltf, typename TDim>
     struct CreateTaskMemcpy<TDim, DevCpu, DevGenericSycl<TPltf>>
     {
         template<typename TExtent, typename TViewSrc, typename TViewDstFwd>
-        static auto createTaskMemcpy(TViewDstFwd&& viewDst, TViewSrc const& viewSrc, TExtent const& ext)
+        static auto createTaskMemcpy(TViewDstFwd&& viewDst, TViewSrc const& viewSrc, TExtent const& extent)
+            -> alpaka::detail::TaskCopySycl<TDim, std::remove_reference_t<TViewDstFwd>, TViewSrc, TExtent>
         {
             ALPAKA_DEBUG_FULL_LOG_SCOPE;
 
-            constexpr auto copy_dim = static_cast<int>(Dim<TExtent>::value);
-            using ElemType = Elem<std::remove_const_t<TViewSrc>>;
-            using SrcType = alpaka::detail::SrcAccessor<ElemType, copy_dim>;
-            using DstType = ElemType*;
-
-            auto const range = detail::make_sycl_range(ext);
-            auto const offset = detail::make_sycl_offset(viewSrc);
-
-            auto view_src = const_cast<TViewSrc&>(viewSrc);
-
-            return detail::TaskCopySycl<SrcType, DstType, detail::Direction::d2h>{
-                SrcType{view_src.m_buffer, range, offset},
-                getPtrNative(viewDst)};
+            return {std::forward<TViewDstFwd>(viewDst), viewSrc, extent};
         }
     };
 
     //! The SYCL device-to-device memory copy trait specialization.
-    template<typename TDim, typename TPltfDst, typename TPltfSrc>
+    template<typename TPltfDst, typename TPltfSrc, typename TDim>
     struct CreateTaskMemcpy<TDim, DevGenericSycl<TPltfDst>, DevGenericSycl<TPltfSrc>>
     {
         template<typename TExtent, typename TViewSrc, typename TViewDstFwd>
-        static auto createTaskMemcpy(TViewDstFwd&& viewDst, TViewSrc const& viewSrc, TExtent const& ext)
+        static auto createTaskMemcpy(TViewDstFwd&& viewDst, TViewSrc const& viewSrc, TExtent const& extent)
+            -> alpaka::detail::TaskCopySycl<TDim, std::remove_reference_t<TViewDstFwd>, TViewSrc, TExtent>
         {
             ALPAKA_DEBUG_FULL_LOG_SCOPE;
 
-            constexpr auto copy_dim = static_cast<int>(Dim<TExtent>::value);
-            using ElemType = Elem<std::remove_const_t<TViewSrc>>;
-            using SrcType = alpaka::detail::SrcAccessor<ElemType, copy_dim>;
-            using DstType = alpaka::detail::DstAccessor<ElemType, copy_dim>;
-
-            auto const range = detail::make_sycl_range(ext);
-            auto const offset_src = detail::make_sycl_offset(viewSrc);
-            auto const offset_dst = detail::make_sycl_offset(viewDst);
-
-            auto view_src = const_cast<TViewSrc&>(viewSrc);
-
-            return detail::TaskCopySycl<SrcType, DstType, detail::Direction::d2d>{
-                SrcType{view_src.m_buffer, range, offset_src},
-                DstType{viewDst.m_buffer, range, offset_dst}};
+            return {std::forward<TViewDstFwd>(viewDst), viewSrc, extent};
         }
     };
 } // namespace alpaka::trait
diff --git a/include/alpaka/mem/buf/sycl/Set.hpp b/include/alpaka/mem/buf/sycl/Set.hpp
index f584e0461e87..01899cc18e1b 100644
--- a/include/alpaka/mem/buf/sycl/Set.hpp
+++ b/include/alpaka/mem/buf/sycl/Set.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2022 Jan Stephan
+/* Copyright 2023 Jan Stephan, Luca Ferragina, Aurora Perego, Andrea Bocci
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -12,6 +12,9 @@
 #include "alpaka/extent/Traits.hpp"
 #include "alpaka/mem/buf/sycl/Common.hpp"
 #include "alpaka/mem/view/Traits.hpp"
+#include "alpaka/meta/NdLoop.hpp"
+#include "alpaka/queue/QueueGenericSyclBlocking.hpp"
+#include "alpaka/queue/QueueGenericSyclNonBlocking.hpp"
 #include "alpaka/queue/Traits.hpp"
 
 #include <cstddef>
@@ -25,29 +28,171 @@ namespace alpaka
 
     namespace detail
     {
-        template<std::size_t TDim>
-        using Accessor = sycl::accessor<
-            std::byte,
-            TDim,
-            sycl::access_mode::write,
-            sycl::target::global_buffer,
-            sycl::access::placeholder::true_t>;
-
-        //! The SYCL memory set trait.
-        template<typename TAccessor>
-        struct TaskSetSycl
+        //!  The SYCL ND memory set task base.
+        template<typename TDim, typename TView, typename TExtent>
+        struct TaskSetSyclBase
         {
-            auto operator()(sycl::handler& cgh) const -> void
+            using ExtentSize = Idx<TExtent>;
+            using DstSize = Idx<TView>;
+            using Elem = alpaka::Elem<TView>;
+
+            template<typename TViewFwd>
+            TaskSetSyclBase(TViewFwd&& view, std::uint8_t const& byte, TExtent const& extent)
+                : m_byte(byte)
+                , m_extent(getExtentVec(extent))
+                , m_extentWidthBytes(m_extent[TDim::value - 1u] * static_cast<ExtentSize>(sizeof(Elem)))
+#    if(!defined(NDEBUG)) || (ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL)
+                , m_dstExtent(getExtentVec(view))
+#    endif
+
+                , m_dstPitchBytes(getPitchBytesVec(view))
+                , m_dstMemNative(reinterpret_cast<std::uint8_t*>(getPtrNative(view)))
+
+            {
+                ALPAKA_ASSERT((castVec<DstSize>(m_extent) <= m_dstExtent).foldrAll(std::logical_or<bool>()));
+                ALPAKA_ASSERT(m_extentWidthBytes <= m_dstPitchBytes[TDim::value - 1u]);
+            }
+
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+            auto printDebug() const -> void
+            {
+                std::cout << __func__ << " e: " << this->m_extent << " ewb: " << this->m_extentWidthBytes
+                          << " de: " << this->m_dstExtent << " dptr: " << reinterpret_cast<void*>(this->m_dstMemNative)
+                          << " dpitchb: " << this->m_dstPitchBytes << std::endl;
+            }
+#    endif
+
+            std::uint8_t const m_byte;
+            Vec<TDim, ExtentSize> const m_extent;
+            ExtentSize const m_extentWidthBytes;
+#    if(!defined(NDEBUG)) || (ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL)
+            Vec<TDim, DstSize> const m_dstExtent;
+#    endif
+            Vec<TDim, DstSize> const m_dstPitchBytes;
+            std::uint8_t* const m_dstMemNative;
+            static constexpr auto is_sycl_task = true;
+        };
+
+        //! The SYCL device ND memory set task.
+        template<typename TDim, typename TView, typename TExtent>
+        struct TaskSetSycl : public TaskSetSyclBase<TDim, TView, TExtent>
+        {
+            using DimMin1 = DimInt<TDim::value - 1u>;
+            using typename TaskSetSyclBase<TDim, TView, TExtent>::ExtentSize;
+            using typename TaskSetSyclBase<TDim, TView, TExtent>::DstSize;
+
+            using TaskSetSyclBase<TDim, TView, TExtent>::TaskSetSyclBase;
+
+            auto operator()(sycl::queue& queue, std::vector<sycl::event> const& requirements) const -> sycl::event
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+                this->printDebug();
+#    endif
+                // [z, y, x] -> [z, y] because all elements with the innermost x dimension are handled within one
+                // iteration.
+                Vec<DimMin1, ExtentSize> const extentWithoutInnermost(subVecBegin<DimMin1>(this->m_extent));
+                // [z, y, x] -> [y, x] because the z pitch (the full idx of the buffer) is not required.
+                Vec<DimMin1, DstSize> const dstPitchBytesWithoutOutmost(subVecEnd<DimMin1>(this->m_dstPitchBytes));
+
+                // Record an event for each memcpy call
+                std::vector<sycl::event> events;
+                events.reserve(static_cast<std::size_t>(extentWithoutInnermost.prod()));
+
+                if(static_cast<std::size_t>(this->m_extent.prod()) != 0u)
+                {
+                    meta::ndLoopIncIdx(
+                        extentWithoutInnermost,
+                        [&](Vec<DimMin1, ExtentSize> const& idx)
+                        {
+                            events.push_back(queue.memset(
+                                reinterpret_cast<void*>(
+                                    this->m_dstMemNative
+                                    + (castVec<DstSize>(idx) * dstPitchBytesWithoutOutmost)
+                                          .foldrAll(std::plus<DstSize>())),
+                                this->m_byte,
+                                static_cast<std::size_t>(this->m_extentWidthBytes),
+                                requirements));
+                        });
+                }
+
+                // Return an event that depends on all the events assciated to the memcpy calls
+                return queue.ext_oneapi_submit_barrier(events);
+            }
+        };
+
+        //! The 1D SYCL memory set task.
+        template<typename TView, typename TExtent>
+        struct TaskSetSycl<DimInt<1u>, TView, TExtent> : public TaskSetSyclBase<DimInt<1u>, TView, TExtent>
+        {
+            using TaskSetSyclBase<DimInt<1u>, TView, TExtent>::TaskSetSyclBase;
+
+            auto operator()(sycl::queue& queue, std::vector<sycl::event> const& requirements) const -> sycl::event
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+                this->printDebug();
+#    endif
+                if(static_cast<std::size_t>(this->m_extent.prod()) != 0u)
+                {
+                    return queue.memset(
+                        reinterpret_cast<void*>(this->m_dstMemNative),
+                        this->m_byte,
+                        static_cast<std::size_t>(this->m_extentWidthBytes),
+                        requirements);
+                }
+                else
+                {
+                    return queue.ext_oneapi_submit_barrier();
+                }
+            }
+        };
+
+        //! The SYCL device scalar memory set task.
+        template<typename TView, typename TExtent>
+        struct TaskSetSycl<DimInt<0u>, TView, TExtent>
+        {
+            using ExtentSize = Idx<TExtent>;
+            using Scalar = Vec<DimInt<0u>, ExtentSize>;
+            using DstSize = Idx<TView>;
+            using Elem = alpaka::Elem<TView>;
+
+            template<typename TViewFwd>
+            TaskSetSycl(TViewFwd&& view, std::uint8_t const& byte, [[maybe_unused]] TExtent const& extent)
+                : m_byte(byte)
+                , m_dstMemNative(reinterpret_cast<std::uint8_t*>(getPtrNative(view)))
+            {
+                // all zero-sized extents are equivalent
+                ALPAKA_ASSERT(getExtentVec(extent).prod() == 1u);
+                ALPAKA_ASSERT(getExtentVec(view).prod() == 1u);
+            }
+
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+            auto printDebug() const -> void
             {
-                cgh.require(m_accessor);
-                cgh.fill(m_accessor, m_value);
+                std::cout << __func__ << " e: " << Scalar() << " ewb: " << sizeof(Elem) << " de: " << Scalar()
+                          << " dptr: " << reinterpret_cast<void*>(m_dstMemNative) << " dpitchb: " << Scalar()
+                          << std::endl;
             }
+#    endif
 
-            TAccessor m_accessor;
-            std::byte m_value;
-            // Distinguish from non-alpaka types (= host tasks)
+            auto operator()(sycl::queue& queue, std::vector<sycl::event> const& requirements) const -> sycl::event
+            {
+                ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+#    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
+                printDebug();
+#    endif
+                return queue.memset(reinterpret_cast<void*>(m_dstMemNative), m_byte, sizeof(Elem), requirements);
+            }
+
+            std::uint8_t const m_byte;
+            std::uint8_t* const m_dstMemNative;
             static constexpr auto is_sycl_task = true;
         };
+
     } // namespace detail
 
 
@@ -57,25 +202,15 @@ namespace alpaka
         template<typename TDim, typename TPltf>
         struct CreateTaskMemset<TDim, DevGenericSycl<TPltf>>
         {
-            template<typename TExtent, typename TViewFwd>
-            static auto createTaskMemset(TViewFwd&& view, std::uint8_t const& byte, TExtent const& ext)
+            template<typename TExtent, typename TView>
+            static auto createTaskMemset(TView& view, std::uint8_t const& byte, TExtent const& extent)
+                -> detail::TaskSetSycl<TDim, TView, TExtent>
             {
-                ALPAKA_DEBUG_FULL_LOG_SCOPE;
-
-                constexpr auto set_dim = static_cast<int>(Dim<TExtent>::value);
-                using TView = std::remove_reference_t<TViewFwd>;
-                using ElemType = Elem<TView>;
-                using DstType = alpaka::detail::Accessor<set_dim>;
-
-                // Reinterpret as byte buffer
-                auto buf = view.m_buffer.template reinterpret<std::byte>();
-                auto const byte_val = static_cast<std::byte>(byte);
-
-                auto const range = detail::make_sycl_range(ext, sizeof(ElemType));
-                return detail::TaskSetSycl<DstType>{DstType{buf, range}, byte_val};
+                return detail::TaskSetSycl<TDim, TView, TExtent>(view, byte, extent);
             }
         };
+
     } // namespace trait
-} // namespace alpaka
 
+} // namespace alpaka
 #endif
diff --git a/include/alpaka/mem/fence/MemFenceGenericSycl.hpp b/include/alpaka/mem/fence/MemFenceGenericSycl.hpp
index c3d96113e13f..0b7559f3d61b 100644
--- a/include/alpaka/mem/fence/MemFenceGenericSycl.hpp
+++ b/include/alpaka/mem/fence/MemFenceGenericSycl.hpp
@@ -8,7 +8,7 @@
 
 #ifdef ALPAKA_ACC_SYCL_ENABLED
 
-#    include <CL/sycl.hpp>
+#    include <sycl/sycl.hpp>
 
 namespace alpaka
 {
@@ -32,6 +32,13 @@ namespace alpaka
             static constexpr auto scope = sycl::memory_scope::device;
             static constexpr auto space = sycl::access::address_space::global_space;
         };
+
+        template<>
+        struct SyclFenceProps<alpaka::memory_scope::Grid>
+        {
+            static constexpr auto scope = sycl::memory_scope::device;
+            static constexpr auto space = sycl::access::address_space::global_space;
+        };
     } // namespace detail
 
     //! The SYCL memory fence.
diff --git a/include/alpaka/mem/view/ViewPlainPtr.hpp b/include/alpaka/mem/view/ViewPlainPtr.hpp
index a092a6b26f60..56fadf03feb9 100644
--- a/include/alpaka/mem/view/ViewPlainPtr.hpp
+++ b/include/alpaka/mem/view/ViewPlainPtr.hpp
@@ -1,13 +1,16 @@
-/* Copyright 2023 Benjamin Worpitz, Matthias Werner, René Widera, Bernhard Manfred Gruber, Jan Stephan
+/* Copyright 2023 Benjamin Worpitz, Matthias Werner, René Widera, Sergei Bastrakov, Bernhard Manfred Gruber,
+ *                Jan Stephan, Andrea Bocci, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
 #pragma once
 
 #include "alpaka/dev/DevCpu.hpp"
+#include "alpaka/dev/DevGenericSycl.hpp"
 #include "alpaka/dev/DevUniformCudaHipRt.hpp"
 #include "alpaka/mem/view/Traits.hpp"
 #include "alpaka/mem/view/ViewAccessOps.hpp"
+#include "alpaka/meta/DependentFalseType.hpp"
 #include "alpaka/vec/Vec.hpp"
 
 #include <type_traits>
@@ -178,6 +181,17 @@ namespace alpaka
         };
 #endif
 
+#if defined(ALPAKA_ACC_SYCL_ENABLED)
+        //! The SYCL device CreateStaticDevMemView trait specialization.
+        template<typename TPltf>
+        struct CreateStaticDevMemView<DevGenericSycl<TPltf>>
+        {
+            static_assert(
+                meta::DependentFalseType<TPltf>::value,
+                "The SYCL backend does not support global device variables.");
+        };
+#endif
+
         //! The CPU device CreateViewPlainPtr trait specialization.
         template<>
         struct CreateViewPlainPtr<DevCpu>
@@ -215,6 +229,26 @@ namespace alpaka
         };
 #endif
 
+#if defined(ALPAKA_ACC_SYCL_ENABLED)
+        //! The SYCL device CreateViewPlainPtr trait specialization.
+        template<typename TPltf>
+        struct CreateViewPlainPtr<DevGenericSycl<TPltf>>
+        {
+            template<typename TElem, typename TExtent, typename TPitch>
+            static auto createViewPlainPtr(
+                DevGenericSycl<TPltf> const& dev,
+                TElem* pMem,
+                TExtent const& extent,
+                TPitch const& pitch)
+            {
+                return alpaka::ViewPlainPtr<DevGenericSycl<TPltf>, TElem, alpaka::Dim<TExtent>, alpaka::Idx<TExtent>>(
+                    pMem,
+                    dev,
+                    extent,
+                    pitch);
+            }
+        };
+#endif
         //! The ViewPlainPtr offset get trait specialization.
         template<typename TIdxIntegralConst, typename TDev, typename TElem, typename TDim, typename TIdx>
         struct GetOffset<TIdxIntegralConst, ViewPlainPtr<TDev, TElem, TDim, TIdx>>
diff --git a/include/alpaka/pltf/PltfCpuSycl.hpp b/include/alpaka/pltf/PltfCpuSycl.hpp
new file mode 100644
index 000000000000..c30793dd98a9
--- /dev/null
+++ b/include/alpaka/pltf/PltfCpuSycl.hpp
@@ -0,0 +1,44 @@
+/* Copyright 2023 Jan Stephan, Luca Ferragina, Andrea Bocci
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/dev/DevGenericSycl.hpp"
+#include "alpaka/dev/Traits.hpp"
+#include "alpaka/pltf/PltfGenericSycl.hpp"
+
+#include <string>
+
+#if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_BACKEND_ONEAPI) && defined(ALPAKA_SYCL_ONEAPI_CPU)
+
+#    include <sycl/sycl.hpp>
+
+namespace alpaka
+{
+    namespace detail
+    {
+        struct SyclCpuSelector
+        {
+            auto operator()(sycl::device const& dev) const -> int
+            {
+                return dev.is_cpu() ? 1 : -1;
+            }
+        };
+    } // namespace detail
+
+    //! The SYCL device manager.
+    using PltfCpuSycl = PltfGenericSycl<detail::SyclCpuSelector>;
+} // namespace alpaka
+
+namespace alpaka::trait
+{
+    //! The SYCL device manager device type trait specialization.
+    template<>
+    struct DevType<PltfCpuSycl>
+    {
+        using type = DevGenericSycl<PltfCpuSycl>; // = DevCpuSycl
+    };
+} // namespace alpaka::trait
+
+#endif
diff --git a/include/alpaka/pltf/PltfCpuSyclIntel.hpp b/include/alpaka/pltf/PltfCpuSyclIntel.hpp
deleted file mode 100644
index b8f3815ddaed..000000000000
--- a/include/alpaka/pltf/PltfCpuSyclIntel.hpp
+++ /dev/null
@@ -1,56 +0,0 @@
-/* Copyright 2023 Jan Stephan, Andrea Bocci
- * SPDX-License-Identifier: MPL-2.0
- */
-
-#pragma once
-
-#include "alpaka/dev/DevGenericSycl.hpp"
-#include "alpaka/dev/Traits.hpp"
-#include "alpaka/pltf/PltfGenericSycl.hpp"
-
-#include <string>
-
-#if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_BACKEND_ONEAPI) && defined(ALPAKA_SYCL_ONEAPI_CPU)
-
-#    include <CL/sycl.hpp>
-
-namespace alpaka
-{
-    namespace detail
-    {
-        // Prevent clang from annoying us with warnings about emitting too many vtables. These are discarded by the
-        // linker anyway.
-#    if BOOST_COMP_CLANG
-#        pragma clang diagnostic push
-#        pragma clang diagnostic ignored "-Wweak-vtables"
-#    endif
-        struct IntelCpuSelector final
-        {
-            auto operator()(sycl::device const& dev) const -> int
-            {
-                auto const& vendor = dev.get_info<sycl::info::device::vendor>();
-                auto const is_intel_cpu = (vendor.find("Intel(R) Corporation") != std::string::npos) && dev.is_cpu();
-
-                return is_intel_cpu ? 1 : -1;
-            }
-        };
-#    if BOOST_COMP_CLANG
-#        pragma clang diagnostic pop
-#    endif
-    } // namespace detail
-
-    //! The SYCL device manager.
-    using PltfCpuSyclIntel = PltfGenericSycl<detail::IntelCpuSelector>;
-} // namespace alpaka
-
-namespace alpaka::trait
-{
-    //! The SYCL device manager device type trait specialization.
-    template<>
-    struct DevType<PltfCpuSyclIntel>
-    {
-        using type = DevGenericSycl<PltfCpuSyclIntel>; // = DevCpuSyclIntel
-    };
-} // namespace alpaka::trait
-
-#endif
diff --git a/include/alpaka/pltf/PltfFpgaSyclIntel.hpp b/include/alpaka/pltf/PltfFpgaSyclIntel.hpp
index ef0b0cfa35c6..9b5e1b100373 100644
--- a/include/alpaka/pltf/PltfFpgaSyclIntel.hpp
+++ b/include/alpaka/pltf/PltfFpgaSyclIntel.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2023 Jan Stephan, Andrea Bocci
+/* Copyright 2023 Jan Stephan, Andrea Bocci, Luca Ferragina
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -10,7 +10,7 @@
 
 #if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_BACKEND_ONEAPI) && defined(ALPAKA_SYCL_ONEAPI_FPGA)
 
-#    include <CL/sycl.hpp>
+#    include <sycl/sycl.hpp>
 
 #    include <string>
 
diff --git a/include/alpaka/pltf/PltfGenericSycl.hpp b/include/alpaka/pltf/PltfGenericSycl.hpp
index e1e0d38ea174..8c67f5f50c31 100644
--- a/include/alpaka/pltf/PltfGenericSycl.hpp
+++ b/include/alpaka/pltf/PltfGenericSycl.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2023 Jan Stephan
+/* Copyright 2023 Jan Stephan, Luca Ferragina, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -20,7 +20,7 @@
 
 #ifdef ALPAKA_ACC_SYCL_ENABLED
 
-#    include <CL/sycl.hpp>
+#    include <sycl/sycl.hpp>
 
 namespace alpaka
 {
@@ -127,7 +127,7 @@ namespace alpaka::trait
 #    if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL
             printDeviceProperties(sycl_dev);
 #    elif ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
-            std::cout << __func__ << sycl_dev.get_info<info::device::name>() << '\n';
+            std::cout << __func__ << sycl_dev.template get_info<sycl::info::device::name>() << '\n';
 #    endif
             using SyclPltf = alpaka::PltfGenericSycl<TSelector>;
             return typename DevType<SyclPltf>::type{sycl_dev, platform.syclContext()};
@@ -189,82 +189,62 @@ namespace alpaka::trait
 
             std::cout << "SYCL version: " << device.get_info<sycl::info::device::version>() << '\n';
 
+#        if !defined(BOOST_COMP_ICPX)
+            // Not defined by Level Zero back-end
             std::cout << "Backend version: " << device.get_info<sycl::info::device::backend_version>() << '\n';
+#        endif
 
             std::cout << "Aspects: " << '\n';
-            auto const aspects = device.get_info<sycl::info::device::aspects>();
-            for(auto const& asp : aspects)
-            {
-                switch(asp)
-                {
-                // Ignore the hardware types - we already have queried this info above
-                case sycl::aspect::cpu:
-                case sycl::aspect::gpu:
-                case sycl::aspect::accelerator:
-                case sycl::aspect::custom:
-                    break;
 
-                case sycl::aspect::emulated:
-                    std::cout << "\t* emulated\n";
-                    break;
+#        if defined(BOOST_COMP_ICPX)
+#            if BOOST_COMP_ICPX >= BOOST_VERSION_NUMBER(53, 2, 0)
+            // These aspects are missing from oneAPI versions < 2023.2.0
+            if(device.has(sycl::aspect::emulated))
+                std::cout << "\t* emulated\n";
 
-                case sycl::aspect::host_debugabble:
-                    std::cout << "\t* debugabble using standard debuggers\n";
-                    break;
+            if(device.has(sycl::aspect::host_debuggable))
+                std::cout << "\t* debuggable using standard debuggers\n";
+#            endif
+#        endif
 
-                case sycl::aspect::fp16:
-                    std::cout << "\t* supports sycl::half precision\n";
-                    break;
+            if(device.has(sycl::aspect::fp16))
+                std::cout << "\t* supports sycl::half precision\n";
 
-                case sycl::aspect::fp64:
-                    std::cout << "\t* supports double precision\n";
-                    break;
+            if(device.has(sycl::aspect::fp64))
+                std::cout << "\t* supports double precision\n";
 
-                case sycl::aspect::atomic64:
-                    std::cout << "\t* supports 64-bit atomics\n";
-                    break;
+            if(device.has(sycl::aspect::atomic64))
+                std::cout << "\t* supports 64-bit atomics\n";
 
-                case sycl::aspect::image:
-                    std::cout << "\t* supports images\n";
-                    break;
+            if(device.has(sycl::aspect::image))
+                std::cout << "\t* supports images\n";
 
-                case sycl::aspect::online_compiler:
-                    std::cout << "\t* supports online compilation of device code\n";
-                    break;
+            if(device.has(sycl::aspect::online_compiler))
+                std::cout << "\t* supports online compilation of device code\n";
 
-                case sycl::aspect::online_linker:
-                    std::cout << "\t* supports online linking of device code\n";
-                    break;
+            if(device.has(sycl::aspect::online_linker))
+                std::cout << "\t* supports online linking of device code\n";
 
-                case sycl::aspect::queue_profiling:
-                    std::cout << "\t* supports queue profiling\n";
-                    break;
+            if(device.has(sycl::aspect::queue_profiling))
+                std::cout << "\t* supports queue profiling\n";
 
-                case sycl::aspect::usm_device_allocations:
-                    std::cout << "\t* supports explicit USM allocations\n";
-                    break;
+            if(device.has(sycl::aspect::usm_device_allocations))
+                std::cout << "\t* supports explicit USM allocations\n";
 
-                case sycl::aspect::usm_host_allocations:
-                    std::cout << "\t* can access USM memory allocated by sycl::usm::alloc::host\n";
-                    break;
+            if(device.has(sycl::aspect::usm_host_allocations))
+                std::cout << "\t* can access USM memory allocated by sycl::usm::alloc::host\n";
 
-                case sycl::aspect::usm_atomic_host_allocations:
-                    std::cout << "\t* can access USM memory allocated by sycl::usm::alloc::host atomically\n";
-                    break;
+            if(device.has(sycl::aspect::usm_atomic_host_allocations))
+                std::cout << "\t* can access USM memory allocated by sycl::usm::alloc::host atomically\n";
 
-                case sycl::aspect::usm_shared_allocations:
-                    std::cout << "\t* can access USM memory allocated by sycl::usm::alloc::shared\n";
-                    break;
+            if(device.has(sycl::aspect::usm_shared_allocations))
+                std::cout << "\t* can access USM memory allocated by sycl::usm::alloc::shared\n";
 
-                case sycl::aspect::usm_atomic_shared_allocations:
-                    std::cout << "\t* can access USM memory allocated by sycl::usm::alloc::shared atomically\n";
-                    break;
+            if(device.has(sycl::aspect::usm_atomic_shared_allocations))
+                std::cout << "\t* can access USM memory allocated by sycl::usm::alloc::shared atomically\n";
 
-                case sycl::aspect::usm_system_allocations:
-                    std::cout << "\t* can access memory allocated by the system allocator\n";
-                    break;
-                }
-            }
+            if(device.has(sycl::aspect::usm_system_allocations))
+                std::cout << "\t* can access memory allocated by the system allocator\n";
 
             std::cout << "Available compute units: " << device.get_info<sycl::info::device::max_compute_units>()
                       << '\n';
@@ -323,7 +303,7 @@ namespace alpaka::trait
             std::cout << "Native ISA vector width (float): "
                       << device.get_info<sycl::info::device::native_vector_width_float>() << '\n';
 
-            if(device.has_aspect(sycl::aspect::fp64))
+            if(device.has(sycl::aspect::fp64))
             {
                 std::cout << "Preferred native vector width (double): "
                           << device.get_info<sycl::info::device::preferred_vector_width_double>() << '\n';
@@ -332,7 +312,7 @@ namespace alpaka::trait
                           << device.get_info<sycl::info::device::native_vector_width_double>() << '\n';
             }
 
-            if(device.has_aspect(sycl::aspect::fp16))
+            if(device.has(sycl::aspect::fp16))
             {
                 std::cout << "Preferred native vector width (half): "
                           << device.get_info<sycl::info::device::preferred_vector_width_half>() << '\n';
@@ -349,7 +329,7 @@ namespace alpaka::trait
             std::cout << "Maximum size of memory object allocation: "
                       << device.get_info<sycl::info::device::max_mem_alloc_size>() << " bytes\n";
 
-            if(device.has_aspect(sycl::aspect::image))
+            if(device.has(sycl::aspect::image))
             {
                 std::cout << "Maximum number of simultaneous image object reads per kernel: "
                           << device.get_info<sycl::info::device::max_read_image_args>() << '\n';
@@ -417,7 +397,7 @@ namespace alpaka::trait
                 find_and_print(sycl::info::fp_config::soft_float);
             };
 
-            if(device.has_aspect(sycl::aspect::fp16))
+            if(device.has(sycl::aspect::fp16))
             {
                 auto const fp16_conf = device.get_info<sycl::info::device::half_fp_config>();
                 print_fp_config("Half", fp16_conf);
@@ -426,7 +406,7 @@ namespace alpaka::trait
             auto const fp32_conf = device.get_info<sycl::info::device::single_fp_config>();
             print_fp_config("Single", fp32_conf);
 
-            if(device.has_aspect(sycl::aspect::fp64))
+            if(device.has(sycl::aspect::fp64))
             {
                 auto const fp64_conf = device.get_info<sycl::info::device::double_fp_config>();
                 print_fp_config("Double", fp64_conf);
@@ -458,7 +438,7 @@ namespace alpaka::trait
                           << device.get_info<sycl::info::device::global_mem_cache_line_size>() << " bytes\n";
 
                 std::cout << "Global memory cache size: "
-                          << device.get_info<sycl::info::device::global_mem_cache_size>() / KiB << " KiB\n"
+                          << device.get_info<sycl::info::device::global_mem_cache_size>() / KiB << " KiB\n";
             }
 
             std::cout << "Global memory size: " << device.get_info<sycl::info::device::global_mem_size>() / MiB
@@ -516,6 +496,11 @@ namespace alpaka::trait
                     case sycl::memory_order::seq_cst:
                         std::cout << "seq_cst";
                         break;
+#        if defined(BOOST_COMP_ICPX)
+                    // Stop icpx from complaining about its own internals.
+                    case sycl::memory_order::__consume_unsupported:
+                        break;
+#        endif
                     }
                     std::cout << ", ";
                 }
@@ -526,9 +511,14 @@ namespace alpaka::trait
             auto const mem_orders = device.get_info<sycl::info::device::atomic_memory_order_capabilities>();
             print_memory_orders(mem_orders);
 
+#        if defined(BOOST_COMP_ICPX)
+#            if BOOST_COMP_ICPX >= BOOST_VERSION_NUMBER(53, 2, 0)
+            // Not implemented in oneAPI < 2023.2.0
             std::cout << "Supported memory orderings for sycl::atomic_fence: ";
             auto const fence_orders = device.get_info<sycl::info::device::atomic_fence_order_capabilities>();
             print_memory_orders(fence_orders);
+#            endif
+#        endif
 
             auto print_memory_scopes = [](std::vector<sycl::memory_scope> const& mem_scopes)
             {
@@ -565,9 +555,14 @@ namespace alpaka::trait
             auto const mem_scopes = device.get_info<sycl::info::device::atomic_memory_scope_capabilities>();
             print_memory_scopes(mem_scopes);
 
+#        if defined(BOOST_COMP_ICPX)
+#            if BOOST_COMP_ICPX >= BOOST_VERSION_NUMBER(53, 2, 0)
+            // Not implemented in oneAPI < 2023.2.0
             std::cout << "Supported memory scopes for sycl::atomic_fence: ";
             auto const fence_scopes = device.get_info<sycl::info::device::atomic_fence_scope_capabilities>();
             print_memory_scopes(fence_scopes);
+#            endif
+#        endif
 
             std::cout << "Device timer resolution: "
                       << device.get_info<sycl::info::device::profiling_timer_resolution>() << " ns\n";
@@ -607,6 +602,11 @@ namespace alpaka::trait
                         std::cout << "by affinity domain";
                         has_affinity_domains = true;
                         break;
+#        if defined(BOOST_COMP_ICPX)
+                    case sycl::info::partition_property::ext_intel_partition_by_cslice:
+                        std::cout << "by compute slice (Intel extension; deprecated)";
+                        break;
+#        endif
                     }
                     std::cout << ", ";
                 }
@@ -671,6 +671,12 @@ namespace alpaka::trait
                 case sycl::info::partition_property::partition_by_affinity_domain:
                     std::cout << "partitioned by affinity domain";
                     break;
+
+#        if defined(BOOST_COMP_ICPX)
+                case sycl::info::partition_property::ext_intel_partition_by_cslice:
+                    std::cout << "partitioned by compute slice (Intel extension; deprecated)";
+                    break;
+#        endif
                 }
                 std::cout << '\n';
 
diff --git a/include/alpaka/pltf/PltfGpuSyclIntel.hpp b/include/alpaka/pltf/PltfGpuSyclIntel.hpp
index 4c54e6c6ec1c..9c0e6db1eb97 100644
--- a/include/alpaka/pltf/PltfGpuSyclIntel.hpp
+++ b/include/alpaka/pltf/PltfGpuSyclIntel.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2023 Jan Stephan, Andrea Bocci
+/* Copyright 2023 Jan Stephan, Luca Ferragina, Andrea Bocci
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -12,19 +12,13 @@
 
 #if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_BACKEND_ONEAPI) && defined(ALPAKA_SYCL_ONEAPI_GPU)
 
-#    include <CL/sycl.hpp>
+#    include <sycl/sycl.hpp>
 
 namespace alpaka
 {
     namespace detail
     {
-        // Prevent clang from annoying us with warnings about emitting too many vtables. These are discarded by the
-        // linker anyway.
-#    if BOOST_COMP_CLANG
-#        pragma clang diagnostic push
-#        pragma clang diagnostic ignored "-Wweak-vtables"
-#    endif
-        struct IntelGpuSelector final
+        struct IntelGpuSelector
         {
             auto operator()(sycl::device const& dev) const -> int
             {
@@ -34,9 +28,6 @@ namespace alpaka
                 return is_intel_gpu ? 1 : -1;
             }
         };
-#    if BOOST_COMP_CLANG
-#        pragma clang diagnostic pop
-#    endif
     } // namespace detail
 
     //! The SYCL device manager.
diff --git a/include/alpaka/queue/QueueCpuSyclIntelBlocking.hpp b/include/alpaka/queue/QueueCpuSyclBlocking.hpp
similarity index 59%
rename from include/alpaka/queue/QueueCpuSyclIntelBlocking.hpp
rename to include/alpaka/queue/QueueCpuSyclBlocking.hpp
index 844375913d95..3d561733ecc5 100644
--- a/include/alpaka/queue/QueueCpuSyclIntelBlocking.hpp
+++ b/include/alpaka/queue/QueueCpuSyclBlocking.hpp
@@ -1,17 +1,17 @@
-/* Copyright 2022 Jan Stephan
+/* Copyright 2023 Jan Stephan, Luca Ferragina, Andrea Bocci
  * SPDX-License-Identifier: MPL-2.0
  */
 
 #pragma once
 
-#include "alpaka/dev/DevCpuSyclIntel.hpp"
+#include "alpaka/dev/DevCpuSycl.hpp"
 #include "alpaka/queue/QueueGenericSyclBlocking.hpp"
 
 #if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_BACKEND_ONEAPI) && defined(ALPAKA_SYCL_ONEAPI_CPU)
 
 namespace alpaka
 {
-    using QueueCpuSyclIntelBlocking = QueueGenericSyclBlocking<DevCpuSyclIntel>;
+    using QueueCpuSyclBlocking = QueueGenericSyclBlocking<DevCpuSycl>;
 }
 
 #endif
diff --git a/include/alpaka/queue/QueueCpuSyclIntelNonBlocking.hpp b/include/alpaka/queue/QueueCpuSyclNonBlocking.hpp
similarity index 59%
rename from include/alpaka/queue/QueueCpuSyclIntelNonBlocking.hpp
rename to include/alpaka/queue/QueueCpuSyclNonBlocking.hpp
index 77f20c1acbba..c75f5be45229 100644
--- a/include/alpaka/queue/QueueCpuSyclIntelNonBlocking.hpp
+++ b/include/alpaka/queue/QueueCpuSyclNonBlocking.hpp
@@ -1,17 +1,17 @@
-/* Copyright 2022 Jan Stephan
+/* Copyright 2023 Jan Stephan, Luca Ferragina, Andrea Bocci
  * SPDX-License-Identifier: MPL-2.0
  */
 
 #pragma once
 
-#include "alpaka/dev/DevCpuSyclIntel.hpp"
+#include "alpaka/dev/DevCpuSycl.hpp"
 #include "alpaka/queue/QueueGenericSyclNonBlocking.hpp"
 
 #if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_BACKEND_ONEAPI) && defined(ALPAKA_SYCL_ONEAPI_CPU)
 
 namespace alpaka
 {
-    using QueueCpuSyclIntelNonBlocking = QueueGenericSyclNonBlocking<DevCpuSyclIntel>;
+    using QueueCpuSyclNonBlocking = QueueGenericSyclNonBlocking<DevCpuSycl>;
 }
 
 #endif
diff --git a/include/alpaka/queue/sycl/QueueGenericSyclBase.hpp b/include/alpaka/queue/sycl/QueueGenericSyclBase.hpp
index 297b29732f24..9b08f3de55ca 100644
--- a/include/alpaka/queue/sycl/QueueGenericSyclBase.hpp
+++ b/include/alpaka/queue/sycl/QueueGenericSyclBase.hpp
@@ -1,13 +1,13 @@
-/* Copyright 2022 Jan Stephan, Antonio Di Pilato
+/* Copyright 2023 Jan Stephan, Antonio Di Pilato, Luca Ferragina, Andrea Bocci, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
 #pragma once
 
-
 #include "alpaka/dev/Traits.hpp"
 #include "alpaka/event/Traits.hpp"
 #include "alpaka/queue/Traits.hpp"
+#include "alpaka/traits/Traits.hpp"
 #include "alpaka/wait/Traits.hpp"
 
 #include <algorithm>
@@ -21,7 +21,7 @@
 
 #ifdef ALPAKA_ACC_SYCL_ENABLED
 
-#    include <CL/sycl.hpp>
+#    include <sycl/sycl.hpp>
 
 namespace alpaka::detail
 {
@@ -126,19 +126,24 @@ namespace alpaka::detail
                 clean_dependencies();
 
                 // Execute task
-                m_last_event = m_queue.submit(
-                    [this, &task](sycl::handler& cgh)
-                    {
-                        if(!m_dependencies.empty())
-                            cgh.depends_on(m_dependencies);
-
-                        if constexpr(is_sycl_kernel<TTask>) // Kernel
-                            task(cgh, m_fence_dummy); // Will call cgh.parallel_for internally
-                        else if constexpr(is_sycl_task<TTask>) // Copy / Fill
-                            task(cgh); // Will call cgh.{copy, fill} internally
-                        else // Host
-                            cgh.host_task(task);
-                    });
+                if constexpr(is_sycl_task<TTask> && !is_sycl_kernel<TTask>) // Copy / Fill
+                {
+                    m_last_event = task(m_queue, m_dependencies); // Will call queue.{copy, fill} internally
+                }
+                else
+                {
+                    m_last_event = m_queue.submit(
+                        [this, &task](sycl::handler& cgh)
+                        {
+                            if(!m_dependencies.empty())
+                                cgh.depends_on(m_dependencies);
+
+                            if constexpr(is_sycl_kernel<TTask>) // Kernel
+                                task(cgh, m_fence_dummy); // Will call cgh.parallel_for internally
+                            else // Host
+                                cgh.host_task(task);
+                        });
+                }
 
                 m_dependencies.clear();
             }
@@ -167,16 +172,16 @@ namespace alpaka::detail
     public:
         QueueGenericSyclBase(TDev const& dev)
             : m_dev{dev}
-            , m_impl{std::make_shared<detail::QueueGenericSyclImpl>(
+            , m_spQueueImpl{std::make_shared<detail::QueueGenericSyclImpl>(
                   dev.getNativeHandle().second,
                   dev.getNativeHandle().first)}
         {
-            m_dev.m_impl->register_queue(m_impl);
+            m_dev.m_impl->register_queue(m_spQueueImpl);
         }
 
         friend auto operator==(QueueGenericSyclBase const& lhs, QueueGenericSyclBase const& rhs) -> bool
         {
-            return (lhs.m_dev == rhs.m_dev) && (lhs.m_impl == rhs.m_impl);
+            return (lhs.m_dev == rhs.m_dev) && (lhs.m_spQueueImpl == rhs.m_spQueueImpl);
         }
 
         friend auto operator!=(QueueGenericSyclBase const& lhs, QueueGenericSyclBase const& rhs) -> bool
@@ -186,11 +191,11 @@ namespace alpaka::detail
 
         [[nodiscard]] auto getNativeHandle() const noexcept
         {
-            return m_impl->getNativeHandle();
+            return m_spQueueImpl->getNativeHandle();
         }
 
         TDev m_dev;
-        std::shared_ptr<detail::QueueGenericSyclImpl> m_impl;
+        std::shared_ptr<detail::QueueGenericSyclImpl> m_spQueueImpl;
     };
 } // namespace alpaka::detail
 
@@ -234,7 +239,7 @@ namespace alpaka::trait
         static auto enqueue(detail::QueueGenericSyclBase<TDev, TBlocking>& queue, TTask const& task) -> void
         {
             ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-            queue.m_impl->template enqueue<TBlocking>(task);
+            queue.m_spQueueImpl->template enqueue<TBlocking>(task);
         }
     };
 
@@ -245,7 +250,7 @@ namespace alpaka::trait
         static auto empty(detail::QueueGenericSyclBase<TDev, TBlocking> const& queue) -> bool
         {
             ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-            return queue.m_impl->empty();
+            return queue.m_spQueueImpl->empty();
         }
     };
 
@@ -259,7 +264,7 @@ namespace alpaka::trait
         static auto currentThreadWaitFor(detail::QueueGenericSyclBase<TDev, TBlocking> const& queue) -> void
         {
             ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
-            queue.m_impl->wait();
+            queue.m_spQueueImpl->wait();
         }
     };
 
diff --git a/include/alpaka/rand/RandGenericSycl.hpp b/include/alpaka/rand/RandGenericSycl.hpp
new file mode 100644
index 000000000000..6a85a3a79a5a
--- /dev/null
+++ b/include/alpaka/rand/RandGenericSycl.hpp
@@ -0,0 +1,198 @@
+/* Copyright 2023 Luca Ferragina, Aurora Perego, Jan Stephan, Andrea Bocci
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/core/BoostPredef.hpp"
+#include "alpaka/core/Concepts.hpp"
+#include "alpaka/dev/DevGenericSycl.hpp"
+#include "alpaka/rand/Traits.hpp"
+
+#ifdef ALPAKA_ACC_SYCL_ENABLED
+
+// Backend specific imports.
+#    include <sycl/sycl.hpp>
+#    if BOOST_COMP_CLANG
+#        pragma clang diagnostic push
+#        pragma clang diagnostic ignored "-Wcast-align"
+#        pragma clang diagnostic ignored "-Wcast-qual"
+#        pragma clang diagnostic ignored "-Wextra-semi"
+#        pragma clang diagnostic ignored "-Wfloat-equal"
+#        pragma clang diagnostic ignored "-Wold-style-cast"
+#        pragma clang diagnostic ignored "-Wreserved-identifier"
+#        pragma clang diagnostic ignored "-Wreserved-macro-identifier"
+#        pragma clang diagnostic ignored "-Wsign-compare"
+#        pragma clang diagnostic ignored "-Wundef"
+#    endif
+#    include <oneapi/mkl/rng.hpp>
+
+#    include <oneapi/dpl/random>
+#    if BOOST_COMP_CLANG
+#        pragma clang diagnostic pop
+#    endif
+
+#    include <type_traits>
+
+namespace alpaka::rand
+{
+    //! The SYCL rand implementation.
+    template<typename TDim>
+    struct RandGenericSycl : concepts::Implements<ConceptRand, RandGenericSycl<TDim>>
+    {
+        explicit RandGenericSycl(sycl::nd_item<TDim::value> my_item) : m_item_rand{my_item}
+        {
+        }
+
+        sycl::nd_item<TDim::value> m_item_rand;
+    };
+
+#    if !defined(ALPAKA_HOST_ONLY)
+    namespace distribution::sycl_rand
+    {
+        //! The SYCL random number floating point normal distribution.
+        template<typename T>
+        struct NormalReal;
+
+        //! The SYCL random number uniform distribution.
+        template<typename T>
+        struct Uniform;
+    } // namespace distribution::sycl_rand
+
+    namespace engine::sycl_rand
+    {
+        //! The SYCL linear congruential random number generator engine.
+        template<typename TDim>
+        class Minstd
+        {
+        public:
+            // After calling this constructor the instance is not valid initialized and
+            // need to be overwritten with a valid object
+            Minstd() = default;
+
+            Minstd(RandGenericSycl<TDim> rand, std::uint32_t const& seed)
+            {
+                oneapi::dpl::minstd_rand engine(seed, rand.m_item_rand.get_global_linear_id());
+                rng_engine = engine;
+            }
+
+        private:
+            template<typename T>
+            friend struct distribution::sycl_rand::NormalReal;
+            template<typename T>
+            friend struct distribution::sycl_rand::Uniform;
+
+            oneapi::dpl::minstd_rand rng_engine;
+
+        public:
+            using result_type = float;
+
+            ALPAKA_FN_HOST_ACC static result_type min()
+            {
+                return std::numeric_limits<result_type>::min();
+            }
+            ALPAKA_FN_HOST_ACC static result_type max()
+            {
+                return std::numeric_limits<result_type>::max();
+            }
+            result_type operator()()
+            {
+                oneapi::dpl::uniform_real_distribution<float> distr;
+                return distr(rng_engine);
+            }
+        };
+    } // namespace engine::sycl_rand
+
+    namespace distribution::sycl_rand
+    {
+
+        //! The SYCL random number double normal distribution.
+        template<typename F>
+        struct NormalReal
+        {
+            static_assert(std::is_floating_point_v<F>);
+
+            template<typename TEngine>
+            auto operator()(TEngine& engine) -> F
+            {
+                oneapi::dpl::normal_distribution<F> distr;
+                return distr(engine.rng_engine);
+            }
+        };
+
+        //! The SYCL random number float uniform distribution.
+        template<typename T>
+        struct Uniform
+        {
+            static_assert(std::is_floating_point_v<T> || std::is_unsigned_v<T>);
+
+            template<typename TEngine>
+            auto operator()(TEngine& engine) -> T
+            {
+                if constexpr(std::is_floating_point_v<T>)
+                {
+                    oneapi::dpl::uniform_real_distribution<T> distr;
+                    return distr(engine.rng_engine);
+                }
+                else
+                {
+                    oneapi::dpl::uniform_int_distribution<T> distr;
+                    return distr(engine.rng_engine);
+                }
+            }
+        };
+    } // namespace distribution::sycl_rand
+
+    namespace distribution::trait
+    {
+        //! The SYCL random number float normal distribution get trait specialization.
+        template<typename TDim, typename T>
+        struct CreateNormalReal<RandGenericSycl<TDim>, T, std::enable_if_t<std::is_floating_point_v<T>>>
+        {
+            static auto createNormalReal(RandGenericSycl<TDim> const& /*rand*/) -> sycl_rand::NormalReal<T>
+            {
+                return {};
+            }
+        };
+
+        //! The SYCL random number float uniform distribution get trait specialization.
+        template<typename TDim, typename T>
+        struct CreateUniformReal<RandGenericSycl<TDim>, T, std::enable_if_t<std::is_floating_point_v<T>>>
+        {
+            static auto createUniformReal(RandGenericSycl<TDim> const& /*rand*/) -> sycl_rand::Uniform<T>
+            {
+                return {};
+            }
+        };
+
+        //! The SYCL random number integer uniform distribution get trait specialization.
+        template<typename TDim, typename T>
+        struct CreateUniformUint<RandGenericSycl<TDim>, T, std::enable_if_t<std::is_integral_v<T>>>
+        {
+            static auto createUniformUint(RandGenericSycl<TDim> const& /*rand*/) -> sycl_rand::Uniform<T>
+            {
+                return {};
+            }
+        };
+    } // namespace distribution::trait
+
+    namespace engine::trait
+    {
+        //! The SYCL random number default generator get trait specialization.
+        template<typename TDim>
+        struct CreateDefault<RandGenericSycl<TDim>>
+        {
+            static auto createDefault(
+                RandGenericSycl<TDim> const& rand,
+                std::uint32_t const& seed = 0,
+                std::uint32_t const& /* subsequence */ = 0,
+                std::uint32_t const& /* offset */ = 0) -> sycl_rand::Minstd<TDim>
+            {
+                return {rand, seed};
+            }
+        };
+    } // namespace engine::trait
+#    endif
+} // namespace alpaka::rand
+
+#endif
diff --git a/include/alpaka/standalone/CpuSyclIntel.hpp b/include/alpaka/standalone/CpuSycl.hpp
similarity index 87%
rename from include/alpaka/standalone/CpuSyclIntel.hpp
rename to include/alpaka/standalone/CpuSycl.hpp
index e42b64a2097e..fbdb5c2c481e 100644
--- a/include/alpaka/standalone/CpuSyclIntel.hpp
+++ b/include/alpaka/standalone/CpuSycl.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2022 Jan Stephan
+/* Copyright 2023 Jan Stephan, Andrea Bocci
  * SPDX-License-Identifier: MPL-2.0
  */
 
diff --git a/include/alpaka/test/Check.hpp b/include/alpaka/test/Check.hpp
index 253cb477fe9b..39545e764923 100644
--- a/include/alpaka/test/Check.hpp
+++ b/include/alpaka/test/Check.hpp
@@ -1,31 +1,19 @@
-/* Copyright 2022 Benjamin Worpitz, Jan Stephan
+/* Copyright 2023 Benjamin Worpitz, Jan Stephan, Luca Ferragina, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
 #pragma once
 
+#include "alpaka/core/Sycl.hpp"
+
 #include <cstdio>
 
-// TODO: SYCL doesn't have a way to detect if we're looking at device or host code. This needs a workaround so that
-// SYCL and other back-ends are compatible.
-#ifdef ALPAKA_ACC_SYCL_ENABLED
-#    define ALPAKA_CHECK(success, expression)                                                                         \
-        do                                                                                                            \
-        {                                                                                                             \
-            if(!(expression))                                                                                         \
-            {                                                                                                         \
-                acc.cout << "ALPAKA_CHECK failed because '!(" << #expression << ")'\n";                               \
-                success = false;                                                                                      \
-            }                                                                                                         \
-        } while(0)
-#else
-#    define ALPAKA_CHECK(success, expression)                                                                         \
-        do                                                                                                            \
+#define ALPAKA_CHECK(success, expression)                                                                             \
+    do                                                                                                                \
+    {                                                                                                                 \
+        if(!(expression))                                                                                             \
         {                                                                                                             \
-            if(!(expression))                                                                                         \
-            {                                                                                                         \
-                printf("ALPAKA_CHECK failed because '!(%s)'\n", #expression);                                         \
-                success = false;                                                                                      \
-            }                                                                                                         \
-        } while(0)
-#endif
+            printf("ALPAKA_CHECK failed because '!(%s)'\n", #expression);                                             \
+            success = false;                                                                                          \
+        }                                                                                                             \
+    } while(0)
diff --git a/include/alpaka/test/acc/TestAccs.hpp b/include/alpaka/test/acc/TestAccs.hpp
index 742b40fd8313..62c1e6f7d2c2 100644
--- a/include/alpaka/test/acc/TestAccs.hpp
+++ b/include/alpaka/test/acc/TestAccs.hpp
@@ -79,10 +79,10 @@ namespace alpaka::test
 #endif
 #if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_BACKEND_ONEAPI) && defined(ALPAKA_SYCL_TARGET_CPU)
         template<typename TDim, typename TIdx>
-        using AccCpuSyclIntelIfAvailableElseInt = alpaka::AccCpuSyclIntel<TDim, TIdx>;
+        using AccCpuSyclIfAvailableElseInt = alpaka::AccCpuSycl<TDim, TIdx>;
 #else
         template<typename TDim, typename TIdx>
-        using AccCpuSyclIntelIfAvailableElseInt = int;
+        using AccCpuSyclIfAvailableElseInt = int;
 #endif
 #if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_BACKEND_ONEAPI) && defined(ALPAKA_SYCL_TARGET_FPGA)
         template<typename TDim, typename TIdx>
@@ -109,7 +109,7 @@ namespace alpaka::test
             AccCpuOmp2ThreadsIfAvailableElseInt<TDim, TIdx>,
             AccGpuCudaRtIfAvailableElseInt<TDim, TIdx>,
             AccGpuHipRtIfAvailableElseInt<TDim, TIdx>,
-            AccCpuSyclIntelIfAvailableElseInt<TDim, TIdx>,
+            AccCpuSyclIfAvailableElseInt<TDim, TIdx>,
             AccFpgaSyclIntelIfAvailableElseInt<TDim, TIdx>,
             AccGpuSyclIntelIfAvailableElseInt<TDim, TIdx>>;
     } // namespace detail
diff --git a/include/alpaka/test/dim/TestDims.hpp b/include/alpaka/test/dim/TestDims.hpp
index 6350697fe7b3..395c97e5dcf3 100644
--- a/include/alpaka/test/dim/TestDims.hpp
+++ b/include/alpaka/test/dim/TestDims.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2022 Benjamin Worpitz, Andrea Bocci, Jan Stephan, Bernhard Manfred Gruber
+/* Copyright 2023 Benjamin Worpitz, Andrea Bocci, Jan Stephan, Bernhard Manfred Gruber
  * SPDX-License-Identifier: MPL-2.0
  */
 
diff --git a/include/alpaka/test/event/EventHostManualTrigger.hpp b/include/alpaka/test/event/EventHostManualTrigger.hpp
index d465555c0dc9..7016f45e6c45 100644
--- a/include/alpaka/test/event/EventHostManualTrigger.hpp
+++ b/include/alpaka/test/event/EventHostManualTrigger.hpp
@@ -1,4 +1,5 @@
-/* Copyright 2023 Benjamin Worpitz, Matthias Werner, Jan Stephan, Andrea Bocci, Bernhard Manfred Gruber
+/* Copyright 2023 Benjamin Worpitz, Matthias Werner, Jan Stephan, Jeffrey Kelling, Andrea Bocci,
+ *                Bernhard Manfred Gruber
  * SPDX-License-Identifier: MPL-2.0
  */
 
diff --git a/include/alpaka/test/queue/Queue.hpp b/include/alpaka/test/queue/Queue.hpp
index 07d8495da8df..7cb3492cfc45 100644
--- a/include/alpaka/test/queue/Queue.hpp
+++ b/include/alpaka/test/queue/Queue.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2023 Benjamin Worpitz, Matthias Werner, Bernhard Manfred Gruber, Jan Stephan
+/* Copyright 2023 Benjamin Worpitz, Matthias Werner, René Widera, Bernhard Manfred Gruber, Jan Stephan, Andrea Bocci
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -86,23 +86,23 @@ namespace alpaka::test
 #        ifdef ALPAKA_SYCL_ONEAPI_CPU
         //! The default queue type trait specialization for the Intel CPU device.
         template<>
-        struct DefaultQueueType<alpaka::DevCpuSyclIntel>
+        struct DefaultQueueType<alpaka::DevCpuSycl>
         {
 #            if(ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL)
-            using type = alpaka::QueueCpuSyclIntelBlocking;
+            using type = alpaka::QueueCpuSyclBlocking;
 #            else
-            using type = alpaka::QueueCpuSyclIntelNonBlocking;
+            using type = alpaka::QueueCpuSyclNonBlocking;
 #            endif
         };
 
         template<>
-        struct IsBlockingQueue<alpaka::QueueCpuSyclIntelBlocking>
+        struct IsBlockingQueue<alpaka::QueueCpuSyclBlocking>
         {
             static constexpr auto value = true;
         };
 
         template<>
-        struct IsBlockingQueue<alpaka::QueueCpuSyclIntelNonBlocking>
+        struct IsBlockingQueue<alpaka::QueueCpuSyclNonBlocking>
         {
             static constexpr auto value = false;
         };
@@ -180,8 +180,8 @@ namespace alpaka::test
 #    ifdef ALPAKA_SYCL_BACKEND_ONEAPI
 #        ifdef ALPAKA_SYCL_ONEAPI_CPU
         ,
-        std::tuple<alpaka::DevCpuSyclIntel, alpaka::QueueCpuSyclIntelBlocking>,
-        std::tuple<alpaka::DevCpuSyclIntel, alpaka::QueueCpuSyclIntelNonBlocking>
+        std::tuple<alpaka::DevCpuSycl, alpaka::QueueCpuSyclBlocking>,
+        std::tuple<alpaka::DevCpuSycl, alpaka::QueueCpuSyclNonBlocking>
 #        endif
 #        ifdef ALPAKA_SYCL_ONEAPI_FPGA
         ,
diff --git a/include/alpaka/warp/WarpGenericSycl.hpp b/include/alpaka/warp/WarpGenericSycl.hpp
index 33ac1884deb2..b2b6aa8328b5 100644
--- a/include/alpaka/warp/WarpGenericSycl.hpp
+++ b/include/alpaka/warp/WarpGenericSycl.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2022 Jan Stephan
+/* Copyright 2023 Jan Stephan, Luca Ferragina, Andrea Bocci, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -10,7 +10,7 @@
 
 #ifdef ALPAKA_ACC_SYCL_ENABLED
 
-#    include <CL/sycl.hpp>
+#    include <sycl/sycl.hpp>
 
 namespace alpaka::warp
 {
@@ -19,11 +19,11 @@ namespace alpaka::warp
     class WarpGenericSycl : public concepts::Implements<alpaka::warp::ConceptWarp, WarpGenericSycl<TDim>>
     {
     public:
-        WarpGenericSycl(sycl::nd_item<TDim::value> my_item) : m_item{my_item}
+        WarpGenericSycl(sycl::nd_item<TDim::value> my_item) : m_item_warp{my_item}
         {
         }
 
-        sycl::nd_item<TDim::value> m_item;
+        sycl::nd_item<TDim::value> m_item_warp;
     };
 } // namespace alpaka::warp
 
@@ -34,21 +34,30 @@ namespace alpaka::warp::trait
     {
         static auto getSize(warp::WarpGenericSycl<TDim> const& warp) -> std::int32_t
         {
-            auto const sub_group = warp.m_item.get_sub_group();
+            auto const sub_group = warp.m_item_warp.get_sub_group();
             // SYCL sub-groups are always 1D
-            return static_cast<std::int32_t>(sub_group.get_local_linear_range());
+            return static_cast<std::int32_t>(sub_group.get_max_local_range()[0]);
         }
     };
 
     template<typename TDim>
     struct Activemask<warp::WarpGenericSycl<TDim>>
     {
+        // FIXME This should be std::uint64_t on AMD GCN architectures and on CPU,
+        // but the former is not targeted in alpaka and CPU case is not supported in SYCL yet.
+        // Restrict to warpSize <= 32 for now.
         static auto activemask(warp::WarpGenericSycl<TDim> const& warp) -> std::uint32_t
         {
             // SYCL has no way of querying this. Since sub-group functions have to be executed in convergent code
             // regions anyway we return the full mask.
-            auto const sub_group = warp.m_item.get_sub_group();
-            return sycl::ext::oneapi::group_ballot(sub_group, true);
+            auto const sub_group = warp.m_item_warp.get_sub_group();
+            auto const mask = sycl::ext::oneapi::group_ballot(sub_group, true);
+            // FIXME This should be std::uint64_t on AMD GCN architectures and on CPU,
+            // but the former is not targeted in alpaka and CPU case is not supported in SYCL yet.
+            // Restrict to warpSize <= 32 for now.
+            std::uint32_t bits = 0;
+            mask.extract_bits(bits);
+            return bits;
         }
     };
 
@@ -57,7 +66,7 @@ namespace alpaka::warp::trait
     {
         static auto all(warp::WarpGenericSycl<TDim> const& warp, std::int32_t predicate) -> std::int32_t
         {
-            auto const sub_group = warp.m_item.get_sub_group();
+            auto const sub_group = warp.m_item_warp.get_sub_group();
             return static_cast<std::int32_t>(sycl::all_of_group(sub_group, static_cast<bool>(predicate)));
         }
     };
@@ -67,7 +76,7 @@ namespace alpaka::warp::trait
     {
         static auto any(warp::WarpGenericSycl<TDim> const& warp, std::int32_t predicate) -> std::int32_t
         {
-            auto const sub_group = warp.m_item.get_sub_group();
+            auto const sub_group = warp.m_item_warp.get_sub_group();
             return static_cast<std::int32_t>(sycl::any_of_group(sub_group, static_cast<bool>(predicate)));
         }
     };
@@ -75,10 +84,19 @@ namespace alpaka::warp::trait
     template<typename TDim>
     struct Ballot<warp::WarpGenericSycl<TDim>>
     {
-        static auto ballot(warp::WarpGenericSycl<TDim> const& warp, std::int32_t predicate)
+        // FIXME This should be std::uint64_t on AMD GCN architectures and on CPU,
+        // but the former is not targeted in alpaka and CPU case is not supported in SYCL yet.
+        // Restrict to warpSize <= 32 for now.
+        static auto ballot(warp::WarpGenericSycl<TDim> const& warp, std::int32_t predicate) -> std::uint32_t
         {
-            auto const sub_group = warp.m_item.get_sub_group();
-            return sycl::ext::oneapi::group_ballot(sub_group, static_cast<bool>(predicate));
+            auto const sub_group = warp.m_item_warp.get_sub_group();
+            auto const mask = sycl::ext::oneapi::group_ballot(sub_group, static_cast<bool>(predicate));
+            // FIXME This should be std::uint64_t on AMD GCN architectures and on CPU,
+            // but the former is not targeted in alpaka and CPU case is not supported in SYCL yet.
+            // Restrict to warpSize <= 32 for now.
+            std::uint32_t bits = 0;
+            mask.extract_bits(bits);
+            return bits;
         }
     };
 
@@ -88,21 +106,20 @@ namespace alpaka::warp::trait
         template<typename T>
         static auto shfl(warp::WarpGenericSycl<TDim> const& warp, T value, std::int32_t srcLane, std::int32_t width)
         {
+            ALPAKA_ASSERT_OFFLOAD(width > 0);
+            ALPAKA_ASSERT_OFFLOAD(srcLane < width);
+            ALPAKA_ASSERT_OFFLOAD(srcLane >= 0);
+
             /* If width < srcLane the sub-group needs to be split into assumed subdivisions. The first item of each
                subdivision has the assumed index 0. The srcLane index is relative to the subdivisions.
 
                Example: If we assume a sub-group size of 32 and a width of 16 we will receive two subdivisions:
                The first starts at sub-group index 0 and the second at sub-group index 16. For srcLane = 4 the
                first subdivision will access the value at sub-group index 4 and the second at sub-group index 20. */
-            auto const actual_group = warp.m_item.get_sub_group();
-            auto const actual_item_id = actual_group.get_local_linear_id();
-
-            auto const assumed_group_id = actual_item_id / width;
-            auto const assumed_item_id = actual_item_id % width;
-
-            auto const assumed_src_id = static_cast<std::size_t>(srcLane % width);
-            auto const actual_src_id = assumed_src_id + assumed_group_id * width;
-
+            auto const actual_group = warp.m_item_warp.get_sub_group();
+            auto const actual_item_id = static_cast<std::int32_t>(actual_group.get_local_linear_id());
+            auto const actual_group_id = actual_item_id / width;
+            auto const actual_src_id = static_cast<std::size_t>(srcLane + actual_group_id * width);
             auto const src = sycl::id<1>{actual_src_id};
 
             return sycl::select_from_group(actual_group, value, src);
diff --git a/include/alpaka/workdiv/WorkDivGenericSycl.hpp b/include/alpaka/workdiv/WorkDivGenericSycl.hpp
index c6d1c114a3c9..26e00750e42d 100644
--- a/include/alpaka/workdiv/WorkDivGenericSycl.hpp
+++ b/include/alpaka/workdiv/WorkDivGenericSycl.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2022 Jan Stephan
+/* Copyright 2023 Jan Stephan, Luca Ferragina, Andrea Bocci, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -10,7 +10,7 @@
 
 #ifdef ALPAKA_ACC_SYCL_ENABLED
 
-#    include <CL/sycl.hpp>
+#    include <sycl/sycl.hpp>
 
 namespace alpaka
 {
@@ -18,17 +18,19 @@ namespace alpaka
     template<typename TDim, typename TIdx>
     class WorkDivGenericSycl : public concepts::Implements<ConceptWorkDiv, WorkDivGenericSycl<TDim, TIdx>>
     {
+        static_assert(TDim::value > 0, "The SYCL work division must have a dimension greater than zero.");
+
     public:
         using WorkDivBase = WorkDivGenericSycl;
 
         WorkDivGenericSycl(Vec<TDim, TIdx> const& threadElemExtent, sycl::nd_item<TDim::value> work_item)
             : m_threadElemExtent{threadElemExtent}
-            , my_item{work_item}
+            , m_item_workdiv{work_item}
         {
         }
 
         Vec<TDim, TIdx> const& m_threadElemExtent;
-        sycl::nd_item<TDim::value> my_item;
+        sycl::nd_item<TDim::value> m_item_workdiv;
     };
 } // namespace alpaka
 
@@ -55,20 +57,22 @@ namespace alpaka::trait
         //! \return The number of blocks in each dimension of the grid.
         static auto getWorkDiv(WorkDivGenericSycl<TDim, TIdx> const& workDiv) -> Vec<TDim, TIdx>
         {
-            if constexpr(TDim::value == 1)
-                return Vec<TDim, TIdx>{static_cast<TIdx>(workDiv.my_item.get_group_range(0))};
+            if constexpr(TDim::value == 0)
+                return Vec<TDim, TIdx>{};
+            else if constexpr(TDim::value == 1)
+                return Vec<TDim, TIdx>{static_cast<TIdx>(workDiv.m_item_workdiv.get_group_range(0))};
             else if constexpr(TDim::value == 2)
             {
                 return Vec<TDim, TIdx>{
-                    static_cast<TIdx>(workDiv.my_item.get_group_range(1)),
-                    static_cast<TIdx>(workDiv.my_item.get_group_range(0))};
+                    static_cast<TIdx>(workDiv.m_item_workdiv.get_group_range(1)),
+                    static_cast<TIdx>(workDiv.m_item_workdiv.get_group_range(0))};
             }
             else
             {
                 return Vec<TDim, TIdx>{
-                    static_cast<TIdx>(workDiv.my_item.get_group_range(2)),
-                    static_cast<TIdx>(workDiv.my_item.get_group_range(1)),
-                    static_cast<TIdx>(workDiv.my_item.get_group_range(0))};
+                    static_cast<TIdx>(workDiv.m_item_workdiv.get_group_range(2)),
+                    static_cast<TIdx>(workDiv.m_item_workdiv.get_group_range(1)),
+                    static_cast<TIdx>(workDiv.m_item_workdiv.get_group_range(0))};
             }
         }
     };
@@ -80,20 +84,22 @@ namespace alpaka::trait
         //! \return The number of threads in each dimension of a block.
         static auto getWorkDiv(WorkDivGenericSycl<TDim, TIdx> const& workDiv) -> Vec<TDim, TIdx>
         {
-            if constexpr(TDim::value == 1)
-                return Vec<TDim, TIdx>{static_cast<TIdx>(workDiv.my_item.get_local_range(0))};
+            if constexpr(TDim::value == 0)
+                return Vec<TDim, TIdx>{};
+            else if constexpr(TDim::value == 1)
+                return Vec<TDim, TIdx>{static_cast<TIdx>(workDiv.m_item_workdiv.get_local_range(0))};
             else if constexpr(TDim::value == 2)
             {
                 return Vec<TDim, TIdx>{
-                    static_cast<TIdx>(workDiv.my_item.get_local_range(1)),
-                    static_cast<TIdx>(workDiv.my_item.get_local_range(0))};
+                    static_cast<TIdx>(workDiv.m_item_workdiv.get_local_range(1)),
+                    static_cast<TIdx>(workDiv.m_item_workdiv.get_local_range(0))};
             }
             else
             {
                 return Vec<TDim, TIdx>{
-                    static_cast<TIdx>(workDiv.my_item.get_local_range(2)),
-                    static_cast<TIdx>(workDiv.my_item.get_local_range(1)),
-                    static_cast<TIdx>(workDiv.my_item.get_local_range(0))};
+                    static_cast<TIdx>(workDiv.m_item_workdiv.get_local_range(2)),
+                    static_cast<TIdx>(workDiv.m_item_workdiv.get_local_range(1)),
+                    static_cast<TIdx>(workDiv.m_item_workdiv.get_local_range(0))};
             }
         }
     };
@@ -102,7 +108,7 @@ namespace alpaka::trait
     template<typename TDim, typename TIdx>
     struct GetWorkDiv<WorkDivGenericSycl<TDim, TIdx>, origin::Thread, unit::Elems>
     {
-        //! \return The number of blocks in each dimension of the grid.
+        //! \return The number of elements in each dimension of the thread.
         static auto getWorkDiv(WorkDivGenericSycl<TDim, TIdx> const& workDiv) -> Vec<TDim, TIdx>
         {
             return workDiv.m_threadElemExtent;
diff --git a/test/unit/acc/src/AccTagTest.cpp b/test/unit/acc/src/AccTagTest.cpp
index f2d5f58ee45e..741f2e17385b 100644
--- a/test/unit/acc/src/AccTagTest.cpp
+++ b/test/unit/acc/src/AccTagTest.cpp
@@ -1,4 +1,4 @@
-/* Copyright 2023 Simeon Ehrig, Jan Stephan
+/* Copyright 2023 Simeon Ehrig, Jan Stephan, Andrea Bocci
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -23,7 +23,7 @@ using TagList = std::tuple<
     alpaka::TagCpuOmp2Threads,
     alpaka::TagGpuCudaRt,
     alpaka::TagGpuHipRt,
-    alpaka::TagCpuSyclIntel,
+    alpaka::TagCpuSycl,
     alpaka::TagFpgaSyclIntel,
     alpaka::TagGpuSyclIntel>;
 
@@ -35,7 +35,7 @@ using AccToTagMap = std::tuple<
     std::pair<alpaka::test::detail::AccCpuOmp2ThreadsIfAvailableElseInt<Dim, Idx>, alpaka::TagCpuOmp2Threads>,
     std::pair<alpaka::test::detail::AccGpuCudaRtIfAvailableElseInt<Dim, Idx>, alpaka::TagGpuCudaRt>,
     std::pair<alpaka::test::detail::AccGpuHipRtIfAvailableElseInt<Dim, Idx>, alpaka::TagGpuHipRt>,
-    std::pair<alpaka::test::detail::AccCpuSyclIntelIfAvailableElseInt<Dim, Idx>, alpaka::TagCpuSyclIntel>,
+    std::pair<alpaka::test::detail::AccCpuSyclIfAvailableElseInt<Dim, Idx>, alpaka::TagCpuSycl>,
     std::pair<alpaka::test::detail::AccFpgaSyclIntelIfAvailableElseInt<Dim, Idx>, alpaka::TagFpgaSyclIntel>,
     std::pair<alpaka::test::detail::AccGpuSyclIntelIfAvailableElseInt<Dim, Idx>, alpaka::TagGpuSyclIntel>>;
 
diff --git a/test/unit/atomic/src/AtomicTest.cpp b/test/unit/atomic/src/AtomicTest.cpp
index b04fd281af6c..467dab651741 100644
--- a/test/unit/atomic/src/AtomicTest.cpp
+++ b/test/unit/atomic/src/AtomicTest.cpp
@@ -1,4 +1,5 @@
-/* Copyright 2023 Axel Hübl, Benjamin Worpitz, Matthias Werner, Jan Stephan, Bernhard Manfred Gruber, Antonio Di Pilato
+/* Copyright 2023 Axel Hübl, Benjamin Worpitz, Matthias Werner, Sergei Bastrakov, René Widera, Jan Stephan,
+ *                Bernhard Manfred Gruber, Antonio Di Pilato, Andrea Bocci
  * SPDX-License-Identifier: MPL-2.0
  */
 
diff --git a/test/unit/mem/view/src/ViewStaticAccMem.cpp b/test/unit/mem/view/src/ViewStaticAccMem.cpp
index f26a88fdd8a0..ff8f0e48b245 100644
--- a/test/unit/mem/view/src/ViewStaticAccMem.cpp
+++ b/test/unit/mem/view/src/ViewStaticAccMem.cpp
@@ -1,4 +1,4 @@
-/* Copyright 2022 Axel Huebl, Benjamin Worpitz, Matthias Werner, Bernhard Manfred Gruber, Jan Stephan
+/* Copyright 2023 Axel Huebl, Benjamin Worpitz, Matthias Werner, Bernhard Manfred Gruber, Jan Stephan, Andrea Bocci
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -16,6 +16,8 @@ using Elem = std::uint32_t;
 using Dim = alpaka::DimInt<2u>;
 using Idx = std::uint32_t;
 
+#if !defined(ALPAKA_ACC_SYCL_ENABLED)
+
 // These forward declarations are only necessary when you want to access those variables
 // from a different compilation unit and should be moved to a common header.
 // Here they are used to silence clang`s -Wmissing-variable-declarations warning
@@ -40,10 +42,13 @@ struct StaticDeviceMemoryTestKernel
     }
 };
 
+#endif // !defined(ALPAKA_ACC_SYCL_ENABLED)
+
 using TestAccs = alpaka::test::EnabledAccs<Dim, Idx>;
 
 TEMPLATE_LIST_TEST_CASE("staticDeviceMemoryGlobal", "[viewStaticAccMem]", TestAccs)
 {
+#if !defined(ALPAKA_ACC_SYCL_ENABLED)
     using Acc = TestType;
     using DevAcc = alpaka::Dev<Acc>;
 
@@ -75,8 +80,16 @@ TEMPLATE_LIST_TEST_CASE("staticDeviceMemoryGlobal", "[viewStaticAccMem]", TestAc
 
         REQUIRE(fixture(kernel, alpaka::getPtrNative(viewConstantMemUninitialized)));
     }
+
+#else // !defined(ALPAKA_ACC_SYCL_ENABLED)
+
+    WARN("The SYCL backend does not support global device variables.");
+
+#endif // !defined(ALPAKA_ACC_SYCL_ENABLED)
 }
 
+#if !defined(ALPAKA_ACC_SYCL_ENABLED)
+
 // These forward declarations are only necessary when you want to access those variables
 // from a different compilation unit and should be moved to a common header.
 // Here they are used to silence clang`s -Wmissing-variable-declarations warning
@@ -84,8 +97,11 @@ TEMPLATE_LIST_TEST_CASE("staticDeviceMemoryGlobal", "[viewStaticAccMem]", TestAc
 extern ALPAKA_STATIC_ACC_MEM_GLOBAL Elem g_globalMemory2DUninitialized[3][2];
 ALPAKA_STATIC_ACC_MEM_GLOBAL Elem g_globalMemory2DUninitialized[3][2];
 
+#endif // !defined(ALPAKA_ACC_SYCL_ENABLED)
+
 TEMPLATE_LIST_TEST_CASE("staticDeviceMemoryConstant", "[viewStaticAccMem]", TestAccs)
 {
+#if !defined(ALPAKA_ACC_SYCL_ENABLED)
     using Acc = TestType;
     using DevAcc = alpaka::Dev<Acc>;
 
@@ -117,4 +133,10 @@ TEMPLATE_LIST_TEST_CASE("staticDeviceMemoryConstant", "[viewStaticAccMem]", Test
 
         REQUIRE(fixture(kernel, alpaka::getPtrNative(viewGlobalMemUninitialized)));
     }
+
+#else // !defined(ALPAKA_ACC_SYCL_ENABLED)
+
+    WARN("The SYCL backend does not support global device constants.");
+
+#endif // !defined(ALPAKA_ACC_SYCL_ENABLED)
 }
diff --git a/test/unit/rand/src/RandTest.cpp b/test/unit/rand/src/RandTest.cpp
index 7bea6e930ea6..05311d438965 100644
--- a/test/unit/rand/src/RandTest.cpp
+++ b/test/unit/rand/src/RandTest.cpp
@@ -1,5 +1,5 @@
 /* Copyright 2023 Axel Hübl, Benjamin Worpitz, Matthias Werner, René Widera, Jan Stephan, Bernhard Manfred Gruber,
- *                Sergei Bastrakov
+ *                Sergei Bastrakov, Andrea Bocci
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -60,7 +60,7 @@ class RandTestKernel
         auto genDefault = alpaka::rand::engine::createDefault(acc, 12345u, 6789u);
         genNumbers(acc, success, genDefault);
 
-#if !defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !defined(ALPAKA_ACC_GPU_HIP_ENABLED)
+#if !defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !defined(ALPAKA_ACC_SYCL_ENABLED)
         // TODO: These ifdefs are wrong: They will reduce the test to the
         // smallest common denominator from all enabled backends
         // std::random_device
diff --git a/test/unit/warp/src/Activemask.cpp b/test/unit/warp/src/Activemask.cpp
index afc492ffc9a7..8038606a4116 100644
--- a/test/unit/warp/src/Activemask.cpp
+++ b/test/unit/warp/src/Activemask.cpp
@@ -1,4 +1,4 @@
-/* Copyright 2023 Sergei Bastrakov, Bernhard Manfred Gruber, Jan Stephan
+/* Copyright 2023 Sergei Bastrakov, Bernhard Manfred Gruber, Jan Stephan, Andrea Bocci, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -25,6 +25,7 @@ struct ActivemaskSingleThreadWarpTestKernel
     }
 };
 
+template<std::uint32_t TWarpSize>
 struct ActivemaskMultipleThreadWarpTestKernel
 {
     ALPAKA_NO_HOST_ACC_WARNING
@@ -53,6 +54,12 @@ struct ActivemaskMultipleThreadWarpTestKernel
     }
 };
 
+template<std::uint32_t TWarpSize, typename TAcc>
+struct alpaka::trait::WarpSize<ActivemaskMultipleThreadWarpTestKernel<TWarpSize>, TAcc>
+    : std::integral_constant<std::uint32_t, TWarpSize>
+{
+};
+
 TEMPLATE_LIST_TEST_CASE("activemask", "[warp]", alpaka::test::TestAccs)
 {
     using Acc = TestType;
@@ -68,7 +75,7 @@ TEMPLATE_LIST_TEST_CASE("activemask", "[warp]", alpaka::test::TestAccs)
         if(scalar)
         {
             alpaka::test::KernelExecutionFixture<Acc> fixture(alpaka::Vec<Dim, Idx>::all(4));
-            REQUIRE(fixture(ActivemaskSingleThreadWarpTestKernel{}));
+            CHECK(fixture(ActivemaskSingleThreadWarpTestKernel{}));
         }
         else
         {
@@ -80,9 +87,41 @@ TEMPLATE_LIST_TEST_CASE("activemask", "[warp]", alpaka::test::TestAccs)
             auto const threadElementExtent = alpaka::Vec<Dim, Idx>::ones();
             auto workDiv = typename ExecutionFixture::WorkDiv{gridBlockExtent, blockThreadExtent, threadElementExtent};
             auto fixture = ExecutionFixture{workDiv};
-            ActivemaskMultipleThreadWarpTestKernel kernel;
-            for(auto inactiveThreadIdx = 0u; inactiveThreadIdx < warpExtent; inactiveThreadIdx++)
-                REQUIRE(fixture(kernel, inactiveThreadIdx));
+            if(warpExtent == 4)
+            {
+                for(auto inactiveThreadIdx = 0u; inactiveThreadIdx < warpExtent; inactiveThreadIdx++)
+                {
+                    CHECK(fixture(ActivemaskMultipleThreadWarpTestKernel<4>{}, inactiveThreadIdx));
+                }
+            }
+            else if(warpExtent == 8)
+            {
+                for(auto inactiveThreadIdx = 0u; inactiveThreadIdx < warpExtent; inactiveThreadIdx++)
+                {
+                    CHECK(fixture(ActivemaskMultipleThreadWarpTestKernel<8>{}, inactiveThreadIdx));
+                }
+            }
+            else if(warpExtent == 16)
+            {
+                for(auto inactiveThreadIdx = 0u; inactiveThreadIdx < warpExtent; inactiveThreadIdx++)
+                {
+                    CHECK(fixture(ActivemaskMultipleThreadWarpTestKernel<16>{}, inactiveThreadIdx));
+                }
+            }
+            else if(warpExtent == 32)
+            {
+                for(auto inactiveThreadIdx = 0u; inactiveThreadIdx < warpExtent; inactiveThreadIdx++)
+                {
+                    CHECK(fixture(ActivemaskMultipleThreadWarpTestKernel<32>{}, inactiveThreadIdx));
+                }
+            }
+            else if(warpExtent == 64)
+            {
+                for(auto inactiveThreadIdx = 0u; inactiveThreadIdx < warpExtent; inactiveThreadIdx++)
+                {
+                    CHECK(fixture(ActivemaskMultipleThreadWarpTestKernel<64>{}, inactiveThreadIdx));
+                }
+            }
         }
     }
 }
diff --git a/test/unit/warp/src/All.cpp b/test/unit/warp/src/All.cpp
index 4ea233e3179b..3a61553496be 100644
--- a/test/unit/warp/src/All.cpp
+++ b/test/unit/warp/src/All.cpp
@@ -1,4 +1,4 @@
-/* Copyright 2023 Sergei Bastrakov, Bernhard Manfred Gruber, Jan Stephan
+/* Copyright 2023 Sergei Bastrakov, Bernhard Manfred Gruber, Jan Stephan, Andrea Bocci, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -25,6 +25,7 @@ struct AllSingleThreadWarpTestKernel
     }
 };
 
+template<std::uint32_t TWarpSize>
 struct AllMultipleThreadWarpTestKernel
 {
     ALPAKA_NO_HOST_ACC_WARNING
@@ -57,6 +58,12 @@ struct AllMultipleThreadWarpTestKernel
     }
 };
 
+template<std::uint32_t TWarpSize, typename TAcc>
+struct alpaka::trait::WarpSize<AllMultipleThreadWarpTestKernel<TWarpSize>, TAcc>
+    : std::integral_constant<std::uint32_t, TWarpSize>
+{
+};
+
 TEMPLATE_LIST_TEST_CASE("all", "[warp]", alpaka::test::TestAccs)
 {
     using Acc = TestType;
@@ -84,8 +91,26 @@ TEMPLATE_LIST_TEST_CASE("all", "[warp]", alpaka::test::TestAccs)
             auto const threadElementExtent = alpaka::Vec<Dim, Idx>::ones();
             auto workDiv = typename ExecutionFixture::WorkDiv{gridBlockExtent, blockThreadExtent, threadElementExtent};
             auto fixture = ExecutionFixture{workDiv};
-            AllMultipleThreadWarpTestKernel kernel;
-            REQUIRE(fixture(kernel));
+            if(warpExtent == 4)
+            {
+                REQUIRE(fixture(AllMultipleThreadWarpTestKernel<4>{}));
+            }
+            else if(warpExtent == 8)
+            {
+                REQUIRE(fixture(AllMultipleThreadWarpTestKernel<8>{}));
+            }
+            else if(warpExtent == 16)
+            {
+                REQUIRE(fixture(AllMultipleThreadWarpTestKernel<16>{}));
+            }
+            else if(warpExtent == 32)
+            {
+                REQUIRE(fixture(AllMultipleThreadWarpTestKernel<32>{}));
+            }
+            else if(warpExtent == 64)
+            {
+                REQUIRE(fixture(AllMultipleThreadWarpTestKernel<64>{}));
+            }
         }
     }
 }
diff --git a/test/unit/warp/src/Any.cpp b/test/unit/warp/src/Any.cpp
index 72b247ecfe63..0f5a059d1c48 100644
--- a/test/unit/warp/src/Any.cpp
+++ b/test/unit/warp/src/Any.cpp
@@ -1,4 +1,4 @@
-/* Copyright 2023 Sergei Bastrakov, Bernhard Manfred Gruber, Jan Stephan, Aurora Perego
+/* Copyright 2023 Sergei Bastrakov, Bernhard Manfred Gruber, Jan Stephan, Andrea Bocci, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -25,6 +25,7 @@ struct AnySingleThreadWarpTestKernel
     }
 };
 
+template<std::uint32_t TWarpSize>
 struct AnyMultipleThreadWarpTestKernel
 {
     ALPAKA_NO_HOST_ACC_WARNING
@@ -57,6 +58,12 @@ struct AnyMultipleThreadWarpTestKernel
     }
 };
 
+template<std::uint32_t TWarpSize, typename TAcc>
+struct alpaka::trait::WarpSize<AnyMultipleThreadWarpTestKernel<TWarpSize>, TAcc>
+    : std::integral_constant<std::uint32_t, TWarpSize>
+{
+};
+
 TEMPLATE_LIST_TEST_CASE("any", "[warp]", alpaka::test::TestAccs)
 {
     using Acc = TestType;
@@ -84,8 +91,26 @@ TEMPLATE_LIST_TEST_CASE("any", "[warp]", alpaka::test::TestAccs)
             auto const threadElementExtent = alpaka::Vec<Dim, Idx>::ones();
             auto workDiv = typename ExecutionFixture::WorkDiv{gridBlockExtent, blockThreadExtent, threadElementExtent};
             auto fixture = ExecutionFixture{workDiv};
-            AnyMultipleThreadWarpTestKernel kernel;
-            REQUIRE(fixture(kernel));
+            if(warpExtent == 4)
+            {
+                REQUIRE(fixture(AnyMultipleThreadWarpTestKernel<4>{}));
+            }
+            else if(warpExtent == 8)
+            {
+                REQUIRE(fixture(AnyMultipleThreadWarpTestKernel<8>{}));
+            }
+            else if(warpExtent == 16)
+            {
+                REQUIRE(fixture(AnyMultipleThreadWarpTestKernel<16>{}));
+            }
+            else if(warpExtent == 32)
+            {
+                REQUIRE(fixture(AnyMultipleThreadWarpTestKernel<32>{}));
+            }
+            else if(warpExtent == 64)
+            {
+                REQUIRE(fixture(AnyMultipleThreadWarpTestKernel<64>{}));
+            }
         }
     }
 }
diff --git a/test/unit/warp/src/Ballot.cpp b/test/unit/warp/src/Ballot.cpp
index cf5b59d79445..0525928c275c 100644
--- a/test/unit/warp/src/Ballot.cpp
+++ b/test/unit/warp/src/Ballot.cpp
@@ -1,4 +1,4 @@
-/* Copyright 2023 Sergei Bastrakov, Bernhard Manfred Gruber, Jan Stephan
+/* Copyright 2023 Sergei Bastrakov, Bernhard Manfred Gruber, Jan Stephan, Andrea Bocci, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -26,6 +26,7 @@ struct BallotSingleThreadWarpTestKernel
     }
 };
 
+template<std::uint32_t TWarpSize>
 struct BallotMultipleThreadWarpTestKernel
 {
     ALPAKA_NO_HOST_ACC_WARNING
@@ -65,6 +66,12 @@ struct BallotMultipleThreadWarpTestKernel
     }
 };
 
+template<std::uint32_t TWarpSize, typename TAcc>
+struct alpaka::trait::WarpSize<BallotMultipleThreadWarpTestKernel<TWarpSize>, TAcc>
+    : std::integral_constant<std::uint32_t, TWarpSize>
+{
+};
+
 TEMPLATE_LIST_TEST_CASE("ballot", "[warp]", alpaka::test::TestAccs)
 {
     using Acc = TestType;
@@ -92,8 +99,26 @@ TEMPLATE_LIST_TEST_CASE("ballot", "[warp]", alpaka::test::TestAccs)
             auto const threadElementExtent = alpaka::Vec<Dim, Idx>::ones();
             auto workDiv = typename ExecutionFixture::WorkDiv{gridBlockExtent, blockThreadExtent, threadElementExtent};
             auto fixture = ExecutionFixture{workDiv};
-            BallotMultipleThreadWarpTestKernel kernel;
-            REQUIRE(fixture(kernel));
+            if(warpExtent == 4)
+            {
+                REQUIRE(fixture(BallotMultipleThreadWarpTestKernel<4>{}));
+            }
+            else if(warpExtent == 8)
+            {
+                REQUIRE(fixture(BallotMultipleThreadWarpTestKernel<8>{}));
+            }
+            else if(warpExtent == 16)
+            {
+                REQUIRE(fixture(BallotMultipleThreadWarpTestKernel<16>{}));
+            }
+            else if(warpExtent == 32)
+            {
+                REQUIRE(fixture(BallotMultipleThreadWarpTestKernel<32>{}));
+            }
+            else if(warpExtent == 64)
+            {
+                REQUIRE(fixture(BallotMultipleThreadWarpTestKernel<64>{}));
+            }
         }
     }
 }
diff --git a/test/unit/warp/src/GetSize.cpp b/test/unit/warp/src/GetSize.cpp
index 5dbbaa018bba..d8c2b5ce3a84 100644
--- a/test/unit/warp/src/GetSize.cpp
+++ b/test/unit/warp/src/GetSize.cpp
@@ -1,4 +1,4 @@
-/* Copyright 2022 Sergei Bastrakov, Bernhard Manfred Gruber, Jan Stephan
+/* Copyright 2022 Sergei Bastrakov, Bernhard Manfred Gruber, Jan Stephan, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -12,6 +12,7 @@
 
 #include <cstdint>
 
+template<std::uint32_t TWarpSize>
 struct GetSizeTestKernel
 {
     ALPAKA_NO_HOST_ACC_WARNING
@@ -22,6 +23,11 @@ struct GetSizeTestKernel
     }
 };
 
+template<std::uint32_t TWarpSize, typename TAcc>
+struct alpaka::trait::WarpSize<GetSizeTestKernel<TWarpSize>, TAcc> : std::integral_constant<std::uint32_t, TWarpSize>
+{
+};
+
 TEMPLATE_LIST_TEST_CASE("getSize", "[warp]", alpaka::test::TestAccs)
 {
     using Acc = TestType;
@@ -37,6 +43,26 @@ TEMPLATE_LIST_TEST_CASE("getSize", "[warp]", alpaka::test::TestAccs)
         [](std::size_t ws)
         {
             alpaka::test::KernelExecutionFixture<Acc> fixture(alpaka::Vec<Dim, Idx>::all(8));
-            return fixture(GetSizeTestKernel{}, static_cast<std::int32_t>(ws));
+            if(ws == 4)
+            {
+                return fixture(GetSizeTestKernel<4>{}, static_cast<std::int32_t>(ws));
+            }
+            else if(ws == 8)
+            {
+                return fixture(GetSizeTestKernel<8>{}, static_cast<std::int32_t>(ws));
+            }
+            else if(ws == 16)
+            {
+                return fixture(GetSizeTestKernel<16>{}, static_cast<std::int32_t>(ws));
+            }
+            else if(ws == 32)
+            {
+                return fixture(GetSizeTestKernel<32>{}, static_cast<std::int32_t>(ws));
+            }
+            else if(ws == 64)
+            {
+                return fixture(GetSizeTestKernel<64>{}, static_cast<std::int32_t>(ws));
+            }
+            return fixture(GetSizeTestKernel<0>{}, static_cast<std::int32_t>(ws));
         }));
 }
diff --git a/test/unit/warp/src/Shfl.cpp b/test/unit/warp/src/Shfl.cpp
index 63f5b81a5f77..4a35ba4480a3 100644
--- a/test/unit/warp/src/Shfl.cpp
+++ b/test/unit/warp/src/Shfl.cpp
@@ -1,4 +1,4 @@
-/* Copyright 2023 David M. Rogers, Jan Stephan
+/* Copyright 2023 David M. Rogers, Jan Stephan, Andrea Bocci, Aurora Perego
  * SPDX-License-Identifier: MPL-2.0
  */
 
@@ -35,6 +35,7 @@ struct ShflSingleThreadWarpTestKernel
     }
 };
 
+template<std::uint32_t TWarpSize>
 struct ShflMultipleThreadWarpTestKernel
 {
     ALPAKA_NO_HOST_ACC_WARNING
@@ -87,6 +88,12 @@ struct ShflMultipleThreadWarpTestKernel
     }
 };
 
+template<std::uint32_t TWarpSize, typename TAcc>
+struct alpaka::trait::WarpSize<ShflMultipleThreadWarpTestKernel<TWarpSize>, TAcc>
+    : std::integral_constant<std::uint32_t, TWarpSize>
+{
+};
+
 TEMPLATE_LIST_TEST_CASE("shfl", "[warp]", alpaka::test::TestAccs)
 {
     using Acc = TestType;
@@ -114,8 +121,26 @@ TEMPLATE_LIST_TEST_CASE("shfl", "[warp]", alpaka::test::TestAccs)
             auto const threadElementExtent = alpaka::Vec<Dim, Idx>::ones();
             auto workDiv = typename ExecutionFixture::WorkDiv{gridBlockExtent, blockThreadExtent, threadElementExtent};
             auto fixture = ExecutionFixture{workDiv};
-            ShflMultipleThreadWarpTestKernel kernel;
-            REQUIRE(fixture(kernel));
+            if(warpExtent == 4)
+            {
+                REQUIRE(fixture(ShflMultipleThreadWarpTestKernel<4>{}));
+            }
+            else if(warpExtent == 8)
+            {
+                REQUIRE(fixture(ShflMultipleThreadWarpTestKernel<8>{}));
+            }
+            else if(warpExtent == 16)
+            {
+                REQUIRE(fixture(ShflMultipleThreadWarpTestKernel<16>{}));
+            }
+            else if(warpExtent == 32)
+            {
+                REQUIRE(fixture(ShflMultipleThreadWarpTestKernel<32>{}));
+            }
+            else if(warpExtent == 64)
+            {
+                REQUIRE(fixture(ShflMultipleThreadWarpTestKernel<64>{}));
+            }
         }
     }
 }
diff --git a/thirdParty/CMakeLists.txt b/thirdParty/CMakeLists.txt
index b061c9b0678a..ae1347167e8e 100644
--- a/thirdParty/CMakeLists.txt
+++ b/thirdParty/CMakeLists.txt
@@ -1,5 +1,5 @@
 #
-# Copyright 2023 Jan Stephan
+# Copyright 2023 Jan Stephan, Bernhard Manfred Gruber, Andrea Bocci
 # SPDX-License-Identifier: MPL-2.0
 #