diff --git a/README_SYCL.md b/README_SYCL.md index eead8a8a1acc..ccac62293474 100644 --- a/README_SYCL.md +++ b/README_SYCL.md @@ -53,9 +53,9 @@ To enable device-side printing add the following compiler flags: * `-DALPAKA_SYCL_IOSTREAM_ENABLED`: to enable device-side printing. * `-DALPAKA_SYCL_IOSTREAM_KIB=`: `` (without the brackets) defines the kibibytes per block to be reserved for device-side printing. `` cannot exceed the amount of shared memory per block. -### Building for Intel CPUs +### Building for x86 64-bit CPUs -1. `#include ` in your C++ code. +1. `#include ` in your C++ code. 2. Add the following flags: * `-fsycl-targets=spir64_x86_64` (compiler and linker): to enable CPU compilation. Note: If you are using multiple SYCL hardware targets (like CPU and GPU) separate them by comma here. * `-Xsycl-target-backend=spir64_x86_64 "-march="` (linker): to choose the Intel ISA to compile for. Check the output of `opencl-aot --help` and look for the possible values of the `--march` flag. @@ -84,7 +84,7 @@ To enable device-side printing add the following compiler flags: In contrast to the other back-ends the SYCL back-end comes with multiple different accelerators which should be chosen according to your requirements: -* `alpaka::experimental::AccCpuSyclIntel` for targeting Intel CPUs. In contrast to the other CPU back-ends this will be using Intel's OpenCL implementation for CPUs under the hood. +* `alpaka::experimental::AccCpuSycl` for targeting Intel and AMD CPUs. In contrast to the other CPU back-ends this will use Intel's OpenCL implementation for CPUs under the hood. * `alpaka::experimental::AccFpgaSyclIntel` for targeting Intel FPGAs. * `alpaka::experimental::AccGpuSyclIntel` for targeting Intel GPUs. @@ -92,9 +92,13 @@ These can be used interchangeably (some restrictions apply - see below) with the ### Restrictions -* The FPGA back-ends (both vendors) cannot be used together with the Intel CPU / GPU back-ends. This is because of the different compilation trajectory required for FPGAs and is unlikely to be fixed anytime soon. See [here](https://www.intel.com/content/www/us/en/develop/documentation/oneapi-programming-guide/top/programming-interface/fpga-flow/why-is-fpga-compilation-different.html) for an explanation. -* The SYCL back-end currently does not support passing pointers as kernel parameters. Use alpaka's experimental accessors instead. -* The SYCL back-end does not have device-side random number generation. +* The Intel FPGA back-end cannot be used together with the Intel CPU / GPU back-ends. This is because of the different compilation trajectory required for FPGAs and is unlikely to be fixed anytime soon. See [here](https://www.intel.com/content/www/us/en/develop/documentation/oneapi-programming-guide/top/programming-interface/fpga-flow/why-is-fpga-compilation-different.html) for an explanation. * Similar to the CUDA and HIP back-ends the SYCL back-end only supports up to three kernel dimensions. * Some Intel GPUs do not support the `double` type for device code. alpaka will not check this. -* The FPGA back-end does not support atomics. alpaka will not check this. \ No newline at end of file + You can enable software emulation for `double` precision types with + ```bash + export IGC_EnableDPEmulation=1 + export OverrideDefaultFP64Settings=1 + ``` + See [Intel's FAQ](https://github.com/intel/compute-runtime/blob/master/opencl/doc/FAQ.md#feature-double-precision-emulation-fp64) for more information. +* The FPGA back-end does not support atomics. alpaka will not check this. diff --git a/cmake/alpakaCommon.cmake b/cmake/alpakaCommon.cmake index e4ec50a8cd2b..27776497138d 100644 --- a/cmake/alpakaCommon.cmake +++ b/cmake/alpakaCommon.cmake @@ -1,5 +1,5 @@ # -# Copyright 2023 Benjamin Worpitz, Erik Zenker, Axel Hübl, Jan Stephan, René Widera, Jeffrey Kelling, Andrea Bocci, Bernhard Manfred Gruber +# Copyright 2023 Benjamin Worpitz, Erik Zenker, Axel Hübl, Jan Stephan, René Widera, Jeffrey Kelling, Andrea Bocci, Bernhard Manfred Gruber, Aurora Perego # SPDX-License-Identifier: MPL-2.0 # @@ -547,9 +547,6 @@ if(alpaka_ACC_SYCL_ENABLE) # Enable device-side printing to stdout cmake_dependent_option(alpaka_SYCL_ENABLE_IOSTREAM "Enable device-side printing to stdout" OFF "alpaka_ACC_SYCL_ENABLE" OFF) - if(BUILD_TESTING) - set(alpaka_SYCL_ENABLE_IOSTREAM ON CACHE BOOL "Enable device-side printing to stdout" FORCE) - endif() alpaka_set_compiler_options(HOST_DEVICE target alpaka "-fsycl") target_link_options(alpaka INTERFACE "-fsycl") @@ -559,7 +556,7 @@ if(alpaka_ACC_SYCL_ENABLE) # Determine SYCL targets set(alpaka_SYCL_ONEAPI_CPU_TARGET "spir64_x86_64") set(alpaka_SYCL_ONEAPI_FPGA_TARGET "spir64_fpga") - set(alpaka_SYCL_ONEAPI_GPU_TARGET "spir64_gen") + set(alpaka_SYCL_ONEAPI_GPU_TARGET ${alpaka_SYCL_ONEAPI_GPU_DEVICES}) if(alpaka_SYCL_ONEAPI_CPU) list(APPEND alpaka_SYCL_TARGETS ${alpaka_SYCL_ONEAPI_CPU_TARGET}) @@ -620,14 +617,13 @@ if(alpaka_ACC_SYCL_ENABLE) if(alpaka_SYCL_ONEAPI_GPU) # Create a drop-down list (in cmake-gui) of valid Intel GPU targets. On the command line the user can specifiy # additional targets, such as ranges: "Gen8-Gen12LP" or lists: "icllp;skl". - set(alpaka_SYCL_ONEAPI_GPU_DEVICES "bdw" CACHE STRING "Intel GPU devices / generations to compile for") + set(alpaka_SYCL_ONEAPI_GPU_DEVICES "intel_gpu_pvc" CACHE STRING "Intel GPU devices / generations to compile for") set_property(CACHE alpaka_SYCL_ONEAPI_GPU_DEVICES - PROPERTY STRINGS "bdw;skl;kbl;cfl;bxt;glk;whl;aml;cml;icllp;lkf;ehl;tgllp;rkl;adl-s;adl-p;dg1;acm-g10;ats-m150;dg2-g10;acm-g11;ats-m75;dg2-g11;acm-g12;dg2-g12;pvc-sdv;pvc;gen11;gen12lp;gen8;gen9;xe;xe-hpc;xe-hpg") + PROPERTY STRINGS "intel_gpu_pvc;intel_gpu_acm_g12;intel_gpu_acm_g11;intel_gpu_acm_g10;intel_gpu_dg1;intel_gpu_adl_n;intel_gpu_adl_p;intel_gpu_rpl_s;intel_gpu_adl_s;intel_gpu_rkl;intel_gpu_tgllp;intel_gpu_icllp;intel_gpu_cml;intel_gpu_aml;intel_gpu_whl;intel_gpu_glk;intel_gpu_apl;intel_gpu_cfl;intel_gpu_kbl;intel_gpu_skl;intel_gpu_bdw") # If the user has given us a list turn all ';' into ',' to pacify the Intel OpenCL compiler. string(REPLACE ";" "," alpaka_SYCL_ONEAPI_GPU_DEVICES "${alpaka_SYCL_ONEAPI_GPU_DEVICES}") target_compile_definitions(alpaka INTERFACE "ALPAKA_SYCL_ONEAPI_GPU") - target_link_options(alpaka INTERFACE "SHELL:-Xsycl-target-backend=${alpaka_SYCL_ONEAPI_GPU_TARGET} \"-device ${alpaka_SYCL_ONEAPI_GPU_DEVICES}\"") endif() #----------------------------------------------------------------------------------------------------------------- diff --git a/example/vectorAdd/src/vectorAdd.cpp b/example/vectorAdd/src/vectorAdd.cpp index e057e359e43d..a67362200189 100644 --- a/example/vectorAdd/src/vectorAdd.cpp +++ b/example/vectorAdd/src/vectorAdd.cpp @@ -1,4 +1,5 @@ -/* Copyright 2023 Benjamin Worpitz, Matthias Werner, Bernhard Manfred Gruber, Jan Stephan +/* Copyright 2023 Benjamin Worpitz, Matthias Werner, Bernhard Manfred Gruber, Jan Stephan, Luca Ferragina, + * Aurora Perego * SPDX-License-Identifier: ISC */ @@ -76,6 +77,8 @@ auto main() -> int // - AccCpuSerial // using Acc = alpaka::AccCpuSerial; using Acc = alpaka::ExampleDefaultAcc; + using Pltf = alpaka::Pltf; + using DevAcc = alpaka::Dev; std::cout << "Using alpaka accelerator: " << alpaka::getAccName() << std::endl; // Defines the synchronization behavior of a queue @@ -136,7 +139,7 @@ auto main() -> int } // Allocate 3 buffers on the accelerator - using BufAcc = alpaka::Buf; + using BufAcc = alpaka::Buf; BufAcc bufAccA(alpaka::allocBuf(devAcc, extent)); BufAcc bufAccB(alpaka::allocBuf(devAcc, extent)); BufAcc bufAccC(alpaka::allocBuf(devAcc, extent)); diff --git a/include/alpaka/acc/AccCpuSyclIntel.hpp b/include/alpaka/acc/AccCpuSycl.hpp similarity index 65% rename from include/alpaka/acc/AccCpuSyclIntel.hpp rename to include/alpaka/acc/AccCpuSycl.hpp index f1f53fad45b6..e9e5504bae67 100644 --- a/include/alpaka/acc/AccCpuSyclIntel.hpp +++ b/include/alpaka/acc/AccCpuSycl.hpp @@ -1,4 +1,4 @@ -/* Copyright 2022 Jan Stephan +/* Copyright 2023 Jan Stephan, Luca Ferragina, Andrea Bocci * SPDX-License-Identifier: MPL-2.0 */ @@ -9,11 +9,11 @@ #include "alpaka/core/Concepts.hpp" #include "alpaka/core/DemangleTypeNames.hpp" #include "alpaka/core/Sycl.hpp" -#include "alpaka/dev/DevCpuSyclIntel.hpp" +#include "alpaka/dev/DevCpuSycl.hpp" #include "alpaka/dev/Traits.hpp" -#include "alpaka/kernel/TaskKernelCpuSyclIntel.hpp" +#include "alpaka/kernel/TaskKernelCpuSycl.hpp" #include "alpaka/kernel/Traits.hpp" -#include "alpaka/pltf/PltfCpuSyclIntel.hpp" +#include "alpaka/pltf/PltfCpuSycl.hpp" #include "alpaka/pltf/Traits.hpp" #include "alpaka/vec/Vec.hpp" @@ -23,7 +23,7 @@ #if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_BACKEND_ONEAPI) && defined(ALPAKA_SYCL_ONEAPI_CPU) -# include +# include namespace alpaka { @@ -31,9 +31,9 @@ namespace alpaka //! //! This accelerator allows parallel kernel execution on a oneAPI-capable Intel CPU target device. template - class AccCpuSyclIntel final + class AccCpuSycl final : public AccGenericSycl - , public concepts::Implements> + , public concepts::Implements> { public: using AccGenericSycl::AccGenericSycl; @@ -44,28 +44,28 @@ namespace alpaka::trait { //! The Intel CPU SYCL accelerator name trait specialization. template - struct GetAccName> + struct GetAccName> { static auto getAccName() -> std::string { - return "AccCpuSyclIntel<" + std::to_string(TDim::value) + "," + core::demangled + ">"; + return "AccCpuSycl<" + std::to_string(TDim::value) + "," + core::demangled + ">"; } }; //! The Intel CPU SYCL accelerator device type trait specialization. template - struct DevType> + struct DevType> { - using type = DevCpuSyclIntel; + using type = DevCpuSycl; }; //! The Intel CPU SYCL accelerator execution task type trait specialization. template - struct CreateTaskKernel, TWorkDiv, TKernelFnObj, TArgs...> + struct CreateTaskKernel, TWorkDiv, TKernelFnObj, TArgs...> { static auto createTaskKernel(TWorkDiv const& workDiv, TKernelFnObj const& kernelFnObj, TArgs&&... args) { - return TaskKernelCpuSyclIntel{ + return TaskKernelCpuSycl{ workDiv, kernelFnObj, std::forward(args)...}; @@ -74,21 +74,21 @@ namespace alpaka::trait //! The Intel CPU SYCL execution task platform type trait specialization. template - struct PltfType> + struct PltfType> { - using type = PltfCpuSyclIntel; + using type = PltfCpuSycl; }; template - struct AccToTag> + struct AccToTag> { - using type = alpaka::TagCpuSyclIntel; + using type = alpaka::TagCpuSycl; }; template - struct TagToAcc + struct TagToAcc { - using type = alpaka::AccCpuSyclIntel; + using type = alpaka::AccCpuSycl; }; } // namespace alpaka::trait diff --git a/include/alpaka/acc/AccFpgaSyclIntel.hpp b/include/alpaka/acc/AccFpgaSyclIntel.hpp index 7ddc7b2b65de..7e84ce6d5e39 100644 --- a/include/alpaka/acc/AccFpgaSyclIntel.hpp +++ b/include/alpaka/acc/AccFpgaSyclIntel.hpp @@ -1,4 +1,4 @@ -/* Copyright 2022 Jan Stephan +/* Copyright 2023 Jan Stephan, Aurora Perego * SPDX-License-Identifier: MPL-2.0 */ @@ -22,7 +22,7 @@ #if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_BACKEND_ONEAPI) && defined(ALPAKA_SYCL_ONEAPI_FPGA) -# include +# include namespace alpaka { @@ -45,7 +45,7 @@ namespace alpaka::trait template struct GetAccName> { - ALPAKA_FN_HOST static auto getAccName() -> std::string + static auto getAccName() -> std::string { return "AccFpgaSyclIntel<" + std::to_string(TDim::value) + "," + core::demangled + ">"; } diff --git a/include/alpaka/acc/AccGenericSycl.hpp b/include/alpaka/acc/AccGenericSycl.hpp index 211d89a7740f..34400442c31c 100644 --- a/include/alpaka/acc/AccGenericSycl.hpp +++ b/include/alpaka/acc/AccGenericSycl.hpp @@ -1,4 +1,4 @@ -/* Copyright 2023 Jan Stephan, Antonio Di Pilato, Andrea Bocci +/* Copyright 2023 Jan Stephan, Antonio Di Pilato, Andrea Bocci, Luca Ferragina, Aurora Perego * SPDX-License-Identifier: MPL-2.0 */ @@ -15,6 +15,7 @@ #include "alpaka/intrinsic/IntrinsicGenericSycl.hpp" #include "alpaka/math/MathGenericSycl.hpp" #include "alpaka/mem/fence/MemFenceGenericSycl.hpp" +#include "alpaka/rand/RandGenericSycl.hpp" #include "alpaka/warp/WarpGenericSycl.hpp" #include "alpaka/workdiv/WorkDivGenericSycl.hpp" @@ -37,7 +38,7 @@ #ifdef ALPAKA_ACC_SYCL_ENABLED -# include +# include namespace alpaka { @@ -56,40 +57,17 @@ namespace alpaka , public BlockSyncGenericSycl , public IntrinsicGenericSycl , public MemFenceGenericSycl + , public rand::RandGenericSycl , public warp::WarpGenericSycl { + static_assert(TDim::value > 0, "The SYCL accelerator must have a dimension greater than zero."); + public: AccGenericSycl(AccGenericSycl const&) = delete; AccGenericSycl(AccGenericSycl&&) = delete; auto operator=(AccGenericSycl const&) -> AccGenericSycl& = delete; auto operator=(AccGenericSycl&&) -> AccGenericSycl& = delete; -# ifdef ALPAKA_SYCL_IOSTREAM_ENABLED - AccGenericSycl( - Vec const& threadElemExtent, - sycl::nd_item work_item, - sycl::local_accessor dyn_shared_acc, - sycl::local_accessor st_shared_acc, - sycl::accessor global_fence_dummy, - sycl::local_accessor local_fence_dummy, - sycl::stream output_stream) - : WorkDivGenericSycl{threadElemExtent, work_item} - , gb::IdxGbGenericSycl{work_item} - , bt::IdxBtGenericSycl{work_item} - , AtomicHierarchy{} - , math::MathGenericSycl{} - , BlockSharedMemDynGenericSycl{dyn_shared_acc} - , BlockSharedMemStGenericSycl{st_shared_acc} - , BlockSyncGenericSycl{work_item} - , IntrinsicGenericSycl{} - , MemFenceGenericSycl{global_fence_dummy, local_fence_dummy} - , warp::WarpGenericSycl{work_item} - , cout{output_stream} - { - } - - sycl::stream cout; -# else AccGenericSycl( Vec const& threadElemExtent, sycl::nd_item work_item, @@ -107,10 +85,10 @@ namespace alpaka , BlockSyncGenericSycl{work_item} , IntrinsicGenericSycl{} , MemFenceGenericSycl{global_fence_dummy, local_fence_dummy} + , rand::RandGenericSycl{work_item} , warp::WarpGenericSycl{work_item} { } -# endif }; } // namespace alpaka diff --git a/include/alpaka/acc/Tag.hpp b/include/alpaka/acc/Tag.hpp index f950bb03b56e..03a6449a75a9 100644 --- a/include/alpaka/acc/Tag.hpp +++ b/include/alpaka/acc/Tag.hpp @@ -1,4 +1,4 @@ -/* Copyright 2023 Simeon Ehrig, Jan Stephan +/* Copyright 2023 Simeon Ehrig, Jan Stephan, Andrea Bocci * SPDX-License-Identifier: MPL-2.0 */ @@ -23,7 +23,7 @@ namespace alpaka CREATE_ACC_TAG(TagCpuOmp2Blocks); CREATE_ACC_TAG(TagCpuOmp2Threads); CREATE_ACC_TAG(TagCpuSerial); - CREATE_ACC_TAG(TagCpuSyclIntel); + CREATE_ACC_TAG(TagCpuSycl); CREATE_ACC_TAG(TagCpuTbbBlocks); CREATE_ACC_TAG(TagCpuThreads); CREATE_ACC_TAG(TagFpgaSyclIntel); diff --git a/include/alpaka/alpaka.hpp b/include/alpaka/alpaka.hpp index 43829f951fde..327e53fab9f0 100644 --- a/include/alpaka/alpaka.hpp +++ b/include/alpaka/alpaka.hpp @@ -1,5 +1,5 @@ /* Copyright 2023 Axel Hübl, Benjamin Worpitz, Erik Zenker, Matthias Werner, René Widera, Bernhard Manfred Gruber, - * Jan Stephan, Antonio Di Pilato + * Jan Stephan, Antonio Di Pilato, Luca Ferragina, Aurora Perego, Andrea Bocci * SPDX-License-Identifier: MPL-2.0 */ @@ -13,7 +13,7 @@ #include "alpaka/acc/AccCpuOmp2Blocks.hpp" #include "alpaka/acc/AccCpuOmp2Threads.hpp" #include "alpaka/acc/AccCpuSerial.hpp" -#include "alpaka/acc/AccCpuSyclIntel.hpp" +#include "alpaka/acc/AccCpuSycl.hpp" #include "alpaka/acc/AccCpuTbbBlocks.hpp" #include "alpaka/acc/AccCpuThreads.hpp" #include "alpaka/acc/AccDevProps.hpp" @@ -75,7 +75,7 @@ #include "alpaka/core/Vectorize.hpp" // dev #include "alpaka/dev/DevCpu.hpp" -#include "alpaka/dev/DevCpuSyclIntel.hpp" +#include "alpaka/dev/DevCpuSycl.hpp" #include "alpaka/dev/DevCudaRt.hpp" #include "alpaka/dev/DevFpgaSyclIntel.hpp" #include "alpaka/dev/DevGenericSycl.hpp" @@ -89,7 +89,7 @@ #include "alpaka/dim/Traits.hpp" // event #include "alpaka/event/EventCpu.hpp" -#include "alpaka/event/EventCpuSyclIntel.hpp" +#include "alpaka/event/EventCpuSycl.hpp" #include "alpaka/event/EventCudaRt.hpp" #include "alpaka/event/EventFpgaSyclIntel.hpp" #include "alpaka/event/EventGenericSycl.hpp" @@ -114,7 +114,7 @@ #include "alpaka/kernel/TaskKernelCpuOmp2Blocks.hpp" #include "alpaka/kernel/TaskKernelCpuOmp2Threads.hpp" #include "alpaka/kernel/TaskKernelCpuSerial.hpp" -#include "alpaka/kernel/TaskKernelCpuSyclIntel.hpp" +#include "alpaka/kernel/TaskKernelCpuSycl.hpp" #include "alpaka/kernel/TaskKernelCpuTbbBlocks.hpp" #include "alpaka/kernel/TaskKernelCpuThreads.hpp" #include "alpaka/kernel/TaskKernelFpgaSyclIntel.hpp" @@ -133,7 +133,7 @@ #include "alpaka/mem/alloc/AllocCpuNew.hpp" #include "alpaka/mem/alloc/Traits.hpp" #include "alpaka/mem/buf/BufCpu.hpp" -#include "alpaka/mem/buf/BufCpuSyclIntel.hpp" +#include "alpaka/mem/buf/BufCpuSycl.hpp" #include "alpaka/mem/buf/BufCudaRt.hpp" #include "alpaka/mem/buf/BufFpgaSyclIntel.hpp" #include "alpaka/mem/buf/BufGenericSycl.hpp" @@ -178,7 +178,7 @@ #include "alpaka/offset/Traits.hpp" // platform #include "alpaka/pltf/PltfCpu.hpp" -#include "alpaka/pltf/PltfCpuSyclIntel.hpp" +#include "alpaka/pltf/PltfCpuSycl.hpp" #include "alpaka/pltf/PltfCudaRt.hpp" #include "alpaka/pltf/PltfFpgaSyclIntel.hpp" #include "alpaka/pltf/PltfGpuSyclIntel.hpp" @@ -186,6 +186,7 @@ #include "alpaka/pltf/Traits.hpp" // rand #include "alpaka/rand/RandDefault.hpp" +#include "alpaka/rand/RandGenericSycl.hpp" #include "alpaka/rand/RandPhilox.hpp" #include "alpaka/rand/RandStdLib.hpp" #include "alpaka/rand/RandUniformCudaHipRand.hpp" @@ -196,8 +197,8 @@ #include "alpaka/queue/Properties.hpp" #include "alpaka/queue/QueueCpuBlocking.hpp" #include "alpaka/queue/QueueCpuNonBlocking.hpp" -#include "alpaka/queue/QueueCpuSyclIntelBlocking.hpp" -#include "alpaka/queue/QueueCpuSyclIntelNonBlocking.hpp" +#include "alpaka/queue/QueueCpuSyclBlocking.hpp" +#include "alpaka/queue/QueueCpuSyclNonBlocking.hpp" #include "alpaka/queue/QueueCudaRtBlocking.hpp" #include "alpaka/queue/QueueCudaRtNonBlocking.hpp" #include "alpaka/queue/QueueFpgaSyclIntelBlocking.hpp" diff --git a/include/alpaka/atomic/AtomicGenericSycl.hpp b/include/alpaka/atomic/AtomicGenericSycl.hpp index 742adcdfa3e7..8ebf608dc570 100644 --- a/include/alpaka/atomic/AtomicGenericSycl.hpp +++ b/include/alpaka/atomic/AtomicGenericSycl.hpp @@ -1,4 +1,4 @@ -/* Copyright 2023 Jan Stephan, Andrea Bocci +/* Copyright 2023 Jan Stephan, Andrea Bocci, Luca Ferragina * SPDX-License-Identifier: MPL-2.0 */ @@ -14,7 +14,7 @@ #ifdef ALPAKA_ACC_SYCL_ENABLED -# include +# include namespace alpaka { @@ -98,17 +98,12 @@ namespace alpaka inline auto casWithCondition(T* const addr, TEval&& eval) { auto ref = TRef{*addr}; - auto old_val = ref.load(); - auto assumed = T{}; - do + // prefer compare_exchange_weak when in a loop, assuming that eval is not expensive + while(!ref.compare_exchange_weak(old_val, eval(old_val))) { - assumed = old_val; - auto const new_val = eval(old_val); - old_val = ref.compare_exchange_strong(assumed, new_val); - } while(assumed != old_val); - + } return old_val; } @@ -182,7 +177,9 @@ namespace alpaka::trait template struct AtomicOp { - static_assert(std::is_integral_v || std::is_floating_point_v, "SYCL atomics do not support this type"); + static_assert( + (std::is_integral_v || std::is_floating_point_v) &&(sizeof(T) == 4 || sizeof(T) == 8), + "SYCL atomics do not support this type"); static auto atomicOp(AtomicGenericSycl const&, T* const addr, T const& value) -> T { @@ -195,11 +192,14 @@ namespace alpaka::trait template struct AtomicOp { - static_assert(std::is_unsigned_v, "atomicInc only supported for unsigned types"); + static_assert( + std::is_unsigned_v && (sizeof(T) == 4 || sizeof(T) == 8), + "SYCL atomics support only 32- and 64-bits unsigned integral types"); static auto atomicOp(AtomicGenericSycl const&, T* const addr, T const& value) -> T { - auto inc = [&value](auto old_val) { return (old_val >= value) ? static_cast(0) : (old_val + 1u); }; + auto inc = [&value](auto old_val) + { return (old_val >= value) ? static_cast(0) : (old_val + static_cast(1)); }; if(auto ptr = alpaka::detail::get_global_ptr(addr); ptr != nullptr) return alpaka::detail::casWithCondition>(addr, inc); else @@ -212,12 +212,14 @@ namespace alpaka::trait template struct AtomicOp { - static_assert(std::is_unsigned_v, "atomicDec only supported for unsigned types"); + static_assert( + std::is_unsigned_v && (sizeof(T) == 4 || sizeof(T) == 8), + "SYCL atomics support only 32- and 64-bits unsigned integral types"); static auto atomicOp(AtomicGenericSycl const&, T* const addr, T const& value) -> T { - auto dec - = [&value](auto& old_val) { return ((old_val == 0) || (old_val > value)) ? value : (old_val - 1u); }; + auto dec = [&value](auto& old_val) + { return ((old_val == 0) || (old_val > value)) ? value : (old_val - static_cast(1)); }; if(auto ptr = alpaka::detail::get_global_ptr(addr); ptr != nullptr) return alpaka::detail::casWithCondition>(addr, dec); else @@ -275,22 +277,21 @@ namespace alpaka::trait { static_assert(std::is_integral_v || std::is_floating_point_v, "SYCL atomics do not support this type"); - static auto atomicOp(AtomicGenericSycl const&, T* const addr, T const& compare, T const& value) -> T + static auto atomicOp(AtomicGenericSycl const&, T* const addr, T const& expected, T const& desired) -> T { - auto cas = [&compare, &value](auto& ref) + auto cas = [&expected, &desired](auto& ref) { - // SYCL stores the value in *addr to the "compare" parameter if the values are not equal. Since - // alpaka's interface does not expect this we need to copy "compare" to this function and forget it - // afterwards. - auto tmp = compare; - - // We always want to return the old value at the end. - const auto old = ref.load(); - - // This returns a bool telling us if the exchange happened or not. Useless in this case. - ref.compare_exchange_strong(tmp, value); - - return old; + auto expected_ = expected; + // Atomically compares the value of `ref` with the value of `expected`. + // If the values are equal, replaces the value of `ref` with `desired`. + // Otherwise updates `expected` with the value of `ref`. + // Returns a bool telling us if the exchange happened or not, but the Alpaka API does not make use of + // it. + ref.compare_exchange_strong(expected_, desired); + + // If the update succeded, return the previous value of `ref`. + // Otherwise, return the current value of `ref`. + return expected_; }; if(auto ptr = alpaka::detail::get_global_ptr(addr); ptr != nullptr) diff --git a/include/alpaka/block/shared/dyn/BlockSharedMemDynGenericSycl.hpp b/include/alpaka/block/shared/dyn/BlockSharedMemDynGenericSycl.hpp index d13b78249867..4e2af194ddcd 100644 --- a/include/alpaka/block/shared/dyn/BlockSharedMemDynGenericSycl.hpp +++ b/include/alpaka/block/shared/dyn/BlockSharedMemDynGenericSycl.hpp @@ -9,7 +9,7 @@ #include #ifdef ALPAKA_ACC_SYCL_ENABLED -# include +# include namespace alpaka { @@ -35,9 +35,7 @@ namespace alpaka::trait { static auto getMem(BlockSharedMemDynGenericSycl const& shared) -> T* { - auto void_ptr = sycl::multi_ptr{shared.m_accessor}; - auto t_ptr = static_cast>(void_ptr); - return t_ptr.get(); + return reinterpret_cast(shared.m_accessor.get_pointer().get()); } }; } // namespace alpaka::trait diff --git a/include/alpaka/block/shared/st/BlockSharedMemStGenericSycl.hpp b/include/alpaka/block/shared/st/BlockSharedMemStGenericSycl.hpp index baae7b0c8825..f92df9c051f0 100644 --- a/include/alpaka/block/shared/st/BlockSharedMemStGenericSycl.hpp +++ b/include/alpaka/block/shared/st/BlockSharedMemStGenericSycl.hpp @@ -7,11 +7,12 @@ #include "alpaka/block/shared/st/Traits.hpp" #include "alpaka/block/shared/st/detail/BlockSharedMemStMemberImpl.hpp" +#include #include #ifdef ALPAKA_ACC_SYCL_ENABLED -# include +# include namespace alpaka { diff --git a/include/alpaka/block/sync/BlockSyncGenericSycl.hpp b/include/alpaka/block/sync/BlockSyncGenericSycl.hpp index 87d478eb9c19..67e97493fee4 100644 --- a/include/alpaka/block/sync/BlockSyncGenericSycl.hpp +++ b/include/alpaka/block/sync/BlockSyncGenericSycl.hpp @@ -8,7 +8,7 @@ #ifdef ALPAKA_ACC_SYCL_ENABLED -# include +# include namespace alpaka { diff --git a/include/alpaka/core/BoostPredef.hpp b/include/alpaka/core/BoostPredef.hpp index 39621e964b0f..9a54f33200f8 100644 --- a/include/alpaka/core/BoostPredef.hpp +++ b/include/alpaka/core/BoostPredef.hpp @@ -1,4 +1,4 @@ -/* Copyright 2022 Benjamin Worpitz, Matthias Werner, Jan Stephan +/* Copyright 2023 Benjamin Worpitz, Matthias Werner, Jan Stephan * SPDX-License-Identifier: MPL-2.0 */ @@ -68,3 +68,11 @@ # undef BOOST_COMP_PGI # define BOOST_COMP_PGI BOOST_COMP_PGI_EMULATED #endif + +// Intel LLVM compiler detection +#if !defined(BOOST_COMP_ICPX) +# if defined(SYCL_LANGUAGE_VERSION) && defined(__INTEL_LLVM_COMPILER) +// The version string for icpx 2023.1.0 is 20230100. In Boost.Predef this becomes (53,1,0). +# define BOOST_COMP_ICPX BOOST_PREDEF_MAKE_YYYYMMDD(__INTEL_LLVM_COMPILER) +# endif +#endif diff --git a/include/alpaka/core/Common.hpp b/include/alpaka/core/Common.hpp index 50b74df690be..6da0b9fbd620 100644 --- a/include/alpaka/core/Common.hpp +++ b/include/alpaka/core/Common.hpp @@ -1,4 +1,4 @@ -/* Copyright 2019 Axel Huebl, Benjamin Worpitz, Matthias Werner +/* Copyright 2023 Axel Hübl, Benjamin Worpitz, Matthias Werner, Jan Stephan, René Widera, Andrea Bocci * SPDX-License-Identifier: MPL-2.0 */ @@ -76,13 +76,12 @@ //! Macro defining the inline function attribute. #if BOOST_LANG_CUDA || BOOST_LANG_HIP # define ALPAKA_FN_INLINE __forceinline__ -#else -# if BOOST_COMP_MSVC || defined(BOOST_COMP_MSVC_EMULATED) +#elif BOOST_COMP_MSVC || defined(BOOST_COMP_MSVC_EMULATED) // TODO: With C++20 [[msvc::forceinline]] can be used. -# define ALPAKA_FN_INLINE __forceinline -# else -# define ALPAKA_FN_INLINE [[gnu::always_inline]] inline -# endif +# define ALPAKA_FN_INLINE __forceinline +#else +// For gcc, clang, and clang-based compilers like Intel icpx +# define ALPAKA_FN_INLINE [[gnu::always_inline]] inline #endif //! This macro defines a variable lying in global accelerator device memory. @@ -117,6 +116,8 @@ #if((BOOST_LANG_CUDA && BOOST_COMP_CLANG_CUDA) || (BOOST_LANG_CUDA && BOOST_COMP_NVCC && BOOST_ARCH_PTX) \ || BOOST_LANG_HIP) # define ALPAKA_STATIC_ACC_MEM_GLOBAL __device__ +#elif defined(ALPAKA_ACC_SYCL_ENABLED) +# define ALPAKA_STATIC_ACC_MEM_GLOBAL _Pragma("GCC error \"The SYCL backend does not support global device variables.\"")) #else # define ALPAKA_STATIC_ACC_MEM_GLOBAL #endif @@ -153,6 +154,8 @@ #if((BOOST_LANG_CUDA && BOOST_COMP_CLANG_CUDA) || (BOOST_LANG_CUDA && BOOST_COMP_NVCC && BOOST_ARCH_PTX) \ || BOOST_LANG_HIP) # define ALPAKA_STATIC_ACC_MEM_CONSTANT __constant__ +#elif defined(ALPAKA_ACC_SYCL_ENABLED) +# define ALPAKA_STATIC_ACC_MEM_CONSTANT _Pragma("GCC error \"The SYCL backend does not support global device constants.\"")) #else # define ALPAKA_STATIC_ACC_MEM_CONSTANT #endif diff --git a/include/alpaka/core/Sycl.hpp b/include/alpaka/core/Sycl.hpp index cab73fe17f00..dbc8c1e0f748 100644 --- a/include/alpaka/core/Sycl.hpp +++ b/include/alpaka/core/Sycl.hpp @@ -1,4 +1,4 @@ -/* Copyright 2022 Jan Stephan +/* Copyright 2023 Jan Stephan, Luca Ferragina, Aurora Perego, Andrea Bocci * SPDX-License-Identifier: MPL-2.0 */ @@ -13,6 +13,7 @@ #include #include +#include // the #define printf(...) breaks if it is included afterwards #include #include #include @@ -21,7 +22,36 @@ #ifdef ALPAKA_ACC_SYCL_ENABLED -# include +# include + +// if SYCL is enabled with the AMD backend the printf will be killed because of missing compiler support +# ifdef __AMDGCN__ +# define printf(...) +# else + +# ifdef __SYCL_DEVICE_ONLY__ +using AlpakaFormat = char const* [[clang::opencl_constant]]; +# else +using AlpakaFormat = char const*; +# endif + +# if BOOST_COMP_CLANG +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wgnu-zero-variadic-macro-arguments" +# endif + +# define printf(FORMAT, ...) \ + do \ + { \ + static auto const format = AlpakaFormat{FORMAT}; \ + sycl::ext::oneapi::experimental::printf(format, ##__VA_ARGS__); \ + } while(false) + +# if BOOST_COMP_CLANG +# pragma clang diagnostic pop +# endif + +# endif // SYCL vector types trait specializations. namespace alpaka @@ -50,7 +80,6 @@ namespace alpaka // 2 component vector types sycl::char2, - sycl::schar2, sycl::uchar2, sycl::short2, sycl::ushort2, @@ -58,15 +87,12 @@ namespace alpaka sycl::uint2, sycl::long2, sycl::ulong2, - sycl::longlong2, - sycl::ulonglong2, sycl::float2, sycl::double2, sycl::half2, // 3 component vector types sycl::char3, - sycl::schar3, sycl::uchar3, sycl::short3, sycl::ushort3, @@ -74,15 +100,12 @@ namespace alpaka sycl::uint3, sycl::long3, sycl::ulong3, - sycl::longlong3, - sycl::ulonglong3, sycl::float3, sycl::double3, sycl::half3, // 4 component vector types sycl::char4, - sycl::schar4, sycl::uchar4, sycl::short4, sycl::ushort4, @@ -90,15 +113,12 @@ namespace alpaka sycl::uint4, sycl::long4, sycl::ulong4, - sycl::longlong4, - sycl::ulonglong4, sycl::float4, sycl::double4, sycl::half4, // 8 component vector types sycl::char8, - sycl::schar8, sycl::uchar8, sycl::short8, sycl::ushort8, @@ -106,15 +126,12 @@ namespace alpaka sycl::uint8, sycl::long8, sycl::ulong8, - sycl::longlong8, - sycl::ulonglong8, sycl::float8, sycl::double8, sycl::half8, // 16 component vector types sycl::char16, - sycl::schar16, sycl::uchar16, sycl::short16, sycl::ushort16, @@ -122,8 +139,6 @@ namespace alpaka sycl::uint16, sycl::long16, sycl::ulong16, - sycl::longlong16, - sycl::ulonglong16, sycl::float16, sycl::double16, sycl::half16> @@ -146,10 +161,7 @@ namespace alpaka::trait { using type = std::conditional_t, T, typename T::element_type>; }; -} // namespace alpaka::trait -namespace alpaka::trait -{ //! The SYCL vectors' extent get trait specialization. template struct GetExtent::value>, TExtent, std::enable_if_t::value>> diff --git a/include/alpaka/dev/DevCpuSyclIntel.hpp b/include/alpaka/dev/DevCpuSycl.hpp similarity index 65% rename from include/alpaka/dev/DevCpuSyclIntel.hpp rename to include/alpaka/dev/DevCpuSycl.hpp index 8add6869e3da..1a9909d31cd0 100644 --- a/include/alpaka/dev/DevCpuSyclIntel.hpp +++ b/include/alpaka/dev/DevCpuSycl.hpp @@ -1,17 +1,17 @@ -/* Copyright 2022 Jan Stephan +/* Copyright 2023 Jan Stephan, Andrea Bocci * SPDX-License-Identifier: MPL-2.0 */ #pragma once #include "alpaka/dev/DevGenericSycl.hpp" -#include "alpaka/pltf/PltfCpuSyclIntel.hpp" +#include "alpaka/pltf/PltfCpuSycl.hpp" #if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_BACKEND_ONEAPI) && defined(ALPAKA_SYCL_ONEAPI_CPU) namespace alpaka { - using DevCpuSyclIntel = DevGenericSycl; + using DevCpuSycl = DevGenericSycl; } // namespace alpaka #endif diff --git a/include/alpaka/dev/DevGenericSycl.hpp b/include/alpaka/dev/DevGenericSycl.hpp index 5892df450df3..85c218365f3d 100644 --- a/include/alpaka/dev/DevGenericSycl.hpp +++ b/include/alpaka/dev/DevGenericSycl.hpp @@ -1,4 +1,4 @@ -/* Copyright 2022 Jan Stephan, Antonio Di Pilato +/* Copyright 2023 Jan Stephan, Antonio Di Pilato, Luca Ferragina, Aurora Perego * SPDX-License-Identifier: MPL-2.0 */ @@ -27,7 +27,7 @@ #ifdef ALPAKA_ACC_SYCL_ENABLED -# include +# include namespace alpaka { @@ -176,7 +176,12 @@ namespace alpaka::trait static auto getWarpSizes(DevGenericSycl const& dev) -> std::vector { auto const device = dev.getNativeHandle().first; - return device.template get_info(); + std::vector warp_sizes = device.template get_info(); + // The CPU runtime supports a sub-group size of 64, but the SYCL implementation currently does not + auto find64 = std::find(warp_sizes.begin(), warp_sizes.end(), 64); + if(find64 != warp_sizes.end()) + warp_sizes.erase(find64); + return warp_sizes; } }; @@ -204,7 +209,7 @@ namespace alpaka::trait template struct BufType, TElem, TDim, TIdx> { - using type = BufGenericSycl>; + using type = BufGenericSycl; }; //! The SYCL device platform type trait specialization. diff --git a/include/alpaka/event/EventCpuSyclIntel.hpp b/include/alpaka/event/EventCpuSycl.hpp similarity index 65% rename from include/alpaka/event/EventCpuSyclIntel.hpp rename to include/alpaka/event/EventCpuSycl.hpp index 21a752e28991..37e521484e93 100644 --- a/include/alpaka/event/EventCpuSyclIntel.hpp +++ b/include/alpaka/event/EventCpuSycl.hpp @@ -1,17 +1,17 @@ -/* Copyright 2022 Jan Stephan +/* Copyright 2023 Jan Stephan, Andrea Bocci * SPDX-License-Identifier: MPL-2.0 */ #pragma once -#include "alpaka/dev/DevCpuSyclIntel.hpp" +#include "alpaka/dev/DevCpuSycl.hpp" #include "alpaka/event/EventGenericSycl.hpp" #if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_BACKEND_ONEAPI) && defined(ALPAKA_SYCL_ONEAPI_CPU) namespace alpaka { - using EventCpuSyclIntel = EventGenericSycl; + using EventCpuSycl = EventGenericSycl; } // namespace alpaka #endif diff --git a/include/alpaka/event/EventGenericSycl.hpp b/include/alpaka/event/EventGenericSycl.hpp index 3e84baec3810..68011a0247cd 100644 --- a/include/alpaka/event/EventGenericSycl.hpp +++ b/include/alpaka/event/EventGenericSycl.hpp @@ -1,4 +1,4 @@ -/* Copyright 2022 Jan Stephan, Antonio Di Pilato +/* Copyright 2023 Jan Stephan, Antonio Di Pilato, Aurora Perego * SPDX-License-Identifier: MPL-2.0 */ @@ -17,7 +17,7 @@ #ifdef ALPAKA_ACC_SYCL_ENABLED -# include +# include namespace alpaka { @@ -87,7 +87,7 @@ namespace alpaka::trait { static auto enqueue(QueueGenericSyclNonBlocking& queue, EventGenericSycl& event) { - event.setEvent(queue.m_impl->get_last_event()); + event.setEvent(queue.m_spQueueImpl->get_last_event()); } }; @@ -97,7 +97,7 @@ namespace alpaka::trait { static auto enqueue(QueueGenericSyclBlocking& queue, EventGenericSycl& event) { - event.setEvent(queue.m_impl->get_last_event()); + event.setEvent(queue.m_spQueueImpl->get_last_event()); } }; @@ -120,7 +120,7 @@ namespace alpaka::trait { static auto waiterWaitFor(QueueGenericSyclNonBlocking& queue, EventGenericSycl const& event) { - queue.m_impl->register_dependency(event.getNativeHandle()); + queue.m_spQueueImpl->register_dependency(event.getNativeHandle()); } }; @@ -130,7 +130,7 @@ namespace alpaka::trait { static auto waiterWaitFor(QueueGenericSyclBlocking& queue, EventGenericSycl const& event) { - queue.m_impl->register_dependency(event.getNativeHandle()); + queue.m_spQueueImpl->register_dependency(event.getNativeHandle()); } }; diff --git a/include/alpaka/example/ExampleDefaultAcc.hpp b/include/alpaka/example/ExampleDefaultAcc.hpp index f90b3653c9f4..6ceb874dd708 100644 --- a/include/alpaka/example/ExampleDefaultAcc.hpp +++ b/include/alpaka/example/ExampleDefaultAcc.hpp @@ -1,4 +1,4 @@ -/* Copyright 2023 Jeffrey Kelling, Bernhard Manfred Gruber, Jan Stephan +/* Copyright 2023 Jeffrey Kelling, Bernhard Manfred Gruber, Jan Stephan, Aurora Perego, Andrea Bocci * SPDX-License-Identifier: MPL-2.0 */ @@ -24,16 +24,16 @@ namespace alpaka using ExampleDefaultAcc = alpaka::AccCpuOmp2Threads; #elif defined(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED) using ExampleDefaultAcc = alpaka::AccCpuThreads; -#elif defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED) - using ExampleDefaultAcc = alpaka::AccCpuSerial; #elif defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_BACKEND_ONEAPI) # if defined(ALPAKA_SYCL_ONEAPI_CPU) - using ExampleDefaultAcc = alpaka::AccCpuSyclIntel; + using ExampleDefaultAcc = alpaka::AccCpuSycl; # elif defined(ALPAKA_SYCL_ONEAPI_FPGA) using ExampleDefaultAcc = alpaka::AccFpgaSyclIntel; # elif defined(ALPAKA_SYCL_ONEAPI_GPU) using ExampleDefaultAcc = alpaka::AccGpuSyclIntel; # endif +#elif defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED) + using ExampleDefaultAcc = alpaka::AccCpuSerial; #else class ExampleDefaultAcc; # warning "No supported backend selected." diff --git a/include/alpaka/idx/bt/IdxBtGenericSycl.hpp b/include/alpaka/idx/bt/IdxBtGenericSycl.hpp index 88628d343c18..54ef78014f1f 100644 --- a/include/alpaka/idx/bt/IdxBtGenericSycl.hpp +++ b/include/alpaka/idx/bt/IdxBtGenericSycl.hpp @@ -1,4 +1,4 @@ -/* Copyright 2022 Jan Stephan +/* Copyright 2023 Jan Stephan, Aurora Perego * SPDX-License-Identifier: MPL-2.0 */ @@ -12,7 +12,7 @@ #ifdef ALPAKA_ACC_SYCL_ENABLED -# include +# include namespace alpaka::bt { @@ -23,11 +23,11 @@ namespace alpaka::bt public: using IdxBtBase = IdxBtGenericSycl; - explicit IdxBtGenericSycl(sycl::nd_item work_item) : my_item{work_item} + explicit IdxBtGenericSycl(sycl::nd_item work_item) : m_item_bt{work_item} { } - sycl::nd_item my_item; + sycl::nd_item m_item_bt; }; } // namespace alpaka::bt @@ -49,19 +49,19 @@ namespace alpaka::trait static auto getIdx(bt::IdxBtGenericSycl const& idx, TWorkDiv const&) -> Vec { if constexpr(TDim::value == 1) - return Vec{static_cast(idx.my_item.get_local_id(0))}; + return Vec{static_cast(idx.m_item_bt.get_local_id(0))}; else if constexpr(TDim::value == 2) { return Vec{ - static_cast(idx.my_item.get_local_id(1)), - static_cast(idx.my_item.get_local_id(0))}; + static_cast(idx.m_item_bt.get_local_id(1)), + static_cast(idx.m_item_bt.get_local_id(0))}; } else { return Vec{ - static_cast(idx.my_item.get_local_id(2)), - static_cast(idx.my_item.get_local_id(1)), - static_cast(idx.my_item.get_local_id(0))}; + static_cast(idx.m_item_bt.get_local_id(2)), + static_cast(idx.m_item_bt.get_local_id(1)), + static_cast(idx.m_item_bt.get_local_id(0))}; } } }; diff --git a/include/alpaka/idx/gb/IdxGbGenericSycl.hpp b/include/alpaka/idx/gb/IdxGbGenericSycl.hpp index 8fa8a5bb0f35..42547effd6c3 100644 --- a/include/alpaka/idx/gb/IdxGbGenericSycl.hpp +++ b/include/alpaka/idx/gb/IdxGbGenericSycl.hpp @@ -1,4 +1,4 @@ -/* Copyright 2022 Jan Stephan +/* Copyright 2023 Jan Stephan, Aurora Perego * SPDX-License-Identifier: MPL-2.0 */ @@ -12,7 +12,7 @@ #ifdef ALPAKA_ACC_SYCL_ENABLED -# include +# include namespace alpaka::gb { @@ -23,11 +23,11 @@ namespace alpaka::gb public: using IdxGbBase = IdxGbGenericSycl; - explicit IdxGbGenericSycl(sycl::nd_item work_item) : my_item{work_item} + explicit IdxGbGenericSycl(sycl::nd_item work_item) : m_item_gb{work_item} { } - sycl::nd_item my_item; + sycl::nd_item m_item_gb; }; } // namespace alpaka::gb @@ -49,19 +49,19 @@ namespace alpaka::trait static auto getIdx(gb::IdxGbGenericSycl const& idx, TWorkDiv const&) { if constexpr(TDim::value == 1) - return Vec(static_cast(idx.my_item.get_group(0))); + return Vec(static_cast(idx.m_item_gb.get_group(0))); else if constexpr(TDim::value == 2) { return Vec( - static_cast(idx.my_item.get_group(1)), - static_cast(idx.my_item.get_group(0))); + static_cast(idx.m_item_gb.get_group(1)), + static_cast(idx.m_item_gb.get_group(0))); } else { return Vec( - static_cast(idx.my_item.get_group(2)), - static_cast(idx.my_item.get_group(1)), - static_cast(idx.my_item.get_group(0))); + static_cast(idx.m_item_gb.get_group(2)), + static_cast(idx.m_item_gb.get_group(1)), + static_cast(idx.m_item_gb.get_group(0))); } } }; diff --git a/include/alpaka/intrinsic/IntrinsicGenericSycl.hpp b/include/alpaka/intrinsic/IntrinsicGenericSycl.hpp index 700442fb4520..395043a9cd95 100644 --- a/include/alpaka/intrinsic/IntrinsicGenericSycl.hpp +++ b/include/alpaka/intrinsic/IntrinsicGenericSycl.hpp @@ -11,7 +11,7 @@ #ifdef ALPAKA_ACC_SYCL_ENABLED -# include +# include namespace alpaka { diff --git a/include/alpaka/kernel/SyclSubgroupSize.hpp b/include/alpaka/kernel/SyclSubgroupSize.hpp new file mode 100644 index 000000000000..b56b652c7b9e --- /dev/null +++ b/include/alpaka/kernel/SyclSubgroupSize.hpp @@ -0,0 +1,97 @@ +/* Copyright 2023 Andrea Bocci, Aurora Perego + * SPDX-License-Identifier: MPL-2.0 + */ + +#ifdef ALPAKA_ACC_SYCL_ENABLED + +# ifdef __SYCL_DEVICE_ONLY__ + +# if defined(__SYCL_TARGET_INTEL_GPU_BDW__) || /* Broadwell Intel graphics architecture */ \ + defined(__SYCL_TARGET_INTEL_GPU_SKL__) || /* Skylake Intel graphics architecture */ \ + defined(__SYCL_TARGET_INTEL_GPU_KBL__) || /* Kaby Lake Intel graphics architecture */ \ + defined(__SYCL_TARGET_INTEL_GPU_CFL__) || /* Coffee Lake Intel graphics architecture */ \ + defined(__SYCL_TARGET_INTEL_GPU_APL__) || /* Apollo Lake Intel graphics architecture */ \ + defined(__SYCL_TARGET_INTEL_GPU_GLK__) || /* Gemini Lake Intel graphics architecture */ \ + defined(__SYCL_TARGET_INTEL_GPU_WHL__) || /* Whiskey Lake Intel graphics architecture */ \ + defined(__SYCL_TARGET_INTEL_GPU_AML__) || /* Amber Lake Intel graphics architecture */ \ + defined(__SYCL_TARGET_INTEL_GPU_CML__) || /* Comet Lake Intel graphics architecture */ \ + defined(__SYCL_TARGET_INTEL_GPU_ICLLP__) || /* Ice Lake Intel graphics architecture */ \ + defined(__SYCL_TARGET_INTEL_GPU_TGLLP__) || /* Tiger Lake Intel graphics architecture */ \ + defined(__SYCL_TARGET_INTEL_GPU_RKL__) || /* Rocket Lake Intel graphics architecture */ \ + defined(__SYCL_TARGET_INTEL_GPU_ADL_S__) || /* Alder Lake S Intel graphics architecture */ \ + defined(__SYCL_TARGET_INTEL_GPU_RPL_S__) || /* Raptor Lake Intel graphics architecture */ \ + defined(__SYCL_TARGET_INTEL_GPU_ADL_P__) || /* Alder Lake P Intel graphics architecture */ \ + defined(__SYCL_TARGET_INTEL_GPU_ADL_N__) || /* Alder Lake N Intel graphics architecture */ \ + defined(__SYCL_TARGET_INTEL_GPU_DG1__) || /* DG1 Intel graphics architecture */ \ + defined(__SYCL_TARGET_INTEL_GPU_ACM_G10__) || /* Alchemist G10 Intel graphics architecture */ \ + defined(__SYCL_TARGET_INTEL_GPU_ACM_G11__) || /* Alchemist G11 Intel graphics architecture */ \ + defined(__SYCL_TARGET_INTEL_GPU_ACM_G12__) /* Alchemist G12 Intel graphics architecture */ + +# define SYCL_SUBGROUP_SIZE (8 | 16 | 32) + +# elif defined(__SYCL_TARGET_INTEL_GPU_PVC__) /* Ponte Vecchio Intel graphics architecture */ + +# define SYCL_SUBGROUP_SIZE (16 | 32) + +# elif defined(__SYCL_TARGET_INTEL_X86_64__) /* generate code ahead of time for x86_64 CPUs */ + +# define SYCL_SUBGROUP_SIZE (4 | 8 | 16 | 32 | 64) + +# elif defined(__SYCL_TARGET_NVIDIA_GPU_SM_50__) || /* NVIDIA Maxwell architecture (compute capability 5.0) */ \ + defined(__SYCL_TARGET_NVIDIA_GPU_SM_52__) || /* NVIDIA Maxwell architecture (compute capability 5.2) */ \ + defined(__SYCL_TARGET_NVIDIA_GPU_SM_53__) || /* NVIDIA Jetson TX1 / Nano (compute capability 5.3) */ \ + defined(__SYCL_TARGET_NVIDIA_GPU_SM_60__) || /* NVIDIA Pascal architecture (compute capability 6.0) */ \ + defined(__SYCL_TARGET_NVIDIA_GPU_SM_61__) || /* NVIDIA Pascal architecture (compute capability 6.1) */ \ + defined(__SYCL_TARGET_NVIDIA_GPU_SM_62__) || /* NVIDIA Jetson TX2 (compute capability 6.2) */ \ + defined(__SYCL_TARGET_NVIDIA_GPU_SM_70__) || /* NVIDIA Volta architecture (compute capability 7.0) */ \ + defined(__SYCL_TARGET_NVIDIA_GPU_SM_72__) || /* NVIDIA Jetson AGX (compute capability 7.2) */ \ + defined(__SYCL_TARGET_NVIDIA_GPU_SM_75__) || /* NVIDIA Turing architecture (compute capability 7.5) */ \ + defined(__SYCL_TARGET_NVIDIA_GPU_SM_80__) || /* NVIDIA Ampere architecture (compute capability 8.0) */ \ + defined(__SYCL_TARGET_NVIDIA_GPU_SM_86__) || /* NVIDIA Ampere architecture (compute capability 8.6) */ \ + defined(__SYCL_TARGET_NVIDIA_GPU_SM_87__) || /* NVIDIA Jetson/Drive AGX Orin (compute capability 8.7) */ \ + defined(__SYCL_TARGET_NVIDIA_GPU_SM_89__) || /* NVIDIA Ada Lovelace arch. (compute capability 8.9) */ \ + defined(__SYCL_TARGET_NVIDIA_GPU_SM_90__) /* NVIDIA Hopper architecture (compute capability 9.0) */ + +# define SYCL_SUBGROUP_SIZE (32) + +# elif defined(__SYCL_TARGET_AMD_GPU_GFX700__) || /* AMD GCN 2.0 Sea Islands architecture (gfx 7.0) */ \ + defined(__SYCL_TARGET_AMD_GPU_GFX701__) || /* AMD GCN 2.0 Sea Islands architecture (gfx 7.0) */ \ + defined(__SYCL_TARGET_AMD_GPU_GFX702__) || /* AMD GCN 2.0 Sea Islands architecture (gfx 7.0) */ \ + defined(__SYCL_TARGET_AMD_GPU_GFX801__) || /* AMD GCN 3.0 Volcanic Islands architecture (gfx 8.0) */ \ + defined(__SYCL_TARGET_AMD_GPU_GFX802__) || /* AMD GCN 3.0 Volcanic Islands architecture (gfx 8.0) */ \ + defined(__SYCL_TARGET_AMD_GPU_GFX803__) || /* AMD GCN 4.0 Arctic Islands architecture (gfx 8.0) */ \ + defined(__SYCL_TARGET_AMD_GPU_GFX805__) || /* AMD GCN 3.0 Volcanic Islands architecture (gfx 8.0) */ \ + defined(__SYCL_TARGET_AMD_GPU_GFX810__) || /* AMD GCN 3.0 Volcanic Islands architecture (gfx 8.1) */ \ + defined(__SYCL_TARGET_AMD_GPU_GFX900__) || /* AMD GCN 5.0 Vega architecture (gfx 9.0) */ \ + defined(__SYCL_TARGET_AMD_GPU_GFX902__) || /* AMD GCN 5.0 Vega architecture (gfx 9.0) */ \ + defined(__SYCL_TARGET_AMD_GPU_GFX904__) || /* AMD GCN 5.0 Vega architecture (gfx 9.0) */ \ + defined(__SYCL_TARGET_AMD_GPU_GFX906__) || /* AMD GCN 5.1 Vega II architecture (gfx 9.0) */ \ + defined(__SYCL_TARGET_AMD_GPU_GFX908__) || /* AMD CDNA 1.0 Arcturus architecture (gfx 9.0) */ \ + defined(__SYCL_TARGET_AMD_GPU_GFX90A__) /* AMD CDNA 2.0 Aldebaran architecture (gfx 9.0) */ + +# define SYCL_SUBGROUP_SIZE (64) + +# elif defined(__SYCL_TARGET_AMD_GPU_GFX1010__) || /* AMD RDNA 1.0 Navi 10 architecture (gfx 10.1) */ \ + defined(__SYCL_TARGET_AMD_GPU_GFX1011__) || /* AMD RDNA 1.0 Navi 12 architecture (gfx 10.1) */ \ + defined(__SYCL_TARGET_AMD_GPU_GFX1012__) || /* AMD RDNA 1.0 Navi 14 architecture (gfx 10.1) */ \ + defined(__SYCL_TARGET_AMD_GPU_GFX1013__) || /* AMD RDNA 2.0 Oberon architecture (gfx 10.1) */ \ + defined(__SYCL_TARGET_AMD_GPU_GFX1030__) || /* AMD RDNA 2.0 Navi 21 architecture (gfx 10.3) */ \ + defined(__SYCL_TARGET_AMD_GPU_GFX1031__) || /* AMD RDNA 2.0 Navi 22 architecture (gfx 10.3) */ \ + defined(__SYCL_TARGET_AMD_GPU_GFX1032__) || /* AMD RDNA 2.0 Navi 23 architecture (gfx 10.3) */ \ + defined(__SYCL_TARGET_AMD_GPU_GFX1034__) /* AMD RDNA 2.0 Navi 24 architecture (gfx 10.3) */ + +# define SYCL_SUBGROUP_SIZE (32 | 64) + +# else // __SYCL_TARGET_* + +# define SYCL_SUBGROUP_SIZE (0) /* unknown target */ + +# endif // __SYCL_TARGET_* + +# else + +# define SYCL_SUBGROUP_SIZE (0) /* host compilation */ + +# endif // __SYCL_DEVICE_ONLY__ + +#endif // ALPAKA_ACC_SYCL_ENABLED diff --git a/include/alpaka/kernel/TaskKernelCpuSyclIntel.hpp b/include/alpaka/kernel/TaskKernelCpuSycl.hpp similarity index 67% rename from include/alpaka/kernel/TaskKernelCpuSyclIntel.hpp rename to include/alpaka/kernel/TaskKernelCpuSycl.hpp index 18b67d51a8e1..abb8c9a81d03 100644 --- a/include/alpaka/kernel/TaskKernelCpuSyclIntel.hpp +++ b/include/alpaka/kernel/TaskKernelCpuSycl.hpp @@ -1,4 +1,4 @@ -/* Copyright 2022 Jan Stephan +/* Copyright 2023 Jan Stephan, Luca Ferragina, Andrea Bocci * SPDX-License-Identifier: MPL-2.0 */ @@ -11,11 +11,10 @@ namespace alpaka { template - class AccCpuSyclIntel; + class AccCpuSycl; template - using TaskKernelCpuSyclIntel - = TaskKernelGenericSycl, TDim, TIdx, TKernelFnObj, TArgs...>; + using TaskKernelCpuSycl = TaskKernelGenericSycl, TDim, TIdx, TKernelFnObj, TArgs...>; } // namespace alpaka #endif diff --git a/include/alpaka/kernel/TaskKernelGenericSycl.hpp b/include/alpaka/kernel/TaskKernelGenericSycl.hpp index 8dfb20fc6d92..92eab6fa5221 100644 --- a/include/alpaka/kernel/TaskKernelGenericSycl.hpp +++ b/include/alpaka/kernel/TaskKernelGenericSycl.hpp @@ -1,10 +1,11 @@ -/* Copyright 2023 Jan Stephan, Andrea Bocci +/* Copyright 2023 Jan Stephan, Andrea Bocci, Luca Ferragina, Aurora Perego * SPDX-License-Identifier: MPL-2.0 */ #pragma once #include "alpaka/acc/Traits.hpp" +#include "alpaka/block/shared/dyn/BlockSharedDynMemberAllocKiB.hpp" #include "alpaka/core/BoostPredef.hpp" #include "alpaka/core/STLTuple/STLTuple.hpp" #include "alpaka/core/Sycl.hpp" @@ -12,6 +13,7 @@ #include "alpaka/dev/Traits.hpp" #include "alpaka/dim/Traits.hpp" #include "alpaka/idx/Traits.hpp" +#include "alpaka/kernel/SyclSubgroupSize.hpp" #include "alpaka/kernel/Traits.hpp" #include "alpaka/mem/buf/sycl/Accessor.hpp" #include "alpaka/pltf/Traits.hpp" @@ -28,7 +30,71 @@ #ifdef ALPAKA_ACC_SYCL_ENABLED -# include +# if BOOST_COMP_CLANG +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wunused-lambda-capture" +# pragma clang diagnostic ignored "-Wunused-parameter" +# endif + +# include + +# define LAUNCH_SYCL_KERNEL_IF_SUBGROUP_SIZE_IS(sub_group_size) \ + cgh.parallel_for( \ + sycl::nd_range{global_size, local_size}, \ + [item_elements, \ + dyn_shared_accessor, \ + st_shared_accessor, \ + global_fence_dummy, \ + local_fence_dummy, \ + k_func, \ + k_args](sycl::nd_item work_item) [[intel::reqd_sub_group_size(sub_group_size)]] \ + { \ + auto acc = TAcc{ \ + item_elements, \ + work_item, \ + dyn_shared_accessor, \ + st_shared_accessor, \ + global_fence_dummy, \ + local_fence_dummy}; \ + core::apply( \ + [k_func, &acc](typename std::decay_t const&... args) { k_func(acc, args...); }, \ + k_args); \ + }); + +# define LAUNCH_SYCL_KERNEL_WITH_DEFAULT_SUBGROUP_SIZE \ + cgh.parallel_for( \ + sycl::nd_range{global_size, local_size}, \ + [item_elements, \ + dyn_shared_accessor, \ + st_shared_accessor, \ + global_fence_dummy, \ + local_fence_dummy, \ + k_func, \ + k_args](sycl::nd_item work_item) \ + { \ + auto acc = TAcc{ \ + item_elements, \ + work_item, \ + dyn_shared_accessor, \ + st_shared_accessor, \ + global_fence_dummy, \ + local_fence_dummy}; \ + core::apply( \ + [k_func, &acc](typename std::decay_t const&... args) { k_func(acc, args...); }, \ + k_args); \ + }); + +# define THROW_AND_LAUNCH_EMPTY_SYCL_KERNEL \ + throw sycl::exception(sycl::make_error_code(sycl::errc::kernel_not_supported)); \ + cgh.parallel_for( \ + sycl::nd_range{global_size, local_size}, \ + [item_elements, \ + dyn_shared_accessor, \ + st_shared_accessor, \ + global_fence_dummy, \ + local_fence_dummy, \ + k_func, \ + k_args](sycl::nd_item work_item) {}); namespace alpaka::detail { @@ -48,9 +114,8 @@ namespace alpaka::detail template inline auto require( sycl::handler& cgh, - experimental:: - Accessor::value, TAccessModes>, TElem, TIdx, TDim, TAccessModes> - acc, + experimental::Accessor::value, TAccessModes>, TElem, TIdx, TDim, TAccessModes> + acc, special) { cgh.require(acc.m_accessor); @@ -120,48 +185,87 @@ namespace alpaka auto k_func = m_kernelFnObj; auto k_args = m_args; -# ifdef ALPAKA_SYCL_IOSTREAM_ENABLED - // Set up device-side printing with (user-chosen value) KiB per block for the output buffer. - constexpr auto buf_size = std::size_t{ALPAKA_SYCL_IOSTREAM_KIB * 1024}; - auto buf_per_work_item = std::size_t{}; - if constexpr(TDim::value == 1) - buf_per_work_item = buf_size / static_cast(group_items[0]); - else if constexpr(TDim::value == 2) - buf_per_work_item = buf_size / static_cast(group_items[0] * group_items[1]); - else - buf_per_work_item - = buf_size / static_cast(group_items[0] * group_items[1] * group_items[2]); - - assert(buf_per_work_item > 0); + constexpr std::size_t sub_group_size = trait::warpSize; + bool supported = false; - auto output_stream = sycl::stream{buf_size, buf_per_work_item, cgh}; -# endif - cgh.parallel_for>( - sycl::nd_range{global_size, local_size}, - [=](sycl::nd_item work_item) - { -# ifdef ALPAKA_SYCL_IOSTREAM_ENABLED - auto acc = TAcc{ - item_elements, - work_item, - dyn_shared_accessor, - st_shared_accessor, - global_fence_dummy, - local_fence_dummy, - output_stream}; + if constexpr(sub_group_size == 0) + { + // no explicit subgroup size requirement + LAUNCH_SYCL_KERNEL_WITH_DEFAULT_SUBGROUP_SIZE + supported = true; + } + else + { +# if(SYCL_SUBGROUP_SIZE == 0) + // no explicit SYCL target, assume JIT compilation + LAUNCH_SYCL_KERNEL_IF_SUBGROUP_SIZE_IS(sub_group_size) + supported = true; # else - auto acc = TAcc{ - item_elements, - work_item, - dyn_shared_accessor, - st_shared_accessor, - global_fence_dummy, - local_fence_dummy}; + // check if the kernel should be launched with a subgroup size of 4 + if constexpr(sub_group_size == 4) + { +# if(SYCL_SUBGROUP_SIZE & 4) + LAUNCH_SYCL_KERNEL_IF_SUBGROUP_SIZE_IS(4) + supported = true; +# else + // empty kernel, required to keep SYCL happy + THROW_AND_LAUNCH_EMPTY_SYCL_KERNEL +# endif + } + + // check if the kernel should be launched with a subgroup size of 8 + if constexpr(sub_group_size == 8) + { +# if(SYCL_SUBGROUP_SIZE & 8) + LAUNCH_SYCL_KERNEL_IF_SUBGROUP_SIZE_IS(8) + supported = true; +# else + // empty kernel, required to keep SYCL happy + THROW_AND_LAUNCH_EMPTY_SYCL_KERNEL +# endif + } + + // check if the kernel should be launched with a subgroup size of 16 + if constexpr(sub_group_size == 16) + { +# if(SYCL_SUBGROUP_SIZE & 16) + LAUNCH_SYCL_KERNEL_IF_SUBGROUP_SIZE_IS(16) + supported = true; +# else + // empty kernel, required to keep SYCL happy + THROW_AND_LAUNCH_EMPTY_SYCL_KERNEL +# endif + } + + // check if the kernel should be launched with a subgroup size of 32 + if constexpr(sub_group_size == 32) + { +# if(SYCL_SUBGROUP_SIZE & 32) + LAUNCH_SYCL_KERNEL_IF_SUBGROUP_SIZE_IS(32) + supported = true; +# else + // empty kernel, required to keep SYCL happy + THROW_AND_LAUNCH_EMPTY_SYCL_KERNEL +# endif + } + + // check if the kernel should be launched with a subgroup size of 64 + if constexpr(sub_group_size == 64) + { +# if(SYCL_SUBGROUP_SIZE & 64) + LAUNCH_SYCL_KERNEL_IF_SUBGROUP_SIZE_IS(64) + supported = true; +# else + // empty kernel, required to keep SYCL happy + THROW_AND_LAUNCH_EMPTY_SYCL_KERNEL +# endif + } # endif - core::apply( - [k_func, &acc](typename std::decay_t const&... args) { k_func(acc, args...); }, - k_args); - }); + + // this subgroup size is not supported, raise an exception + if(not supported) + throw sycl::exception(sycl::make_error_code(sycl::errc::kernel_not_supported)); + } } static constexpr auto is_sycl_task = true; @@ -203,8 +307,13 @@ namespace alpaka TKernelFnObj m_kernelFnObj; core::Tuple...> m_args; }; + } // namespace alpaka +# if BOOST_COMP_CLANG +# pragma clang diagnostic pop +# endif + namespace alpaka::trait { //! The SYCL execution task accelerator type trait specialization. @@ -243,4 +352,6 @@ namespace alpaka::trait }; } // namespace alpaka::trait +# undef LAUNCH_SYCL_KERNEL_IF_SUBGROUP_SIZE_IS + #endif diff --git a/include/alpaka/kernel/Traits.hpp b/include/alpaka/kernel/Traits.hpp index 33032cdb5c57..384b82873759 100644 --- a/include/alpaka/kernel/Traits.hpp +++ b/include/alpaka/kernel/Traits.hpp @@ -1,4 +1,5 @@ -/* Copyright 2022 Axel Huebl, Benjamin Worpitz, René Widera, Sergei Bastrakov, Jan Stephan, Bernhard Manfred Gruber +/* Copyright 2023 Axel Huebl, Benjamin Worpitz, René Widera, Sergei Bastrakov, Jan Stephan, Bernhard Manfred Gruber, + * Andrea Bocci, Aurora Perego * SPDX-License-Identifier: MPL-2.0 */ @@ -68,6 +69,21 @@ namespace alpaka } }; + //! The trait for getting the warp size required by a kernel. + //! + //! \tparam TKernelFnObj The kernel function object. + //! \tparam TAcc The accelerator. + //! + //! The default implementation returns 0, which lets the accelerator compiler and runtime choose the warp size. + template + struct WarpSize : std::integral_constant + { + }; + + //! This is a shortcut for the trait defined above + template + inline constexpr std::uint32_t warpSize = WarpSize::value; + //! The trait for getting the schedule to use when a kernel is run using the CpuOmp2Blocks accelerator. //! //! Has no effect on other accelerators. diff --git a/include/alpaka/math/MathGenericSycl.hpp b/include/alpaka/math/MathGenericSycl.hpp index cf7233826741..53898f210680 100644 --- a/include/alpaka/math/MathGenericSycl.hpp +++ b/include/alpaka/math/MathGenericSycl.hpp @@ -1,4 +1,4 @@ -/* Copyright 2022 Jan Stephan +/* Copyright 2023 Jan Stephan, Sergei Bastrakov, René Widera, Luca Ferragina, Andrea Bocci * SPDX-License-Identifier: MPL-2.0 */ @@ -12,7 +12,7 @@ #ifdef ALPAKA_ACC_SYCL_ENABLED -# include +# include //! The mathematical operation specifics. namespace alpaka::math @@ -279,7 +279,7 @@ namespace alpaka::math::trait if constexpr(std::is_integral_v) return sycl::atan2(0.0, static_cast(argument)); else if constexpr(std::is_floating_point_v) - return sycl::atan2(TArgument{0.0}, argument); + return sycl::atan2(static_cast(0.0), argument); else static_assert(!sizeof(TArgument), "Unsupported data type"); } @@ -333,9 +333,11 @@ namespace alpaka::math::trait Tx, std::enable_if_t && std::is_floating_point_v>> { + using TCommon = std::common_type_t; + auto operator()(math::Atan2GenericSycl const&, Ty const& y, Tx const& x) { - return sycl::atan2(y, x); + return sycl::atan2(static_cast(y), static_cast(x)); } }; @@ -432,9 +434,11 @@ namespace alpaka::math::trait Ty, std::enable_if_t && std::is_floating_point_v>> { + using TCommon = std::common_type_t; + auto operator()(math::FmodGenericSycl const&, Tx const& x, Ty const& y) { - return sycl::fmod(x, y); + return sycl::fmod(static_cast(x), static_cast(y)); } }; @@ -444,7 +448,7 @@ namespace alpaka::math::trait { auto operator()(math::IsfiniteGenericSycl const&, TArg const& arg) { - return sycl::isfinite(arg); + return static_cast(sycl::isfinite(arg)); } }; @@ -454,7 +458,7 @@ namespace alpaka::math::trait { auto operator()(math::IsinfGenericSycl const&, TArg const& arg) { - return sycl::isinf(arg); + return static_cast(sycl::isinf(arg)); } }; @@ -464,7 +468,7 @@ namespace alpaka::math::trait { auto operator()(math::IsnanGenericSycl const&, TArg const& arg) { - return sycl::isnan(arg); + return static_cast(sycl::isnan(arg)); } }; @@ -482,18 +486,20 @@ namespace alpaka::math::trait template struct Max && std::is_arithmetic_v>> { + using TCommon = std::common_type_t; + auto operator()(math::MaxGenericSycl const&, Tx const& x, Ty const& y) { if constexpr(std::is_integral_v && std::is_integral_v) - return sycl::max(x, y); + return sycl::max(static_cast(x), static_cast(y)); else if constexpr(std::is_floating_point_v && std::is_floating_point_v) - return sycl::fmax(x, y); + return sycl::fmax(static_cast(x), static_cast(y)); else if constexpr( (std::is_floating_point_v && std::is_integral_v) || (std::is_integral_v && std::is_floating_point_v) ) return sycl::fmax(static_cast(x), static_cast(y)); // mirror CUDA back-end else - static_assert(!sizeof(Tx), "Unsupported data type"); + static_assert(!sizeof(Tx), "Unsupported data types"); } }; @@ -512,7 +518,7 @@ namespace alpaka::math::trait || (std::is_integral_v && std::is_floating_point_v) ) return sycl::fmin(static_cast(x), static_cast(y)); // mirror CUDA back-end else - static_assert(!sizeof(Tx), "Unsupported data type"); + static_assert(!sizeof(Tx), "Unsupported data types"); } }; @@ -524,9 +530,11 @@ namespace alpaka::math::trait TExp, std::enable_if_t && std::is_floating_point_v>> { + using TCommon = std::common_type_t; + auto operator()(math::PowGenericSycl const&, TBase const& base, TExp const& exp) { - return sycl::pow(base, exp); + return sycl::pow(static_cast(base), static_cast(exp)); } }; @@ -538,9 +546,11 @@ namespace alpaka::math::trait Ty, std::enable_if_t && std::is_floating_point_v>> { + using TCommon = std::common_type_t; + auto operator()(math::RemainderGenericSycl const&, Tx const& x, Ty const& y) { - return sycl::remainder(x, y); + return sycl::remainder(static_cast(x), static_cast(y)); } }; @@ -580,9 +590,9 @@ namespace alpaka::math::trait { auto operator()(math::RsqrtGenericSycl const&, TArg const& arg) { - if(std::is_floating_point_v) + if constexpr(std::is_floating_point_v) return sycl::rsqrt(arg); - else if(std::is_integral_v) + else if constexpr(std::is_integral_v) return sycl::rsqrt(static_cast(arg)); // mirror CUDA back-end and use double for ints else static_assert(!sizeof(TArg), "Unsupported data type"); diff --git a/include/alpaka/mem/buf/BufCpuSyclIntel.hpp b/include/alpaka/mem/buf/BufCpuSycl.hpp similarity index 64% rename from include/alpaka/mem/buf/BufCpuSyclIntel.hpp rename to include/alpaka/mem/buf/BufCpuSycl.hpp index 967dbc8141a4..1dbdc9f3a2ad 100644 --- a/include/alpaka/mem/buf/BufCpuSyclIntel.hpp +++ b/include/alpaka/mem/buf/BufCpuSycl.hpp @@ -1,10 +1,10 @@ -/* Copyright 2022 Jan Stephan +/* Copyright 2023 Jan Stephan, Luca Ferragina, Andrea Bocci * SPDX-License-Identifier: MPL-2.0 */ #pragma once -#include "alpaka/dev/DevCpuSyclIntel.hpp" +#include "alpaka/dev/DevCpuSycl.hpp" #include "alpaka/mem/buf/BufGenericSycl.hpp" #if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_BACKEND_ONEAPI) && defined(ALPAKA_SYCL_ONEAPI_CPU) @@ -12,7 +12,7 @@ namespace alpaka { template - using BufCpuSyclIntel = BufGenericSycl; + using BufCpuSycl = BufGenericSycl; } #endif diff --git a/include/alpaka/mem/buf/BufGenericSycl.hpp b/include/alpaka/mem/buf/BufGenericSycl.hpp index 825fad4e37f1..c8849cedf7dd 100644 --- a/include/alpaka/mem/buf/BufGenericSycl.hpp +++ b/include/alpaka/mem/buf/BufGenericSycl.hpp @@ -1,4 +1,4 @@ -/* Copyright 2022 Jan Stephan +/* Copyright 2023 Jan Stephan, Luca Ferragina, Aurora Perego, Andrea Bocci * SPDX-License-Identifier: MPL-2.0 */ @@ -12,6 +12,7 @@ #include "alpaka/mem/buf/BufCpu.hpp" #include "alpaka/mem/buf/Traits.hpp" #include "alpaka/mem/view/Accessor.hpp" +#include "alpaka/mem/view/ViewAccessOps.hpp" #include "alpaka/vec/Vec.hpp" #include @@ -19,27 +20,27 @@ #ifdef ALPAKA_ACC_SYCL_ENABLED -# include +# include namespace alpaka { //! The SYCL memory buffer. - template - class BufGenericSycl + template + class BufGenericSycl : public internal::ViewAccessOps> { + public: static_assert( !std::is_const_v, "The elem type of the buffer can not be const because the C++ Standard forbids containers of const " "elements!"); static_assert(!std::is_const_v, "The idx type of the buffer can not be const!"); - public: //! Constructor - template - BufGenericSycl(TDev const& dev, sycl::buffer buffer, TExtent const& extent) + template + BufGenericSycl(DevGenericSycl const& dev, TElem* const pMem, Deleter deleter, TExtent const& extent) : m_dev{dev} , m_extentElements{getExtentVecEnd(extent)} - , m_buffer{buffer} + , m_spMem(pMem, std::move(deleter)) { ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; @@ -53,92 +54,100 @@ namespace alpaka "The idx type of TExtent and the TIdx template parameter have to be identical!"); } - TDev m_dev; + DevGenericSycl m_dev; Vec m_extentElements; - sycl::buffer m_buffer; + std::shared_ptr m_spMem; }; } // namespace alpaka namespace alpaka::trait { //! The BufGenericSycl device type trait specialization. - template - struct DevType> + template + struct DevType> { - using type = TDev; + using type = DevGenericSycl; }; //! The BufGenericSycl device get trait specialization. - template - struct GetDev> + template + struct GetDev> { - static auto getDev(BufGenericSycl const& buf) + static auto getDev(BufGenericSycl const& buf) { return buf.m_dev; } }; //! The BufGenericSycl dimension getter trait specialization. - template - struct DimType> + template + struct DimType> { using type = TDim; }; //! The BufGenericSycl memory element type get trait specialization. - template - struct ElemType> + template + struct ElemType> { using type = TElem; }; //! The BufGenericSycl extent get trait specialization. - template - struct GetExtent> + template + struct GetExtent> { static_assert(TDim::value > TIdxIntegralConst::value, "Requested dimension out of bounds"); - static auto getExtent(BufGenericSycl const& buf) -> TIdx + static auto getExtent(BufGenericSycl const& buf) -> TIdx { return buf.m_extentElements[TIdxIntegralConst::value]; } }; //! The BufGenericSycl native pointer get trait specialization. - template - struct GetPtrNative> + template + struct GetPtrNative> { - static_assert( - !sizeof(TElem), - "Accessing device-side pointers on the host is not supported by the SYCL back-end"); - - static auto getPtrNative(BufGenericSycl const&) -> TElem const* + static auto getPtrNative(BufGenericSycl const& buf) -> TElem const* { - return nullptr; + return buf.m_spMem.get(); } - static auto getPtrNative(BufGenericSycl&) -> TElem* + static auto getPtrNative(BufGenericSycl& buf) -> TElem* { - return nullptr; + return buf.m_spMem.get(); } }; //! The BufGenericSycl pointer on device get trait specialization. - template - struct GetPtrDev, TDev> + template + struct GetPtrDev, DevGenericSycl> { - static_assert( - !sizeof(TElem), - "Accessing device-side pointers on the host is not supported by the SYCL back-end"); - - static auto getPtrDev(BufGenericSycl const&, TDev const&) -> TElem const* + static auto getPtrDev(BufGenericSycl const& buf, DevGenericSycl const& dev) + -> TElem const* { - return nullptr; + if(dev == getDev(buf)) + { + return buf.m_spMem.get(); + } + else + { + throw std::runtime_error("The buffer is not accessible from the given device!"); + } } - static auto getPtrDev(BufGenericSycl&, TDev const&) -> TElem* + static auto getPtrDev(BufGenericSycl& buf, DevGenericSycl const& dev) + -> TElem* { - return nullptr; + if(dev == getDev(buf)) + { + return buf.m_spMem.get(); + } + else + { + throw std::runtime_error("The buffer is not accessible from the given device!"); + } } }; @@ -147,68 +156,91 @@ namespace alpaka::trait struct BufAlloc> { template - static auto allocBuf(DevGenericSycl const& dev, TExtent const& ext) - -> BufGenericSycl> + static auto allocBuf(DevGenericSycl const& dev, TExtent const& extent) + -> BufGenericSycl { ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; +# if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL if constexpr(TDim::value == 0 || TDim::value == 1) { - auto const width = getWidth(ext); + auto const width = getWidth(extent); -# if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL auto const widthBytes = width * static_cast(sizeof(TElem)); std::cout << __func__ << " ew: " << width << " ewb: " << widthBytes << '\n'; -# endif - - auto const range = sycl::range<1>{width}; - return {dev, sycl::buffer{range}, ext}; } else if constexpr(TDim::value == 2) { - auto const width = getWidth(ext); - auto const height = getHeight(ext); + auto const width = getWidth(extent); + auto const height = getHeight(extent); -# if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL auto const widthBytes = width * static_cast(sizeof(TElem)); std::cout << __func__ << " ew: " << width << " eh: " << height << " ewb: " << widthBytes << " pitch: " << widthBytes << '\n'; -# endif - - auto const range = sycl::range<2>{width, height}; - return {dev, sycl::buffer{range}, ext}; } else if constexpr(TDim::value == 3) { - auto const width = getWidth(ext); - auto const height = getHeight(ext); - auto const depth = getDepth(ext); + auto const width = getWidth(extent); + auto const height = getHeight(extent); + auto const depth = getDepth(extent); -# if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL auto const widthBytes = width * static_cast(sizeof(TElem)); std::cout << __func__ << " ew: " << width << " eh: " << height << " ed: " << depth << " ewb: " << widthBytes << " pitch: " << widthBytes << '\n'; + } # endif - auto const range = sycl::range<3>{width, height, depth}; - return {dev, sycl::buffer{range}, ext}; - } + auto const& [nativeDev, nativeContext] = dev.getNativeHandle(); + TElem* memPtr = sycl::malloc_device( + static_cast(getExtentProduct(extent)), + nativeDev, + nativeContext); + // captured structured bindings are a C++20 extension + // auto deleter = [nativeContext](TElem* ptr) { sycl::free(ptr, nativeContext); }; + auto deleter = [&dev](TElem* ptr) { sycl::free(ptr, dev.getNativeHandle().second); }; + + return BufGenericSycl(dev, memPtr, std::move(deleter), extent); } }; + //! The BufGenericSycl stream-ordered memory allocation capability trait specialization. + template + struct HasAsyncBufSupport> : std::false_type + { + }; + //! The BufGenericSycl offset get trait specialization. - template - struct GetOffset> + template + struct GetOffset> { - static auto getOffset(BufGenericSycl const&) -> TIdx + static auto getOffset(BufGenericSycl const&) -> TIdx { return 0u; } }; + //! The pinned/mapped memory allocation trait specialization for the SYCL devices. + template + struct BufAllocMapped + { + template + static auto allocMappedBuf(DevCpu const& host, TExtent const& extent) -> BufCpu + { + ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; + + // Allocate SYCL page-locked memory on the host, mapped into the TPltf address space and + // accessible to all devices in the TPltf. + auto ctx = TPltf::syclContext(); + TElem* memPtr = sycl::malloc_host(static_cast(getExtentProduct(extent)), ctx); + auto deleter = [ctx](TElem* ptr) { sycl::free(ptr, ctx); }; + + return BufCpu(host, memPtr, std::move(deleter), extent); + } + }; + //! The BufGenericSycl idx type trait specialization. - template - struct IdxType> + template + struct IdxType> { using type = TIdx; }; @@ -217,16 +249,13 @@ namespace alpaka::trait template struct GetPtrDev, DevGenericSycl> { - static_assert(!sizeof(TElem), "Accessing host pointers on the device is not supported by the SYCL back-end"); - - static auto getPtrDev(BufCpu const&, DevGenericSycl const&) -> TElem const* + static auto getPtrDev(BufCpu const& buf, DevGenericSycl const&) -> TElem const* { - return nullptr; + return getPtrNative(buf); } - - static auto getPtrDev(BufCpu&, DevGenericSycl const&) -> TElem* + static auto getPtrDev(BufCpu& buf, DevGenericSycl const&) -> TElem* { - return nullptr; + return getPtrNative(buf); } }; } // namespace alpaka::trait diff --git a/include/alpaka/mem/buf/BufGpuSyclIntel.hpp b/include/alpaka/mem/buf/BufGpuSyclIntel.hpp index 0b27ab2ae1f9..cd12f975c8d4 100644 --- a/include/alpaka/mem/buf/BufGpuSyclIntel.hpp +++ b/include/alpaka/mem/buf/BufGpuSyclIntel.hpp @@ -1,4 +1,4 @@ -/* Copyright 2022 Jan Stephan +/* Copyright 2022 Jan Stephan, Luca Ferragina * SPDX-License-Identifier: MPL-2.0 */ @@ -12,7 +12,7 @@ namespace alpaka { template - using BufGpuSyclIntel = BufGenericSycl; + using BufGpuSyclIntel = BufGenericSycl; } #endif diff --git a/include/alpaka/mem/buf/sycl/Accessor.hpp b/include/alpaka/mem/buf/sycl/Accessor.hpp index 5b7aeba11dac..a83cd3d7d8e9 100644 --- a/include/alpaka/mem/buf/sycl/Accessor.hpp +++ b/include/alpaka/mem/buf/sycl/Accessor.hpp @@ -1,4 +1,4 @@ -/* Copyright 2023 Jan Stephan, Andrea Bocci +/* Copyright 2023 Jan Stephan, Andrea Bocci, Aurora Perego * SPDX-License-Identifier: MPL-2.0 */ @@ -13,7 +13,7 @@ #ifdef ALPAKA_ACC_SYCL_ENABLED -# include +# include namespace alpaka { @@ -112,30 +112,6 @@ namespace alpaka SyclAccessor m_accessor; VecType extents; }; - - namespace experimental::trait - { - namespace internal - { - template - struct IsView> : std::false_type - { - }; - } // namespace internal - - template - struct BuildAccessor> - { - template - static auto buildAccessor(BufGenericSycl& buffer) - { - using SyclAccessor = detail::SyclAccessor; - return Accessor{ - SyclAccessor{buffer.m_buffer}, - buffer.m_extentElements}; - } - }; - } // namespace experimental::trait } // namespace alpaka #endif diff --git a/include/alpaka/mem/buf/sycl/Common.hpp b/include/alpaka/mem/buf/sycl/Common.hpp index 80e9763ebf75..87058bc6ca92 100644 --- a/include/alpaka/mem/buf/sycl/Common.hpp +++ b/include/alpaka/mem/buf/sycl/Common.hpp @@ -12,7 +12,7 @@ #ifdef ALPAKA_ACC_SYCL_ENABLED -# include +# include namespace alpaka::detail { diff --git a/include/alpaka/mem/buf/sycl/Copy.hpp b/include/alpaka/mem/buf/sycl/Copy.hpp index e6b1ae03836e..8e8cf533f747 100644 --- a/include/alpaka/mem/buf/sycl/Copy.hpp +++ b/include/alpaka/mem/buf/sycl/Copy.hpp @@ -1,8 +1,7 @@ -/* Copyright 2022 Jan Stephan +/* Copyright 2023 Jan Stephan, Bernhard Manfred Gruber, Luca Ferragina, Aurora Perego, Andrea Bocci * SPDX-License-Identifier: MPL-2.0 */ - #pragma once #include "alpaka/core/Debug.hpp" @@ -14,51 +13,187 @@ #include "alpaka/extent/Traits.hpp" #include "alpaka/mem/buf/sycl/Common.hpp" #include "alpaka/mem/view/Traits.hpp" +#include "alpaka/meta/NdLoop.hpp" +#include "alpaka/queue/QueueGenericSyclBlocking.hpp" +#include "alpaka/queue/QueueGenericSyclNonBlocking.hpp" #include #include #ifdef ALPAKA_ACC_SYCL_ENABLED -# include +# include namespace alpaka::detail { - template - using SrcAccessor = sycl:: - accessor; - - template - using DstAccessor = sycl::accessor< - TElem, - TDim, - sycl::access_mode::write, - sycl::target::global_buffer, - sycl::access::placeholder::true_t>; - - enum class Direction + //! The SYCL device memory copy task base. + template + struct TaskCopySyclBase { - h2d, - d2h, - d2d + static_assert( + std::is_same_v>, std::remove_const_t>>, + "The source and the destination view are required to have the same element type!"); + using ExtentSize = Idx; + using DstSize = Idx; + using SrcSize = Idx; + using Elem = alpaka::Elem; + + template + TaskCopySyclBase(TViewFwd&& viewDst, TViewSrc const& viewSrc, TExtent const& extent) + : m_extent(getExtentVec(extent)) +# if(!defined(NDEBUG)) || (ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL) + , m_extentWidthBytes(m_extent[TDim::value - 1u] * static_cast(sizeof(Elem))) + , m_dstExtent(getExtentVec(viewDst)) + , m_srcExtent(getExtentVec(viewSrc)) +# endif + , m_dstPitchBytes(getPitchBytesVec(viewDst)) + , m_srcPitchBytes(getPitchBytesVec(viewSrc)) + , m_dstMemNative(reinterpret_cast(getPtrNative(viewDst))) + , m_srcMemNative(reinterpret_cast(getPtrNative(viewSrc))) + { + if constexpr(TDim::value > 0) + { + ALPAKA_ASSERT((castVec(m_extent) <= m_dstExtent).foldrAll(std::logical_or())); + ALPAKA_ASSERT((castVec(m_extent) <= m_srcExtent).foldrAll(std::logical_or())); + } + } + +# if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL + auto printDebug() const -> void + { + std::cout << __func__ << " e: " << m_extent << " ewb: " << this->m_extentWidthBytes + << " de: " << m_dstExtent << " dptr: " << reinterpret_cast(m_dstMemNative) + << " se: " << m_srcExtent << " sptr: " << reinterpret_cast(m_srcMemNative) + << std::endl; + } +# endif + + Vec const m_extent; +# if(!defined(NDEBUG)) || (ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL) + ExtentSize const m_extentWidthBytes; + Vec const m_dstExtent; + Vec const m_srcExtent; +# endif + + Vec const m_dstPitchBytes; + Vec const m_srcPitchBytes; + std::uint8_t* const m_dstMemNative; + std::uint8_t const* const m_srcMemNative; + static constexpr auto is_sycl_task = true; }; - template - struct TaskCopySycl + //! The SYCL device ND memory copy task. + template + struct TaskCopySycl : public TaskCopySyclBase { - auto operator()(sycl::handler& cgh) const -> void + using DimMin1 = DimInt; + using typename TaskCopySyclBase::ExtentSize; + using typename TaskCopySyclBase::DstSize; + using typename TaskCopySyclBase::SrcSize; + + using TaskCopySyclBase::TaskCopySyclBase; + + auto operator()(sycl::queue& queue, std::vector const& requirements) const -> sycl::event { - if constexpr(TDirection == Direction::d2h || TDirection == Direction::d2d) - cgh.require(m_src); + ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; + +# if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL + this->printDebug(); +# endif + // [z, y, x] -> [z, y] because all elements with the innermost x dimension are handled within one + // iteration. + Vec const extentWithoutInnermost(subVecBegin(this->m_extent)); + // [z, y, x] -> [y, x] because the z pitch (the full size of the buffer) is not required. + Vec const dstPitchBytesWithoutOutmost(subVecEnd(this->m_dstPitchBytes)); + Vec const srcPitchBytesWithoutOutmost(subVecEnd(this->m_srcPitchBytes)); + + // Record an event for each memcpy call + std::vector events; + events.reserve(static_cast(extentWithoutInnermost.prod())); + + if(static_cast(this->m_extent.prod()) != 0u) + { + meta::ndLoopIncIdx( + extentWithoutInnermost, + [&](Vec const& idx) + { + events.push_back(queue.memcpy( + reinterpret_cast( + this->m_dstMemNative + + (castVec(idx) * dstPitchBytesWithoutOutmost) + .foldrAll(std::plus())), + reinterpret_cast( + this->m_srcMemNative + + (castVec(idx) * srcPitchBytesWithoutOutmost) + .foldrAll(std::plus())), + static_cast(this->m_extentWidthBytes), + requirements)); + }); + } + + // Return an event that depends on all the events assciated to the memcpy calls + return queue.ext_oneapi_submit_barrier(events); + } + }; - if constexpr(TDirection == Direction::h2d || TDirection == Direction::d2d) - cgh.require(m_dst); + //! The SYCL device 1D memory copy task. + template + struct TaskCopySycl, TViewDst, TViewSrc, TExtent> + : TaskCopySyclBase, TViewDst, TViewSrc, TExtent> + { + using TaskCopySyclBase, TViewDst, TViewSrc, TExtent>::TaskCopySyclBase; + using Elem = alpaka::Elem; - cgh.copy(m_src, m_dst); + auto operator()(sycl::queue& queue, std::vector const& requirements) const -> sycl::event + { + ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; + +# if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL + this->printDebug(); +# endif + if(static_cast(this->m_extent.prod()) != 0u) + { + return queue.memcpy( + reinterpret_cast(this->m_dstMemNative), + reinterpret_cast(this->m_srcMemNative), + sizeof(Elem) * static_cast(this->m_extent.prod()), + requirements); + } + else + { + return queue.ext_oneapi_submit_barrier(); + } } + }; + + //! The scalar SYCL memory copy trait. + template + struct TaskCopySycl, TViewDst, TViewSrc, TExtent> + { + static_assert( + std::is_same_v>, std::remove_const_t>>, + "The source and the destination view are required to have the same element type!"); - TSrc m_src; - TDst m_dst; + using Elem = alpaka::Elem; + + template + TaskCopySycl(TViewDstFwd&& viewDst, TViewSrc const& viewSrc, [[maybe_unused]] TExtent const& extent) + : m_dstMemNative(reinterpret_cast(getPtrNative(viewDst))) + , m_srcMemNative(reinterpret_cast(getPtrNative(viewSrc))) + { + // all zero-sized extents are equivalent + ALPAKA_ASSERT(getExtentVec(extent).prod() == 1u); + ALPAKA_ASSERT(getExtentVec(viewDst).prod() == 1u); + ALPAKA_ASSERT(getExtentVec(viewSrc).prod() == 1u); + } + + auto operator()(sycl::queue& queue, std::vector const& requirements) const -> sycl::event + { + return queue.memcpy(m_dstMemNative, m_srcMemNative, sizeof(Elem), requirements); + } + + void* m_dstMemNative; + void const* m_srcMemNative; static constexpr auto is_sycl_task = true; }; } // namespace alpaka::detail @@ -67,76 +202,44 @@ namespace alpaka::detail namespace alpaka::trait { //! The SYCL host-to-device memory copy trait specialization. - template + template struct CreateTaskMemcpy, DevCpu> { template - static auto createTaskMemcpy(TViewDstFwd&& viewDst, TViewSrc const& viewSrc, TExtent const& ext) + static auto createTaskMemcpy(TViewDstFwd&& viewDst, TViewSrc const& viewSrc, TExtent const& extent) + -> alpaka::detail::TaskCopySycl, TViewSrc, TExtent> { ALPAKA_DEBUG_FULL_LOG_SCOPE; - constexpr auto copy_dim = static_cast(Dim::value); - using ElemType = Elem>; - using SrcType = ElemType const*; - using DstType = alpaka::detail::DstAccessor; - - auto const range = detail::make_sycl_range(ext); - auto const offset = detail::make_sycl_offset(viewDst); - - return detail::TaskCopySycl{ - getPtrNative(viewSrc), - DstType{viewDst.m_buffer, range, offset}}; + return {std::forward(viewDst), viewSrc, extent}; } }; //! The SYCL device-to-host memory copy trait specialization. - template + template struct CreateTaskMemcpy> { template - static auto createTaskMemcpy(TViewDstFwd&& viewDst, TViewSrc const& viewSrc, TExtent const& ext) + static auto createTaskMemcpy(TViewDstFwd&& viewDst, TViewSrc const& viewSrc, TExtent const& extent) + -> alpaka::detail::TaskCopySycl, TViewSrc, TExtent> { ALPAKA_DEBUG_FULL_LOG_SCOPE; - constexpr auto copy_dim = static_cast(Dim::value); - using ElemType = Elem>; - using SrcType = alpaka::detail::SrcAccessor; - using DstType = ElemType*; - - auto const range = detail::make_sycl_range(ext); - auto const offset = detail::make_sycl_offset(viewSrc); - - auto view_src = const_cast(viewSrc); - - return detail::TaskCopySycl{ - SrcType{view_src.m_buffer, range, offset}, - getPtrNative(viewDst)}; + return {std::forward(viewDst), viewSrc, extent}; } }; //! The SYCL device-to-device memory copy trait specialization. - template + template struct CreateTaskMemcpy, DevGenericSycl> { template - static auto createTaskMemcpy(TViewDstFwd&& viewDst, TViewSrc const& viewSrc, TExtent const& ext) + static auto createTaskMemcpy(TViewDstFwd&& viewDst, TViewSrc const& viewSrc, TExtent const& extent) + -> alpaka::detail::TaskCopySycl, TViewSrc, TExtent> { ALPAKA_DEBUG_FULL_LOG_SCOPE; - constexpr auto copy_dim = static_cast(Dim::value); - using ElemType = Elem>; - using SrcType = alpaka::detail::SrcAccessor; - using DstType = alpaka::detail::DstAccessor; - - auto const range = detail::make_sycl_range(ext); - auto const offset_src = detail::make_sycl_offset(viewSrc); - auto const offset_dst = detail::make_sycl_offset(viewDst); - - auto view_src = const_cast(viewSrc); - - return detail::TaskCopySycl{ - SrcType{view_src.m_buffer, range, offset_src}, - DstType{viewDst.m_buffer, range, offset_dst}}; + return {std::forward(viewDst), viewSrc, extent}; } }; } // namespace alpaka::trait diff --git a/include/alpaka/mem/buf/sycl/Set.hpp b/include/alpaka/mem/buf/sycl/Set.hpp index f584e0461e87..01899cc18e1b 100644 --- a/include/alpaka/mem/buf/sycl/Set.hpp +++ b/include/alpaka/mem/buf/sycl/Set.hpp @@ -1,4 +1,4 @@ -/* Copyright 2022 Jan Stephan +/* Copyright 2023 Jan Stephan, Luca Ferragina, Aurora Perego, Andrea Bocci * SPDX-License-Identifier: MPL-2.0 */ @@ -12,6 +12,9 @@ #include "alpaka/extent/Traits.hpp" #include "alpaka/mem/buf/sycl/Common.hpp" #include "alpaka/mem/view/Traits.hpp" +#include "alpaka/meta/NdLoop.hpp" +#include "alpaka/queue/QueueGenericSyclBlocking.hpp" +#include "alpaka/queue/QueueGenericSyclNonBlocking.hpp" #include "alpaka/queue/Traits.hpp" #include @@ -25,29 +28,171 @@ namespace alpaka namespace detail { - template - using Accessor = sycl::accessor< - std::byte, - TDim, - sycl::access_mode::write, - sycl::target::global_buffer, - sycl::access::placeholder::true_t>; - - //! The SYCL memory set trait. - template - struct TaskSetSycl + //! The SYCL ND memory set task base. + template + struct TaskSetSyclBase { - auto operator()(sycl::handler& cgh) const -> void + using ExtentSize = Idx; + using DstSize = Idx; + using Elem = alpaka::Elem; + + template + TaskSetSyclBase(TViewFwd&& view, std::uint8_t const& byte, TExtent const& extent) + : m_byte(byte) + , m_extent(getExtentVec(extent)) + , m_extentWidthBytes(m_extent[TDim::value - 1u] * static_cast(sizeof(Elem))) +# if(!defined(NDEBUG)) || (ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL) + , m_dstExtent(getExtentVec(view)) +# endif + + , m_dstPitchBytes(getPitchBytesVec(view)) + , m_dstMemNative(reinterpret_cast(getPtrNative(view))) + + { + ALPAKA_ASSERT((castVec(m_extent) <= m_dstExtent).foldrAll(std::logical_or())); + ALPAKA_ASSERT(m_extentWidthBytes <= m_dstPitchBytes[TDim::value - 1u]); + } + +# if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL + auto printDebug() const -> void + { + std::cout << __func__ << " e: " << this->m_extent << " ewb: " << this->m_extentWidthBytes + << " de: " << this->m_dstExtent << " dptr: " << reinterpret_cast(this->m_dstMemNative) + << " dpitchb: " << this->m_dstPitchBytes << std::endl; + } +# endif + + std::uint8_t const m_byte; + Vec const m_extent; + ExtentSize const m_extentWidthBytes; +# if(!defined(NDEBUG)) || (ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL) + Vec const m_dstExtent; +# endif + Vec const m_dstPitchBytes; + std::uint8_t* const m_dstMemNative; + static constexpr auto is_sycl_task = true; + }; + + //! The SYCL device ND memory set task. + template + struct TaskSetSycl : public TaskSetSyclBase + { + using DimMin1 = DimInt; + using typename TaskSetSyclBase::ExtentSize; + using typename TaskSetSyclBase::DstSize; + + using TaskSetSyclBase::TaskSetSyclBase; + + auto operator()(sycl::queue& queue, std::vector const& requirements) const -> sycl::event + { + ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; + +# if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL + this->printDebug(); +# endif + // [z, y, x] -> [z, y] because all elements with the innermost x dimension are handled within one + // iteration. + Vec const extentWithoutInnermost(subVecBegin(this->m_extent)); + // [z, y, x] -> [y, x] because the z pitch (the full idx of the buffer) is not required. + Vec const dstPitchBytesWithoutOutmost(subVecEnd(this->m_dstPitchBytes)); + + // Record an event for each memcpy call + std::vector events; + events.reserve(static_cast(extentWithoutInnermost.prod())); + + if(static_cast(this->m_extent.prod()) != 0u) + { + meta::ndLoopIncIdx( + extentWithoutInnermost, + [&](Vec const& idx) + { + events.push_back(queue.memset( + reinterpret_cast( + this->m_dstMemNative + + (castVec(idx) * dstPitchBytesWithoutOutmost) + .foldrAll(std::plus())), + this->m_byte, + static_cast(this->m_extentWidthBytes), + requirements)); + }); + } + + // Return an event that depends on all the events assciated to the memcpy calls + return queue.ext_oneapi_submit_barrier(events); + } + }; + + //! The 1D SYCL memory set task. + template + struct TaskSetSycl, TView, TExtent> : public TaskSetSyclBase, TView, TExtent> + { + using TaskSetSyclBase, TView, TExtent>::TaskSetSyclBase; + + auto operator()(sycl::queue& queue, std::vector const& requirements) const -> sycl::event + { + ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; + +# if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL + this->printDebug(); +# endif + if(static_cast(this->m_extent.prod()) != 0u) + { + return queue.memset( + reinterpret_cast(this->m_dstMemNative), + this->m_byte, + static_cast(this->m_extentWidthBytes), + requirements); + } + else + { + return queue.ext_oneapi_submit_barrier(); + } + } + }; + + //! The SYCL device scalar memory set task. + template + struct TaskSetSycl, TView, TExtent> + { + using ExtentSize = Idx; + using Scalar = Vec, ExtentSize>; + using DstSize = Idx; + using Elem = alpaka::Elem; + + template + TaskSetSycl(TViewFwd&& view, std::uint8_t const& byte, [[maybe_unused]] TExtent const& extent) + : m_byte(byte) + , m_dstMemNative(reinterpret_cast(getPtrNative(view))) + { + // all zero-sized extents are equivalent + ALPAKA_ASSERT(getExtentVec(extent).prod() == 1u); + ALPAKA_ASSERT(getExtentVec(view).prod() == 1u); + } + +# if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL + auto printDebug() const -> void { - cgh.require(m_accessor); - cgh.fill(m_accessor, m_value); + std::cout << __func__ << " e: " << Scalar() << " ewb: " << sizeof(Elem) << " de: " << Scalar() + << " dptr: " << reinterpret_cast(m_dstMemNative) << " dpitchb: " << Scalar() + << std::endl; } +# endif - TAccessor m_accessor; - std::byte m_value; - // Distinguish from non-alpaka types (= host tasks) + auto operator()(sycl::queue& queue, std::vector const& requirements) const -> sycl::event + { + ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; + +# if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL + printDebug(); +# endif + return queue.memset(reinterpret_cast(m_dstMemNative), m_byte, sizeof(Elem), requirements); + } + + std::uint8_t const m_byte; + std::uint8_t* const m_dstMemNative; static constexpr auto is_sycl_task = true; }; + } // namespace detail @@ -57,25 +202,15 @@ namespace alpaka template struct CreateTaskMemset> { - template - static auto createTaskMemset(TViewFwd&& view, std::uint8_t const& byte, TExtent const& ext) + template + static auto createTaskMemset(TView& view, std::uint8_t const& byte, TExtent const& extent) + -> detail::TaskSetSycl { - ALPAKA_DEBUG_FULL_LOG_SCOPE; - - constexpr auto set_dim = static_cast(Dim::value); - using TView = std::remove_reference_t; - using ElemType = Elem; - using DstType = alpaka::detail::Accessor; - - // Reinterpret as byte buffer - auto buf = view.m_buffer.template reinterpret(); - auto const byte_val = static_cast(byte); - - auto const range = detail::make_sycl_range(ext, sizeof(ElemType)); - return detail::TaskSetSycl{DstType{buf, range}, byte_val}; + return detail::TaskSetSycl(view, byte, extent); } }; + } // namespace trait -} // namespace alpaka +} // namespace alpaka #endif diff --git a/include/alpaka/mem/fence/MemFenceGenericSycl.hpp b/include/alpaka/mem/fence/MemFenceGenericSycl.hpp index c3d96113e13f..0b7559f3d61b 100644 --- a/include/alpaka/mem/fence/MemFenceGenericSycl.hpp +++ b/include/alpaka/mem/fence/MemFenceGenericSycl.hpp @@ -8,7 +8,7 @@ #ifdef ALPAKA_ACC_SYCL_ENABLED -# include +# include namespace alpaka { @@ -32,6 +32,13 @@ namespace alpaka static constexpr auto scope = sycl::memory_scope::device; static constexpr auto space = sycl::access::address_space::global_space; }; + + template<> + struct SyclFenceProps + { + static constexpr auto scope = sycl::memory_scope::device; + static constexpr auto space = sycl::access::address_space::global_space; + }; } // namespace detail //! The SYCL memory fence. diff --git a/include/alpaka/mem/view/ViewPlainPtr.hpp b/include/alpaka/mem/view/ViewPlainPtr.hpp index a092a6b26f60..56fadf03feb9 100644 --- a/include/alpaka/mem/view/ViewPlainPtr.hpp +++ b/include/alpaka/mem/view/ViewPlainPtr.hpp @@ -1,13 +1,16 @@ -/* Copyright 2023 Benjamin Worpitz, Matthias Werner, René Widera, Bernhard Manfred Gruber, Jan Stephan +/* Copyright 2023 Benjamin Worpitz, Matthias Werner, René Widera, Sergei Bastrakov, Bernhard Manfred Gruber, + * Jan Stephan, Andrea Bocci, Aurora Perego * SPDX-License-Identifier: MPL-2.0 */ #pragma once #include "alpaka/dev/DevCpu.hpp" +#include "alpaka/dev/DevGenericSycl.hpp" #include "alpaka/dev/DevUniformCudaHipRt.hpp" #include "alpaka/mem/view/Traits.hpp" #include "alpaka/mem/view/ViewAccessOps.hpp" +#include "alpaka/meta/DependentFalseType.hpp" #include "alpaka/vec/Vec.hpp" #include @@ -178,6 +181,17 @@ namespace alpaka }; #endif +#if defined(ALPAKA_ACC_SYCL_ENABLED) + //! The SYCL device CreateStaticDevMemView trait specialization. + template + struct CreateStaticDevMemView> + { + static_assert( + meta::DependentFalseType::value, + "The SYCL backend does not support global device variables."); + }; +#endif + //! The CPU device CreateViewPlainPtr trait specialization. template<> struct CreateViewPlainPtr @@ -215,6 +229,26 @@ namespace alpaka }; #endif +#if defined(ALPAKA_ACC_SYCL_ENABLED) + //! The SYCL device CreateViewPlainPtr trait specialization. + template + struct CreateViewPlainPtr> + { + template + static auto createViewPlainPtr( + DevGenericSycl const& dev, + TElem* pMem, + TExtent const& extent, + TPitch const& pitch) + { + return alpaka::ViewPlainPtr, TElem, alpaka::Dim, alpaka::Idx>( + pMem, + dev, + extent, + pitch); + } + }; +#endif //! The ViewPlainPtr offset get trait specialization. template struct GetOffset> diff --git a/include/alpaka/pltf/PltfCpuSycl.hpp b/include/alpaka/pltf/PltfCpuSycl.hpp new file mode 100644 index 000000000000..c30793dd98a9 --- /dev/null +++ b/include/alpaka/pltf/PltfCpuSycl.hpp @@ -0,0 +1,44 @@ +/* Copyright 2023 Jan Stephan, Luca Ferragina, Andrea Bocci + * SPDX-License-Identifier: MPL-2.0 + */ + +#pragma once + +#include "alpaka/dev/DevGenericSycl.hpp" +#include "alpaka/dev/Traits.hpp" +#include "alpaka/pltf/PltfGenericSycl.hpp" + +#include + +#if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_BACKEND_ONEAPI) && defined(ALPAKA_SYCL_ONEAPI_CPU) + +# include + +namespace alpaka +{ + namespace detail + { + struct SyclCpuSelector + { + auto operator()(sycl::device const& dev) const -> int + { + return dev.is_cpu() ? 1 : -1; + } + }; + } // namespace detail + + //! The SYCL device manager. + using PltfCpuSycl = PltfGenericSycl; +} // namespace alpaka + +namespace alpaka::trait +{ + //! The SYCL device manager device type trait specialization. + template<> + struct DevType + { + using type = DevGenericSycl; // = DevCpuSycl + }; +} // namespace alpaka::trait + +#endif diff --git a/include/alpaka/pltf/PltfCpuSyclIntel.hpp b/include/alpaka/pltf/PltfCpuSyclIntel.hpp deleted file mode 100644 index b8f3815ddaed..000000000000 --- a/include/alpaka/pltf/PltfCpuSyclIntel.hpp +++ /dev/null @@ -1,56 +0,0 @@ -/* Copyright 2023 Jan Stephan, Andrea Bocci - * SPDX-License-Identifier: MPL-2.0 - */ - -#pragma once - -#include "alpaka/dev/DevGenericSycl.hpp" -#include "alpaka/dev/Traits.hpp" -#include "alpaka/pltf/PltfGenericSycl.hpp" - -#include - -#if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_BACKEND_ONEAPI) && defined(ALPAKA_SYCL_ONEAPI_CPU) - -# include - -namespace alpaka -{ - namespace detail - { - // Prevent clang from annoying us with warnings about emitting too many vtables. These are discarded by the - // linker anyway. -# if BOOST_COMP_CLANG -# pragma clang diagnostic push -# pragma clang diagnostic ignored "-Wweak-vtables" -# endif - struct IntelCpuSelector final - { - auto operator()(sycl::device const& dev) const -> int - { - auto const& vendor = dev.get_info(); - auto const is_intel_cpu = (vendor.find("Intel(R) Corporation") != std::string::npos) && dev.is_cpu(); - - return is_intel_cpu ? 1 : -1; - } - }; -# if BOOST_COMP_CLANG -# pragma clang diagnostic pop -# endif - } // namespace detail - - //! The SYCL device manager. - using PltfCpuSyclIntel = PltfGenericSycl; -} // namespace alpaka - -namespace alpaka::trait -{ - //! The SYCL device manager device type trait specialization. - template<> - struct DevType - { - using type = DevGenericSycl; // = DevCpuSyclIntel - }; -} // namespace alpaka::trait - -#endif diff --git a/include/alpaka/pltf/PltfFpgaSyclIntel.hpp b/include/alpaka/pltf/PltfFpgaSyclIntel.hpp index ef0b0cfa35c6..9b5e1b100373 100644 --- a/include/alpaka/pltf/PltfFpgaSyclIntel.hpp +++ b/include/alpaka/pltf/PltfFpgaSyclIntel.hpp @@ -1,4 +1,4 @@ -/* Copyright 2023 Jan Stephan, Andrea Bocci +/* Copyright 2023 Jan Stephan, Andrea Bocci, Luca Ferragina * SPDX-License-Identifier: MPL-2.0 */ @@ -10,7 +10,7 @@ #if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_BACKEND_ONEAPI) && defined(ALPAKA_SYCL_ONEAPI_FPGA) -# include +# include # include diff --git a/include/alpaka/pltf/PltfGenericSycl.hpp b/include/alpaka/pltf/PltfGenericSycl.hpp index e1e0d38ea174..8c67f5f50c31 100644 --- a/include/alpaka/pltf/PltfGenericSycl.hpp +++ b/include/alpaka/pltf/PltfGenericSycl.hpp @@ -1,4 +1,4 @@ -/* Copyright 2023 Jan Stephan +/* Copyright 2023 Jan Stephan, Luca Ferragina, Aurora Perego * SPDX-License-Identifier: MPL-2.0 */ @@ -20,7 +20,7 @@ #ifdef ALPAKA_ACC_SYCL_ENABLED -# include +# include namespace alpaka { @@ -127,7 +127,7 @@ namespace alpaka::trait # if ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL printDeviceProperties(sycl_dev); # elif ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL - std::cout << __func__ << sycl_dev.get_info() << '\n'; + std::cout << __func__ << sycl_dev.template get_info() << '\n'; # endif using SyclPltf = alpaka::PltfGenericSycl; return typename DevType::type{sycl_dev, platform.syclContext()}; @@ -189,82 +189,62 @@ namespace alpaka::trait std::cout << "SYCL version: " << device.get_info() << '\n'; +# if !defined(BOOST_COMP_ICPX) + // Not defined by Level Zero back-end std::cout << "Backend version: " << device.get_info() << '\n'; +# endif std::cout << "Aspects: " << '\n'; - auto const aspects = device.get_info(); - for(auto const& asp : aspects) - { - switch(asp) - { - // Ignore the hardware types - we already have queried this info above - case sycl::aspect::cpu: - case sycl::aspect::gpu: - case sycl::aspect::accelerator: - case sycl::aspect::custom: - break; - case sycl::aspect::emulated: - std::cout << "\t* emulated\n"; - break; +# if defined(BOOST_COMP_ICPX) +# if BOOST_COMP_ICPX >= BOOST_VERSION_NUMBER(53, 2, 0) + // These aspects are missing from oneAPI versions < 2023.2.0 + if(device.has(sycl::aspect::emulated)) + std::cout << "\t* emulated\n"; - case sycl::aspect::host_debugabble: - std::cout << "\t* debugabble using standard debuggers\n"; - break; + if(device.has(sycl::aspect::host_debuggable)) + std::cout << "\t* debuggable using standard debuggers\n"; +# endif +# endif - case sycl::aspect::fp16: - std::cout << "\t* supports sycl::half precision\n"; - break; + if(device.has(sycl::aspect::fp16)) + std::cout << "\t* supports sycl::half precision\n"; - case sycl::aspect::fp64: - std::cout << "\t* supports double precision\n"; - break; + if(device.has(sycl::aspect::fp64)) + std::cout << "\t* supports double precision\n"; - case sycl::aspect::atomic64: - std::cout << "\t* supports 64-bit atomics\n"; - break; + if(device.has(sycl::aspect::atomic64)) + std::cout << "\t* supports 64-bit atomics\n"; - case sycl::aspect::image: - std::cout << "\t* supports images\n"; - break; + if(device.has(sycl::aspect::image)) + std::cout << "\t* supports images\n"; - case sycl::aspect::online_compiler: - std::cout << "\t* supports online compilation of device code\n"; - break; + if(device.has(sycl::aspect::online_compiler)) + std::cout << "\t* supports online compilation of device code\n"; - case sycl::aspect::online_linker: - std::cout << "\t* supports online linking of device code\n"; - break; + if(device.has(sycl::aspect::online_linker)) + std::cout << "\t* supports online linking of device code\n"; - case sycl::aspect::queue_profiling: - std::cout << "\t* supports queue profiling\n"; - break; + if(device.has(sycl::aspect::queue_profiling)) + std::cout << "\t* supports queue profiling\n"; - case sycl::aspect::usm_device_allocations: - std::cout << "\t* supports explicit USM allocations\n"; - break; + if(device.has(sycl::aspect::usm_device_allocations)) + std::cout << "\t* supports explicit USM allocations\n"; - case sycl::aspect::usm_host_allocations: - std::cout << "\t* can access USM memory allocated by sycl::usm::alloc::host\n"; - break; + if(device.has(sycl::aspect::usm_host_allocations)) + std::cout << "\t* can access USM memory allocated by sycl::usm::alloc::host\n"; - case sycl::aspect::usm_atomic_host_allocations: - std::cout << "\t* can access USM memory allocated by sycl::usm::alloc::host atomically\n"; - break; + if(device.has(sycl::aspect::usm_atomic_host_allocations)) + std::cout << "\t* can access USM memory allocated by sycl::usm::alloc::host atomically\n"; - case sycl::aspect::usm_shared_allocations: - std::cout << "\t* can access USM memory allocated by sycl::usm::alloc::shared\n"; - break; + if(device.has(sycl::aspect::usm_shared_allocations)) + std::cout << "\t* can access USM memory allocated by sycl::usm::alloc::shared\n"; - case sycl::aspect::usm_atomic_shared_allocations: - std::cout << "\t* can access USM memory allocated by sycl::usm::alloc::shared atomically\n"; - break; + if(device.has(sycl::aspect::usm_atomic_shared_allocations)) + std::cout << "\t* can access USM memory allocated by sycl::usm::alloc::shared atomically\n"; - case sycl::aspect::usm_system_allocations: - std::cout << "\t* can access memory allocated by the system allocator\n"; - break; - } - } + if(device.has(sycl::aspect::usm_system_allocations)) + std::cout << "\t* can access memory allocated by the system allocator\n"; std::cout << "Available compute units: " << device.get_info() << '\n'; @@ -323,7 +303,7 @@ namespace alpaka::trait std::cout << "Native ISA vector width (float): " << device.get_info() << '\n'; - if(device.has_aspect(sycl::aspect::fp64)) + if(device.has(sycl::aspect::fp64)) { std::cout << "Preferred native vector width (double): " << device.get_info() << '\n'; @@ -332,7 +312,7 @@ namespace alpaka::trait << device.get_info() << '\n'; } - if(device.has_aspect(sycl::aspect::fp16)) + if(device.has(sycl::aspect::fp16)) { std::cout << "Preferred native vector width (half): " << device.get_info() << '\n'; @@ -349,7 +329,7 @@ namespace alpaka::trait std::cout << "Maximum size of memory object allocation: " << device.get_info() << " bytes\n"; - if(device.has_aspect(sycl::aspect::image)) + if(device.has(sycl::aspect::image)) { std::cout << "Maximum number of simultaneous image object reads per kernel: " << device.get_info() << '\n'; @@ -417,7 +397,7 @@ namespace alpaka::trait find_and_print(sycl::info::fp_config::soft_float); }; - if(device.has_aspect(sycl::aspect::fp16)) + if(device.has(sycl::aspect::fp16)) { auto const fp16_conf = device.get_info(); print_fp_config("Half", fp16_conf); @@ -426,7 +406,7 @@ namespace alpaka::trait auto const fp32_conf = device.get_info(); print_fp_config("Single", fp32_conf); - if(device.has_aspect(sycl::aspect::fp64)) + if(device.has(sycl::aspect::fp64)) { auto const fp64_conf = device.get_info(); print_fp_config("Double", fp64_conf); @@ -458,7 +438,7 @@ namespace alpaka::trait << device.get_info() << " bytes\n"; std::cout << "Global memory cache size: " - << device.get_info() / KiB << " KiB\n" + << device.get_info() / KiB << " KiB\n"; } std::cout << "Global memory size: " << device.get_info() / MiB @@ -516,6 +496,11 @@ namespace alpaka::trait case sycl::memory_order::seq_cst: std::cout << "seq_cst"; break; +# if defined(BOOST_COMP_ICPX) + // Stop icpx from complaining about its own internals. + case sycl::memory_order::__consume_unsupported: + break; +# endif } std::cout << ", "; } @@ -526,9 +511,14 @@ namespace alpaka::trait auto const mem_orders = device.get_info(); print_memory_orders(mem_orders); +# if defined(BOOST_COMP_ICPX) +# if BOOST_COMP_ICPX >= BOOST_VERSION_NUMBER(53, 2, 0) + // Not implemented in oneAPI < 2023.2.0 std::cout << "Supported memory orderings for sycl::atomic_fence: "; auto const fence_orders = device.get_info(); print_memory_orders(fence_orders); +# endif +# endif auto print_memory_scopes = [](std::vector const& mem_scopes) { @@ -565,9 +555,14 @@ namespace alpaka::trait auto const mem_scopes = device.get_info(); print_memory_scopes(mem_scopes); +# if defined(BOOST_COMP_ICPX) +# if BOOST_COMP_ICPX >= BOOST_VERSION_NUMBER(53, 2, 0) + // Not implemented in oneAPI < 2023.2.0 std::cout << "Supported memory scopes for sycl::atomic_fence: "; auto const fence_scopes = device.get_info(); print_memory_scopes(fence_scopes); +# endif +# endif std::cout << "Device timer resolution: " << device.get_info() << " ns\n"; @@ -607,6 +602,11 @@ namespace alpaka::trait std::cout << "by affinity domain"; has_affinity_domains = true; break; +# if defined(BOOST_COMP_ICPX) + case sycl::info::partition_property::ext_intel_partition_by_cslice: + std::cout << "by compute slice (Intel extension; deprecated)"; + break; +# endif } std::cout << ", "; } @@ -671,6 +671,12 @@ namespace alpaka::trait case sycl::info::partition_property::partition_by_affinity_domain: std::cout << "partitioned by affinity domain"; break; + +# if defined(BOOST_COMP_ICPX) + case sycl::info::partition_property::ext_intel_partition_by_cslice: + std::cout << "partitioned by compute slice (Intel extension; deprecated)"; + break; +# endif } std::cout << '\n'; diff --git a/include/alpaka/pltf/PltfGpuSyclIntel.hpp b/include/alpaka/pltf/PltfGpuSyclIntel.hpp index 4c54e6c6ec1c..9c0e6db1eb97 100644 --- a/include/alpaka/pltf/PltfGpuSyclIntel.hpp +++ b/include/alpaka/pltf/PltfGpuSyclIntel.hpp @@ -1,4 +1,4 @@ -/* Copyright 2023 Jan Stephan, Andrea Bocci +/* Copyright 2023 Jan Stephan, Luca Ferragina, Andrea Bocci * SPDX-License-Identifier: MPL-2.0 */ @@ -12,19 +12,13 @@ #if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_BACKEND_ONEAPI) && defined(ALPAKA_SYCL_ONEAPI_GPU) -# include +# include namespace alpaka { namespace detail { - // Prevent clang from annoying us with warnings about emitting too many vtables. These are discarded by the - // linker anyway. -# if BOOST_COMP_CLANG -# pragma clang diagnostic push -# pragma clang diagnostic ignored "-Wweak-vtables" -# endif - struct IntelGpuSelector final + struct IntelGpuSelector { auto operator()(sycl::device const& dev) const -> int { @@ -34,9 +28,6 @@ namespace alpaka return is_intel_gpu ? 1 : -1; } }; -# if BOOST_COMP_CLANG -# pragma clang diagnostic pop -# endif } // namespace detail //! The SYCL device manager. diff --git a/include/alpaka/queue/QueueCpuSyclIntelBlocking.hpp b/include/alpaka/queue/QueueCpuSyclBlocking.hpp similarity index 59% rename from include/alpaka/queue/QueueCpuSyclIntelBlocking.hpp rename to include/alpaka/queue/QueueCpuSyclBlocking.hpp index 844375913d95..3d561733ecc5 100644 --- a/include/alpaka/queue/QueueCpuSyclIntelBlocking.hpp +++ b/include/alpaka/queue/QueueCpuSyclBlocking.hpp @@ -1,17 +1,17 @@ -/* Copyright 2022 Jan Stephan +/* Copyright 2023 Jan Stephan, Luca Ferragina, Andrea Bocci * SPDX-License-Identifier: MPL-2.0 */ #pragma once -#include "alpaka/dev/DevCpuSyclIntel.hpp" +#include "alpaka/dev/DevCpuSycl.hpp" #include "alpaka/queue/QueueGenericSyclBlocking.hpp" #if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_BACKEND_ONEAPI) && defined(ALPAKA_SYCL_ONEAPI_CPU) namespace alpaka { - using QueueCpuSyclIntelBlocking = QueueGenericSyclBlocking; + using QueueCpuSyclBlocking = QueueGenericSyclBlocking; } #endif diff --git a/include/alpaka/queue/QueueCpuSyclIntelNonBlocking.hpp b/include/alpaka/queue/QueueCpuSyclNonBlocking.hpp similarity index 59% rename from include/alpaka/queue/QueueCpuSyclIntelNonBlocking.hpp rename to include/alpaka/queue/QueueCpuSyclNonBlocking.hpp index 77f20c1acbba..c75f5be45229 100644 --- a/include/alpaka/queue/QueueCpuSyclIntelNonBlocking.hpp +++ b/include/alpaka/queue/QueueCpuSyclNonBlocking.hpp @@ -1,17 +1,17 @@ -/* Copyright 2022 Jan Stephan +/* Copyright 2023 Jan Stephan, Luca Ferragina, Andrea Bocci * SPDX-License-Identifier: MPL-2.0 */ #pragma once -#include "alpaka/dev/DevCpuSyclIntel.hpp" +#include "alpaka/dev/DevCpuSycl.hpp" #include "alpaka/queue/QueueGenericSyclNonBlocking.hpp" #if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_BACKEND_ONEAPI) && defined(ALPAKA_SYCL_ONEAPI_CPU) namespace alpaka { - using QueueCpuSyclIntelNonBlocking = QueueGenericSyclNonBlocking; + using QueueCpuSyclNonBlocking = QueueGenericSyclNonBlocking; } #endif diff --git a/include/alpaka/queue/sycl/QueueGenericSyclBase.hpp b/include/alpaka/queue/sycl/QueueGenericSyclBase.hpp index 297b29732f24..9b08f3de55ca 100644 --- a/include/alpaka/queue/sycl/QueueGenericSyclBase.hpp +++ b/include/alpaka/queue/sycl/QueueGenericSyclBase.hpp @@ -1,13 +1,13 @@ -/* Copyright 2022 Jan Stephan, Antonio Di Pilato +/* Copyright 2023 Jan Stephan, Antonio Di Pilato, Luca Ferragina, Andrea Bocci, Aurora Perego * SPDX-License-Identifier: MPL-2.0 */ #pragma once - #include "alpaka/dev/Traits.hpp" #include "alpaka/event/Traits.hpp" #include "alpaka/queue/Traits.hpp" +#include "alpaka/traits/Traits.hpp" #include "alpaka/wait/Traits.hpp" #include @@ -21,7 +21,7 @@ #ifdef ALPAKA_ACC_SYCL_ENABLED -# include +# include namespace alpaka::detail { @@ -126,19 +126,24 @@ namespace alpaka::detail clean_dependencies(); // Execute task - m_last_event = m_queue.submit( - [this, &task](sycl::handler& cgh) - { - if(!m_dependencies.empty()) - cgh.depends_on(m_dependencies); - - if constexpr(is_sycl_kernel) // Kernel - task(cgh, m_fence_dummy); // Will call cgh.parallel_for internally - else if constexpr(is_sycl_task) // Copy / Fill - task(cgh); // Will call cgh.{copy, fill} internally - else // Host - cgh.host_task(task); - }); + if constexpr(is_sycl_task && !is_sycl_kernel) // Copy / Fill + { + m_last_event = task(m_queue, m_dependencies); // Will call queue.{copy, fill} internally + } + else + { + m_last_event = m_queue.submit( + [this, &task](sycl::handler& cgh) + { + if(!m_dependencies.empty()) + cgh.depends_on(m_dependencies); + + if constexpr(is_sycl_kernel) // Kernel + task(cgh, m_fence_dummy); // Will call cgh.parallel_for internally + else // Host + cgh.host_task(task); + }); + } m_dependencies.clear(); } @@ -167,16 +172,16 @@ namespace alpaka::detail public: QueueGenericSyclBase(TDev const& dev) : m_dev{dev} - , m_impl{std::make_shared( + , m_spQueueImpl{std::make_shared( dev.getNativeHandle().second, dev.getNativeHandle().first)} { - m_dev.m_impl->register_queue(m_impl); + m_dev.m_impl->register_queue(m_spQueueImpl); } friend auto operator==(QueueGenericSyclBase const& lhs, QueueGenericSyclBase const& rhs) -> bool { - return (lhs.m_dev == rhs.m_dev) && (lhs.m_impl == rhs.m_impl); + return (lhs.m_dev == rhs.m_dev) && (lhs.m_spQueueImpl == rhs.m_spQueueImpl); } friend auto operator!=(QueueGenericSyclBase const& lhs, QueueGenericSyclBase const& rhs) -> bool @@ -186,11 +191,11 @@ namespace alpaka::detail [[nodiscard]] auto getNativeHandle() const noexcept { - return m_impl->getNativeHandle(); + return m_spQueueImpl->getNativeHandle(); } TDev m_dev; - std::shared_ptr m_impl; + std::shared_ptr m_spQueueImpl; }; } // namespace alpaka::detail @@ -234,7 +239,7 @@ namespace alpaka::trait static auto enqueue(detail::QueueGenericSyclBase& queue, TTask const& task) -> void { ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; - queue.m_impl->template enqueue(task); + queue.m_spQueueImpl->template enqueue(task); } }; @@ -245,7 +250,7 @@ namespace alpaka::trait static auto empty(detail::QueueGenericSyclBase const& queue) -> bool { ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; - return queue.m_impl->empty(); + return queue.m_spQueueImpl->empty(); } }; @@ -259,7 +264,7 @@ namespace alpaka::trait static auto currentThreadWaitFor(detail::QueueGenericSyclBase const& queue) -> void { ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; - queue.m_impl->wait(); + queue.m_spQueueImpl->wait(); } }; diff --git a/include/alpaka/rand/RandGenericSycl.hpp b/include/alpaka/rand/RandGenericSycl.hpp new file mode 100644 index 000000000000..6a85a3a79a5a --- /dev/null +++ b/include/alpaka/rand/RandGenericSycl.hpp @@ -0,0 +1,198 @@ +/* Copyright 2023 Luca Ferragina, Aurora Perego, Jan Stephan, Andrea Bocci + * SPDX-License-Identifier: MPL-2.0 + */ + +#pragma once + +#include "alpaka/core/BoostPredef.hpp" +#include "alpaka/core/Concepts.hpp" +#include "alpaka/dev/DevGenericSycl.hpp" +#include "alpaka/rand/Traits.hpp" + +#ifdef ALPAKA_ACC_SYCL_ENABLED + +// Backend specific imports. +# include +# if BOOST_COMP_CLANG +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wcast-align" +# pragma clang diagnostic ignored "-Wcast-qual" +# pragma clang diagnostic ignored "-Wextra-semi" +# pragma clang diagnostic ignored "-Wfloat-equal" +# pragma clang diagnostic ignored "-Wold-style-cast" +# pragma clang diagnostic ignored "-Wreserved-identifier" +# pragma clang diagnostic ignored "-Wreserved-macro-identifier" +# pragma clang diagnostic ignored "-Wsign-compare" +# pragma clang diagnostic ignored "-Wundef" +# endif +# include + +# include +# if BOOST_COMP_CLANG +# pragma clang diagnostic pop +# endif + +# include + +namespace alpaka::rand +{ + //! The SYCL rand implementation. + template + struct RandGenericSycl : concepts::Implements> + { + explicit RandGenericSycl(sycl::nd_item my_item) : m_item_rand{my_item} + { + } + + sycl::nd_item m_item_rand; + }; + +# if !defined(ALPAKA_HOST_ONLY) + namespace distribution::sycl_rand + { + //! The SYCL random number floating point normal distribution. + template + struct NormalReal; + + //! The SYCL random number uniform distribution. + template + struct Uniform; + } // namespace distribution::sycl_rand + + namespace engine::sycl_rand + { + //! The SYCL linear congruential random number generator engine. + template + class Minstd + { + public: + // After calling this constructor the instance is not valid initialized and + // need to be overwritten with a valid object + Minstd() = default; + + Minstd(RandGenericSycl rand, std::uint32_t const& seed) + { + oneapi::dpl::minstd_rand engine(seed, rand.m_item_rand.get_global_linear_id()); + rng_engine = engine; + } + + private: + template + friend struct distribution::sycl_rand::NormalReal; + template + friend struct distribution::sycl_rand::Uniform; + + oneapi::dpl::minstd_rand rng_engine; + + public: + using result_type = float; + + ALPAKA_FN_HOST_ACC static result_type min() + { + return std::numeric_limits::min(); + } + ALPAKA_FN_HOST_ACC static result_type max() + { + return std::numeric_limits::max(); + } + result_type operator()() + { + oneapi::dpl::uniform_real_distribution distr; + return distr(rng_engine); + } + }; + } // namespace engine::sycl_rand + + namespace distribution::sycl_rand + { + + //! The SYCL random number double normal distribution. + template + struct NormalReal + { + static_assert(std::is_floating_point_v); + + template + auto operator()(TEngine& engine) -> F + { + oneapi::dpl::normal_distribution distr; + return distr(engine.rng_engine); + } + }; + + //! The SYCL random number float uniform distribution. + template + struct Uniform + { + static_assert(std::is_floating_point_v || std::is_unsigned_v); + + template + auto operator()(TEngine& engine) -> T + { + if constexpr(std::is_floating_point_v) + { + oneapi::dpl::uniform_real_distribution distr; + return distr(engine.rng_engine); + } + else + { + oneapi::dpl::uniform_int_distribution distr; + return distr(engine.rng_engine); + } + } + }; + } // namespace distribution::sycl_rand + + namespace distribution::trait + { + //! The SYCL random number float normal distribution get trait specialization. + template + struct CreateNormalReal, T, std::enable_if_t>> + { + static auto createNormalReal(RandGenericSycl const& /*rand*/) -> sycl_rand::NormalReal + { + return {}; + } + }; + + //! The SYCL random number float uniform distribution get trait specialization. + template + struct CreateUniformReal, T, std::enable_if_t>> + { + static auto createUniformReal(RandGenericSycl const& /*rand*/) -> sycl_rand::Uniform + { + return {}; + } + }; + + //! The SYCL random number integer uniform distribution get trait specialization. + template + struct CreateUniformUint, T, std::enable_if_t>> + { + static auto createUniformUint(RandGenericSycl const& /*rand*/) -> sycl_rand::Uniform + { + return {}; + } + }; + } // namespace distribution::trait + + namespace engine::trait + { + //! The SYCL random number default generator get trait specialization. + template + struct CreateDefault> + { + static auto createDefault( + RandGenericSycl const& rand, + std::uint32_t const& seed = 0, + std::uint32_t const& /* subsequence */ = 0, + std::uint32_t const& /* offset */ = 0) -> sycl_rand::Minstd + { + return {rand, seed}; + } + }; + } // namespace engine::trait +# endif +} // namespace alpaka::rand + +#endif diff --git a/include/alpaka/standalone/CpuSyclIntel.hpp b/include/alpaka/standalone/CpuSycl.hpp similarity index 87% rename from include/alpaka/standalone/CpuSyclIntel.hpp rename to include/alpaka/standalone/CpuSycl.hpp index e42b64a2097e..fbdb5c2c481e 100644 --- a/include/alpaka/standalone/CpuSyclIntel.hpp +++ b/include/alpaka/standalone/CpuSycl.hpp @@ -1,4 +1,4 @@ -/* Copyright 2022 Jan Stephan +/* Copyright 2023 Jan Stephan, Andrea Bocci * SPDX-License-Identifier: MPL-2.0 */ diff --git a/include/alpaka/test/Check.hpp b/include/alpaka/test/Check.hpp index 253cb477fe9b..39545e764923 100644 --- a/include/alpaka/test/Check.hpp +++ b/include/alpaka/test/Check.hpp @@ -1,31 +1,19 @@ -/* Copyright 2022 Benjamin Worpitz, Jan Stephan +/* Copyright 2023 Benjamin Worpitz, Jan Stephan, Luca Ferragina, Aurora Perego * SPDX-License-Identifier: MPL-2.0 */ #pragma once +#include "alpaka/core/Sycl.hpp" + #include -// TODO: SYCL doesn't have a way to detect if we're looking at device or host code. This needs a workaround so that -// SYCL and other back-ends are compatible. -#ifdef ALPAKA_ACC_SYCL_ENABLED -# define ALPAKA_CHECK(success, expression) \ - do \ - { \ - if(!(expression)) \ - { \ - acc.cout << "ALPAKA_CHECK failed because '!(" << #expression << ")'\n"; \ - success = false; \ - } \ - } while(0) -#else -# define ALPAKA_CHECK(success, expression) \ - do \ +#define ALPAKA_CHECK(success, expression) \ + do \ + { \ + if(!(expression)) \ { \ - if(!(expression)) \ - { \ - printf("ALPAKA_CHECK failed because '!(%s)'\n", #expression); \ - success = false; \ - } \ - } while(0) -#endif + printf("ALPAKA_CHECK failed because '!(%s)'\n", #expression); \ + success = false; \ + } \ + } while(0) diff --git a/include/alpaka/test/acc/TestAccs.hpp b/include/alpaka/test/acc/TestAccs.hpp index 742b40fd8313..62c1e6f7d2c2 100644 --- a/include/alpaka/test/acc/TestAccs.hpp +++ b/include/alpaka/test/acc/TestAccs.hpp @@ -79,10 +79,10 @@ namespace alpaka::test #endif #if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_BACKEND_ONEAPI) && defined(ALPAKA_SYCL_TARGET_CPU) template - using AccCpuSyclIntelIfAvailableElseInt = alpaka::AccCpuSyclIntel; + using AccCpuSyclIfAvailableElseInt = alpaka::AccCpuSycl; #else template - using AccCpuSyclIntelIfAvailableElseInt = int; + using AccCpuSyclIfAvailableElseInt = int; #endif #if defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_BACKEND_ONEAPI) && defined(ALPAKA_SYCL_TARGET_FPGA) template @@ -109,7 +109,7 @@ namespace alpaka::test AccCpuOmp2ThreadsIfAvailableElseInt, AccGpuCudaRtIfAvailableElseInt, AccGpuHipRtIfAvailableElseInt, - AccCpuSyclIntelIfAvailableElseInt, + AccCpuSyclIfAvailableElseInt, AccFpgaSyclIntelIfAvailableElseInt, AccGpuSyclIntelIfAvailableElseInt>; } // namespace detail diff --git a/include/alpaka/test/dim/TestDims.hpp b/include/alpaka/test/dim/TestDims.hpp index 6350697fe7b3..395c97e5dcf3 100644 --- a/include/alpaka/test/dim/TestDims.hpp +++ b/include/alpaka/test/dim/TestDims.hpp @@ -1,4 +1,4 @@ -/* Copyright 2022 Benjamin Worpitz, Andrea Bocci, Jan Stephan, Bernhard Manfred Gruber +/* Copyright 2023 Benjamin Worpitz, Andrea Bocci, Jan Stephan, Bernhard Manfred Gruber * SPDX-License-Identifier: MPL-2.0 */ diff --git a/include/alpaka/test/event/EventHostManualTrigger.hpp b/include/alpaka/test/event/EventHostManualTrigger.hpp index d465555c0dc9..7016f45e6c45 100644 --- a/include/alpaka/test/event/EventHostManualTrigger.hpp +++ b/include/alpaka/test/event/EventHostManualTrigger.hpp @@ -1,4 +1,5 @@ -/* Copyright 2023 Benjamin Worpitz, Matthias Werner, Jan Stephan, Andrea Bocci, Bernhard Manfred Gruber +/* Copyright 2023 Benjamin Worpitz, Matthias Werner, Jan Stephan, Jeffrey Kelling, Andrea Bocci, + * Bernhard Manfred Gruber * SPDX-License-Identifier: MPL-2.0 */ diff --git a/include/alpaka/test/queue/Queue.hpp b/include/alpaka/test/queue/Queue.hpp index 07d8495da8df..7cb3492cfc45 100644 --- a/include/alpaka/test/queue/Queue.hpp +++ b/include/alpaka/test/queue/Queue.hpp @@ -1,4 +1,4 @@ -/* Copyright 2023 Benjamin Worpitz, Matthias Werner, Bernhard Manfred Gruber, Jan Stephan +/* Copyright 2023 Benjamin Worpitz, Matthias Werner, René Widera, Bernhard Manfred Gruber, Jan Stephan, Andrea Bocci * SPDX-License-Identifier: MPL-2.0 */ @@ -86,23 +86,23 @@ namespace alpaka::test # ifdef ALPAKA_SYCL_ONEAPI_CPU //! The default queue type trait specialization for the Intel CPU device. template<> - struct DefaultQueueType + struct DefaultQueueType { # if(ALPAKA_DEBUG >= ALPAKA_DEBUG_FULL) - using type = alpaka::QueueCpuSyclIntelBlocking; + using type = alpaka::QueueCpuSyclBlocking; # else - using type = alpaka::QueueCpuSyclIntelNonBlocking; + using type = alpaka::QueueCpuSyclNonBlocking; # endif }; template<> - struct IsBlockingQueue + struct IsBlockingQueue { static constexpr auto value = true; }; template<> - struct IsBlockingQueue + struct IsBlockingQueue { static constexpr auto value = false; }; @@ -180,8 +180,8 @@ namespace alpaka::test # ifdef ALPAKA_SYCL_BACKEND_ONEAPI # ifdef ALPAKA_SYCL_ONEAPI_CPU , - std::tuple, - std::tuple + std::tuple, + std::tuple # endif # ifdef ALPAKA_SYCL_ONEAPI_FPGA , diff --git a/include/alpaka/warp/WarpGenericSycl.hpp b/include/alpaka/warp/WarpGenericSycl.hpp index 33ac1884deb2..b2b6aa8328b5 100644 --- a/include/alpaka/warp/WarpGenericSycl.hpp +++ b/include/alpaka/warp/WarpGenericSycl.hpp @@ -1,4 +1,4 @@ -/* Copyright 2022 Jan Stephan +/* Copyright 2023 Jan Stephan, Luca Ferragina, Andrea Bocci, Aurora Perego * SPDX-License-Identifier: MPL-2.0 */ @@ -10,7 +10,7 @@ #ifdef ALPAKA_ACC_SYCL_ENABLED -# include +# include namespace alpaka::warp { @@ -19,11 +19,11 @@ namespace alpaka::warp class WarpGenericSycl : public concepts::Implements> { public: - WarpGenericSycl(sycl::nd_item my_item) : m_item{my_item} + WarpGenericSycl(sycl::nd_item my_item) : m_item_warp{my_item} { } - sycl::nd_item m_item; + sycl::nd_item m_item_warp; }; } // namespace alpaka::warp @@ -34,21 +34,30 @@ namespace alpaka::warp::trait { static auto getSize(warp::WarpGenericSycl const& warp) -> std::int32_t { - auto const sub_group = warp.m_item.get_sub_group(); + auto const sub_group = warp.m_item_warp.get_sub_group(); // SYCL sub-groups are always 1D - return static_cast(sub_group.get_local_linear_range()); + return static_cast(sub_group.get_max_local_range()[0]); } }; template struct Activemask> { + // FIXME This should be std::uint64_t on AMD GCN architectures and on CPU, + // but the former is not targeted in alpaka and CPU case is not supported in SYCL yet. + // Restrict to warpSize <= 32 for now. static auto activemask(warp::WarpGenericSycl const& warp) -> std::uint32_t { // SYCL has no way of querying this. Since sub-group functions have to be executed in convergent code // regions anyway we return the full mask. - auto const sub_group = warp.m_item.get_sub_group(); - return sycl::ext::oneapi::group_ballot(sub_group, true); + auto const sub_group = warp.m_item_warp.get_sub_group(); + auto const mask = sycl::ext::oneapi::group_ballot(sub_group, true); + // FIXME This should be std::uint64_t on AMD GCN architectures and on CPU, + // but the former is not targeted in alpaka and CPU case is not supported in SYCL yet. + // Restrict to warpSize <= 32 for now. + std::uint32_t bits = 0; + mask.extract_bits(bits); + return bits; } }; @@ -57,7 +66,7 @@ namespace alpaka::warp::trait { static auto all(warp::WarpGenericSycl const& warp, std::int32_t predicate) -> std::int32_t { - auto const sub_group = warp.m_item.get_sub_group(); + auto const sub_group = warp.m_item_warp.get_sub_group(); return static_cast(sycl::all_of_group(sub_group, static_cast(predicate))); } }; @@ -67,7 +76,7 @@ namespace alpaka::warp::trait { static auto any(warp::WarpGenericSycl const& warp, std::int32_t predicate) -> std::int32_t { - auto const sub_group = warp.m_item.get_sub_group(); + auto const sub_group = warp.m_item_warp.get_sub_group(); return static_cast(sycl::any_of_group(sub_group, static_cast(predicate))); } }; @@ -75,10 +84,19 @@ namespace alpaka::warp::trait template struct Ballot> { - static auto ballot(warp::WarpGenericSycl const& warp, std::int32_t predicate) + // FIXME This should be std::uint64_t on AMD GCN architectures and on CPU, + // but the former is not targeted in alpaka and CPU case is not supported in SYCL yet. + // Restrict to warpSize <= 32 for now. + static auto ballot(warp::WarpGenericSycl const& warp, std::int32_t predicate) -> std::uint32_t { - auto const sub_group = warp.m_item.get_sub_group(); - return sycl::ext::oneapi::group_ballot(sub_group, static_cast(predicate)); + auto const sub_group = warp.m_item_warp.get_sub_group(); + auto const mask = sycl::ext::oneapi::group_ballot(sub_group, static_cast(predicate)); + // FIXME This should be std::uint64_t on AMD GCN architectures and on CPU, + // but the former is not targeted in alpaka and CPU case is not supported in SYCL yet. + // Restrict to warpSize <= 32 for now. + std::uint32_t bits = 0; + mask.extract_bits(bits); + return bits; } }; @@ -88,21 +106,20 @@ namespace alpaka::warp::trait template static auto shfl(warp::WarpGenericSycl const& warp, T value, std::int32_t srcLane, std::int32_t width) { + ALPAKA_ASSERT_OFFLOAD(width > 0); + ALPAKA_ASSERT_OFFLOAD(srcLane < width); + ALPAKA_ASSERT_OFFLOAD(srcLane >= 0); + /* If width < srcLane the sub-group needs to be split into assumed subdivisions. The first item of each subdivision has the assumed index 0. The srcLane index is relative to the subdivisions. Example: If we assume a sub-group size of 32 and a width of 16 we will receive two subdivisions: The first starts at sub-group index 0 and the second at sub-group index 16. For srcLane = 4 the first subdivision will access the value at sub-group index 4 and the second at sub-group index 20. */ - auto const actual_group = warp.m_item.get_sub_group(); - auto const actual_item_id = actual_group.get_local_linear_id(); - - auto const assumed_group_id = actual_item_id / width; - auto const assumed_item_id = actual_item_id % width; - - auto const assumed_src_id = static_cast(srcLane % width); - auto const actual_src_id = assumed_src_id + assumed_group_id * width; - + auto const actual_group = warp.m_item_warp.get_sub_group(); + auto const actual_item_id = static_cast(actual_group.get_local_linear_id()); + auto const actual_group_id = actual_item_id / width; + auto const actual_src_id = static_cast(srcLane + actual_group_id * width); auto const src = sycl::id<1>{actual_src_id}; return sycl::select_from_group(actual_group, value, src); diff --git a/include/alpaka/workdiv/WorkDivGenericSycl.hpp b/include/alpaka/workdiv/WorkDivGenericSycl.hpp index c6d1c114a3c9..26e00750e42d 100644 --- a/include/alpaka/workdiv/WorkDivGenericSycl.hpp +++ b/include/alpaka/workdiv/WorkDivGenericSycl.hpp @@ -1,4 +1,4 @@ -/* Copyright 2022 Jan Stephan +/* Copyright 2023 Jan Stephan, Luca Ferragina, Andrea Bocci, Aurora Perego * SPDX-License-Identifier: MPL-2.0 */ @@ -10,7 +10,7 @@ #ifdef ALPAKA_ACC_SYCL_ENABLED -# include +# include namespace alpaka { @@ -18,17 +18,19 @@ namespace alpaka template class WorkDivGenericSycl : public concepts::Implements> { + static_assert(TDim::value > 0, "The SYCL work division must have a dimension greater than zero."); + public: using WorkDivBase = WorkDivGenericSycl; WorkDivGenericSycl(Vec const& threadElemExtent, sycl::nd_item work_item) : m_threadElemExtent{threadElemExtent} - , my_item{work_item} + , m_item_workdiv{work_item} { } Vec const& m_threadElemExtent; - sycl::nd_item my_item; + sycl::nd_item m_item_workdiv; }; } // namespace alpaka @@ -55,20 +57,22 @@ namespace alpaka::trait //! \return The number of blocks in each dimension of the grid. static auto getWorkDiv(WorkDivGenericSycl const& workDiv) -> Vec { - if constexpr(TDim::value == 1) - return Vec{static_cast(workDiv.my_item.get_group_range(0))}; + if constexpr(TDim::value == 0) + return Vec{}; + else if constexpr(TDim::value == 1) + return Vec{static_cast(workDiv.m_item_workdiv.get_group_range(0))}; else if constexpr(TDim::value == 2) { return Vec{ - static_cast(workDiv.my_item.get_group_range(1)), - static_cast(workDiv.my_item.get_group_range(0))}; + static_cast(workDiv.m_item_workdiv.get_group_range(1)), + static_cast(workDiv.m_item_workdiv.get_group_range(0))}; } else { return Vec{ - static_cast(workDiv.my_item.get_group_range(2)), - static_cast(workDiv.my_item.get_group_range(1)), - static_cast(workDiv.my_item.get_group_range(0))}; + static_cast(workDiv.m_item_workdiv.get_group_range(2)), + static_cast(workDiv.m_item_workdiv.get_group_range(1)), + static_cast(workDiv.m_item_workdiv.get_group_range(0))}; } } }; @@ -80,20 +84,22 @@ namespace alpaka::trait //! \return The number of threads in each dimension of a block. static auto getWorkDiv(WorkDivGenericSycl const& workDiv) -> Vec { - if constexpr(TDim::value == 1) - return Vec{static_cast(workDiv.my_item.get_local_range(0))}; + if constexpr(TDim::value == 0) + return Vec{}; + else if constexpr(TDim::value == 1) + return Vec{static_cast(workDiv.m_item_workdiv.get_local_range(0))}; else if constexpr(TDim::value == 2) { return Vec{ - static_cast(workDiv.my_item.get_local_range(1)), - static_cast(workDiv.my_item.get_local_range(0))}; + static_cast(workDiv.m_item_workdiv.get_local_range(1)), + static_cast(workDiv.m_item_workdiv.get_local_range(0))}; } else { return Vec{ - static_cast(workDiv.my_item.get_local_range(2)), - static_cast(workDiv.my_item.get_local_range(1)), - static_cast(workDiv.my_item.get_local_range(0))}; + static_cast(workDiv.m_item_workdiv.get_local_range(2)), + static_cast(workDiv.m_item_workdiv.get_local_range(1)), + static_cast(workDiv.m_item_workdiv.get_local_range(0))}; } } }; @@ -102,7 +108,7 @@ namespace alpaka::trait template struct GetWorkDiv, origin::Thread, unit::Elems> { - //! \return The number of blocks in each dimension of the grid. + //! \return The number of elements in each dimension of the thread. static auto getWorkDiv(WorkDivGenericSycl const& workDiv) -> Vec { return workDiv.m_threadElemExtent; diff --git a/test/unit/acc/src/AccTagTest.cpp b/test/unit/acc/src/AccTagTest.cpp index f2d5f58ee45e..741f2e17385b 100644 --- a/test/unit/acc/src/AccTagTest.cpp +++ b/test/unit/acc/src/AccTagTest.cpp @@ -1,4 +1,4 @@ -/* Copyright 2023 Simeon Ehrig, Jan Stephan +/* Copyright 2023 Simeon Ehrig, Jan Stephan, Andrea Bocci * SPDX-License-Identifier: MPL-2.0 */ @@ -23,7 +23,7 @@ using TagList = std::tuple< alpaka::TagCpuOmp2Threads, alpaka::TagGpuCudaRt, alpaka::TagGpuHipRt, - alpaka::TagCpuSyclIntel, + alpaka::TagCpuSycl, alpaka::TagFpgaSyclIntel, alpaka::TagGpuSyclIntel>; @@ -35,7 +35,7 @@ using AccToTagMap = std::tuple< std::pair, alpaka::TagCpuOmp2Threads>, std::pair, alpaka::TagGpuCudaRt>, std::pair, alpaka::TagGpuHipRt>, - std::pair, alpaka::TagCpuSyclIntel>, + std::pair, alpaka::TagCpuSycl>, std::pair, alpaka::TagFpgaSyclIntel>, std::pair, alpaka::TagGpuSyclIntel>>; diff --git a/test/unit/atomic/src/AtomicTest.cpp b/test/unit/atomic/src/AtomicTest.cpp index b04fd281af6c..467dab651741 100644 --- a/test/unit/atomic/src/AtomicTest.cpp +++ b/test/unit/atomic/src/AtomicTest.cpp @@ -1,4 +1,5 @@ -/* Copyright 2023 Axel Hübl, Benjamin Worpitz, Matthias Werner, Jan Stephan, Bernhard Manfred Gruber, Antonio Di Pilato +/* Copyright 2023 Axel Hübl, Benjamin Worpitz, Matthias Werner, Sergei Bastrakov, René Widera, Jan Stephan, + * Bernhard Manfred Gruber, Antonio Di Pilato, Andrea Bocci * SPDX-License-Identifier: MPL-2.0 */ diff --git a/test/unit/mem/view/src/ViewStaticAccMem.cpp b/test/unit/mem/view/src/ViewStaticAccMem.cpp index f26a88fdd8a0..ff8f0e48b245 100644 --- a/test/unit/mem/view/src/ViewStaticAccMem.cpp +++ b/test/unit/mem/view/src/ViewStaticAccMem.cpp @@ -1,4 +1,4 @@ -/* Copyright 2022 Axel Huebl, Benjamin Worpitz, Matthias Werner, Bernhard Manfred Gruber, Jan Stephan +/* Copyright 2023 Axel Huebl, Benjamin Worpitz, Matthias Werner, Bernhard Manfred Gruber, Jan Stephan, Andrea Bocci * SPDX-License-Identifier: MPL-2.0 */ @@ -16,6 +16,8 @@ using Elem = std::uint32_t; using Dim = alpaka::DimInt<2u>; using Idx = std::uint32_t; +#if !defined(ALPAKA_ACC_SYCL_ENABLED) + // These forward declarations are only necessary when you want to access those variables // from a different compilation unit and should be moved to a common header. // Here they are used to silence clang`s -Wmissing-variable-declarations warning @@ -40,10 +42,13 @@ struct StaticDeviceMemoryTestKernel } }; +#endif // !defined(ALPAKA_ACC_SYCL_ENABLED) + using TestAccs = alpaka::test::EnabledAccs; TEMPLATE_LIST_TEST_CASE("staticDeviceMemoryGlobal", "[viewStaticAccMem]", TestAccs) { +#if !defined(ALPAKA_ACC_SYCL_ENABLED) using Acc = TestType; using DevAcc = alpaka::Dev; @@ -75,8 +80,16 @@ TEMPLATE_LIST_TEST_CASE("staticDeviceMemoryGlobal", "[viewStaticAccMem]", TestAc REQUIRE(fixture(kernel, alpaka::getPtrNative(viewConstantMemUninitialized))); } + +#else // !defined(ALPAKA_ACC_SYCL_ENABLED) + + WARN("The SYCL backend does not support global device variables."); + +#endif // !defined(ALPAKA_ACC_SYCL_ENABLED) } +#if !defined(ALPAKA_ACC_SYCL_ENABLED) + // These forward declarations are only necessary when you want to access those variables // from a different compilation unit and should be moved to a common header. // Here they are used to silence clang`s -Wmissing-variable-declarations warning @@ -84,8 +97,11 @@ TEMPLATE_LIST_TEST_CASE("staticDeviceMemoryGlobal", "[viewStaticAccMem]", TestAc extern ALPAKA_STATIC_ACC_MEM_GLOBAL Elem g_globalMemory2DUninitialized[3][2]; ALPAKA_STATIC_ACC_MEM_GLOBAL Elem g_globalMemory2DUninitialized[3][2]; +#endif // !defined(ALPAKA_ACC_SYCL_ENABLED) + TEMPLATE_LIST_TEST_CASE("staticDeviceMemoryConstant", "[viewStaticAccMem]", TestAccs) { +#if !defined(ALPAKA_ACC_SYCL_ENABLED) using Acc = TestType; using DevAcc = alpaka::Dev; @@ -117,4 +133,10 @@ TEMPLATE_LIST_TEST_CASE("staticDeviceMemoryConstant", "[viewStaticAccMem]", Test REQUIRE(fixture(kernel, alpaka::getPtrNative(viewGlobalMemUninitialized))); } + +#else // !defined(ALPAKA_ACC_SYCL_ENABLED) + + WARN("The SYCL backend does not support global device constants."); + +#endif // !defined(ALPAKA_ACC_SYCL_ENABLED) } diff --git a/test/unit/rand/src/RandTest.cpp b/test/unit/rand/src/RandTest.cpp index 7bea6e930ea6..05311d438965 100644 --- a/test/unit/rand/src/RandTest.cpp +++ b/test/unit/rand/src/RandTest.cpp @@ -1,5 +1,5 @@ /* Copyright 2023 Axel Hübl, Benjamin Worpitz, Matthias Werner, René Widera, Jan Stephan, Bernhard Manfred Gruber, - * Sergei Bastrakov + * Sergei Bastrakov, Andrea Bocci * SPDX-License-Identifier: MPL-2.0 */ @@ -60,7 +60,7 @@ class RandTestKernel auto genDefault = alpaka::rand::engine::createDefault(acc, 12345u, 6789u); genNumbers(acc, success, genDefault); -#if !defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !defined(ALPAKA_ACC_GPU_HIP_ENABLED) +#if !defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !defined(ALPAKA_ACC_SYCL_ENABLED) // TODO: These ifdefs are wrong: They will reduce the test to the // smallest common denominator from all enabled backends // std::random_device diff --git a/test/unit/warp/src/Activemask.cpp b/test/unit/warp/src/Activemask.cpp index afc492ffc9a7..8038606a4116 100644 --- a/test/unit/warp/src/Activemask.cpp +++ b/test/unit/warp/src/Activemask.cpp @@ -1,4 +1,4 @@ -/* Copyright 2023 Sergei Bastrakov, Bernhard Manfred Gruber, Jan Stephan +/* Copyright 2023 Sergei Bastrakov, Bernhard Manfred Gruber, Jan Stephan, Andrea Bocci, Aurora Perego * SPDX-License-Identifier: MPL-2.0 */ @@ -25,6 +25,7 @@ struct ActivemaskSingleThreadWarpTestKernel } }; +template struct ActivemaskMultipleThreadWarpTestKernel { ALPAKA_NO_HOST_ACC_WARNING @@ -53,6 +54,12 @@ struct ActivemaskMultipleThreadWarpTestKernel } }; +template +struct alpaka::trait::WarpSize, TAcc> + : std::integral_constant +{ +}; + TEMPLATE_LIST_TEST_CASE("activemask", "[warp]", alpaka::test::TestAccs) { using Acc = TestType; @@ -68,7 +75,7 @@ TEMPLATE_LIST_TEST_CASE("activemask", "[warp]", alpaka::test::TestAccs) if(scalar) { alpaka::test::KernelExecutionFixture fixture(alpaka::Vec::all(4)); - REQUIRE(fixture(ActivemaskSingleThreadWarpTestKernel{})); + CHECK(fixture(ActivemaskSingleThreadWarpTestKernel{})); } else { @@ -80,9 +87,41 @@ TEMPLATE_LIST_TEST_CASE("activemask", "[warp]", alpaka::test::TestAccs) auto const threadElementExtent = alpaka::Vec::ones(); auto workDiv = typename ExecutionFixture::WorkDiv{gridBlockExtent, blockThreadExtent, threadElementExtent}; auto fixture = ExecutionFixture{workDiv}; - ActivemaskMultipleThreadWarpTestKernel kernel; - for(auto inactiveThreadIdx = 0u; inactiveThreadIdx < warpExtent; inactiveThreadIdx++) - REQUIRE(fixture(kernel, inactiveThreadIdx)); + if(warpExtent == 4) + { + for(auto inactiveThreadIdx = 0u; inactiveThreadIdx < warpExtent; inactiveThreadIdx++) + { + CHECK(fixture(ActivemaskMultipleThreadWarpTestKernel<4>{}, inactiveThreadIdx)); + } + } + else if(warpExtent == 8) + { + for(auto inactiveThreadIdx = 0u; inactiveThreadIdx < warpExtent; inactiveThreadIdx++) + { + CHECK(fixture(ActivemaskMultipleThreadWarpTestKernel<8>{}, inactiveThreadIdx)); + } + } + else if(warpExtent == 16) + { + for(auto inactiveThreadIdx = 0u; inactiveThreadIdx < warpExtent; inactiveThreadIdx++) + { + CHECK(fixture(ActivemaskMultipleThreadWarpTestKernel<16>{}, inactiveThreadIdx)); + } + } + else if(warpExtent == 32) + { + for(auto inactiveThreadIdx = 0u; inactiveThreadIdx < warpExtent; inactiveThreadIdx++) + { + CHECK(fixture(ActivemaskMultipleThreadWarpTestKernel<32>{}, inactiveThreadIdx)); + } + } + else if(warpExtent == 64) + { + for(auto inactiveThreadIdx = 0u; inactiveThreadIdx < warpExtent; inactiveThreadIdx++) + { + CHECK(fixture(ActivemaskMultipleThreadWarpTestKernel<64>{}, inactiveThreadIdx)); + } + } } } } diff --git a/test/unit/warp/src/All.cpp b/test/unit/warp/src/All.cpp index 4ea233e3179b..3a61553496be 100644 --- a/test/unit/warp/src/All.cpp +++ b/test/unit/warp/src/All.cpp @@ -1,4 +1,4 @@ -/* Copyright 2023 Sergei Bastrakov, Bernhard Manfred Gruber, Jan Stephan +/* Copyright 2023 Sergei Bastrakov, Bernhard Manfred Gruber, Jan Stephan, Andrea Bocci, Aurora Perego * SPDX-License-Identifier: MPL-2.0 */ @@ -25,6 +25,7 @@ struct AllSingleThreadWarpTestKernel } }; +template struct AllMultipleThreadWarpTestKernel { ALPAKA_NO_HOST_ACC_WARNING @@ -57,6 +58,12 @@ struct AllMultipleThreadWarpTestKernel } }; +template +struct alpaka::trait::WarpSize, TAcc> + : std::integral_constant +{ +}; + TEMPLATE_LIST_TEST_CASE("all", "[warp]", alpaka::test::TestAccs) { using Acc = TestType; @@ -84,8 +91,26 @@ TEMPLATE_LIST_TEST_CASE("all", "[warp]", alpaka::test::TestAccs) auto const threadElementExtent = alpaka::Vec::ones(); auto workDiv = typename ExecutionFixture::WorkDiv{gridBlockExtent, blockThreadExtent, threadElementExtent}; auto fixture = ExecutionFixture{workDiv}; - AllMultipleThreadWarpTestKernel kernel; - REQUIRE(fixture(kernel)); + if(warpExtent == 4) + { + REQUIRE(fixture(AllMultipleThreadWarpTestKernel<4>{})); + } + else if(warpExtent == 8) + { + REQUIRE(fixture(AllMultipleThreadWarpTestKernel<8>{})); + } + else if(warpExtent == 16) + { + REQUIRE(fixture(AllMultipleThreadWarpTestKernel<16>{})); + } + else if(warpExtent == 32) + { + REQUIRE(fixture(AllMultipleThreadWarpTestKernel<32>{})); + } + else if(warpExtent == 64) + { + REQUIRE(fixture(AllMultipleThreadWarpTestKernel<64>{})); + } } } } diff --git a/test/unit/warp/src/Any.cpp b/test/unit/warp/src/Any.cpp index 72b247ecfe63..0f5a059d1c48 100644 --- a/test/unit/warp/src/Any.cpp +++ b/test/unit/warp/src/Any.cpp @@ -1,4 +1,4 @@ -/* Copyright 2023 Sergei Bastrakov, Bernhard Manfred Gruber, Jan Stephan, Aurora Perego +/* Copyright 2023 Sergei Bastrakov, Bernhard Manfred Gruber, Jan Stephan, Andrea Bocci, Aurora Perego * SPDX-License-Identifier: MPL-2.0 */ @@ -25,6 +25,7 @@ struct AnySingleThreadWarpTestKernel } }; +template struct AnyMultipleThreadWarpTestKernel { ALPAKA_NO_HOST_ACC_WARNING @@ -57,6 +58,12 @@ struct AnyMultipleThreadWarpTestKernel } }; +template +struct alpaka::trait::WarpSize, TAcc> + : std::integral_constant +{ +}; + TEMPLATE_LIST_TEST_CASE("any", "[warp]", alpaka::test::TestAccs) { using Acc = TestType; @@ -84,8 +91,26 @@ TEMPLATE_LIST_TEST_CASE("any", "[warp]", alpaka::test::TestAccs) auto const threadElementExtent = alpaka::Vec::ones(); auto workDiv = typename ExecutionFixture::WorkDiv{gridBlockExtent, blockThreadExtent, threadElementExtent}; auto fixture = ExecutionFixture{workDiv}; - AnyMultipleThreadWarpTestKernel kernel; - REQUIRE(fixture(kernel)); + if(warpExtent == 4) + { + REQUIRE(fixture(AnyMultipleThreadWarpTestKernel<4>{})); + } + else if(warpExtent == 8) + { + REQUIRE(fixture(AnyMultipleThreadWarpTestKernel<8>{})); + } + else if(warpExtent == 16) + { + REQUIRE(fixture(AnyMultipleThreadWarpTestKernel<16>{})); + } + else if(warpExtent == 32) + { + REQUIRE(fixture(AnyMultipleThreadWarpTestKernel<32>{})); + } + else if(warpExtent == 64) + { + REQUIRE(fixture(AnyMultipleThreadWarpTestKernel<64>{})); + } } } } diff --git a/test/unit/warp/src/Ballot.cpp b/test/unit/warp/src/Ballot.cpp index cf5b59d79445..0525928c275c 100644 --- a/test/unit/warp/src/Ballot.cpp +++ b/test/unit/warp/src/Ballot.cpp @@ -1,4 +1,4 @@ -/* Copyright 2023 Sergei Bastrakov, Bernhard Manfred Gruber, Jan Stephan +/* Copyright 2023 Sergei Bastrakov, Bernhard Manfred Gruber, Jan Stephan, Andrea Bocci, Aurora Perego * SPDX-License-Identifier: MPL-2.0 */ @@ -26,6 +26,7 @@ struct BallotSingleThreadWarpTestKernel } }; +template struct BallotMultipleThreadWarpTestKernel { ALPAKA_NO_HOST_ACC_WARNING @@ -65,6 +66,12 @@ struct BallotMultipleThreadWarpTestKernel } }; +template +struct alpaka::trait::WarpSize, TAcc> + : std::integral_constant +{ +}; + TEMPLATE_LIST_TEST_CASE("ballot", "[warp]", alpaka::test::TestAccs) { using Acc = TestType; @@ -92,8 +99,26 @@ TEMPLATE_LIST_TEST_CASE("ballot", "[warp]", alpaka::test::TestAccs) auto const threadElementExtent = alpaka::Vec::ones(); auto workDiv = typename ExecutionFixture::WorkDiv{gridBlockExtent, blockThreadExtent, threadElementExtent}; auto fixture = ExecutionFixture{workDiv}; - BallotMultipleThreadWarpTestKernel kernel; - REQUIRE(fixture(kernel)); + if(warpExtent == 4) + { + REQUIRE(fixture(BallotMultipleThreadWarpTestKernel<4>{})); + } + else if(warpExtent == 8) + { + REQUIRE(fixture(BallotMultipleThreadWarpTestKernel<8>{})); + } + else if(warpExtent == 16) + { + REQUIRE(fixture(BallotMultipleThreadWarpTestKernel<16>{})); + } + else if(warpExtent == 32) + { + REQUIRE(fixture(BallotMultipleThreadWarpTestKernel<32>{})); + } + else if(warpExtent == 64) + { + REQUIRE(fixture(BallotMultipleThreadWarpTestKernel<64>{})); + } } } } diff --git a/test/unit/warp/src/GetSize.cpp b/test/unit/warp/src/GetSize.cpp index 5dbbaa018bba..d8c2b5ce3a84 100644 --- a/test/unit/warp/src/GetSize.cpp +++ b/test/unit/warp/src/GetSize.cpp @@ -1,4 +1,4 @@ -/* Copyright 2022 Sergei Bastrakov, Bernhard Manfred Gruber, Jan Stephan +/* Copyright 2022 Sergei Bastrakov, Bernhard Manfred Gruber, Jan Stephan, Aurora Perego * SPDX-License-Identifier: MPL-2.0 */ @@ -12,6 +12,7 @@ #include +template struct GetSizeTestKernel { ALPAKA_NO_HOST_ACC_WARNING @@ -22,6 +23,11 @@ struct GetSizeTestKernel } }; +template +struct alpaka::trait::WarpSize, TAcc> : std::integral_constant +{ +}; + TEMPLATE_LIST_TEST_CASE("getSize", "[warp]", alpaka::test::TestAccs) { using Acc = TestType; @@ -37,6 +43,26 @@ TEMPLATE_LIST_TEST_CASE("getSize", "[warp]", alpaka::test::TestAccs) [](std::size_t ws) { alpaka::test::KernelExecutionFixture fixture(alpaka::Vec::all(8)); - return fixture(GetSizeTestKernel{}, static_cast(ws)); + if(ws == 4) + { + return fixture(GetSizeTestKernel<4>{}, static_cast(ws)); + } + else if(ws == 8) + { + return fixture(GetSizeTestKernel<8>{}, static_cast(ws)); + } + else if(ws == 16) + { + return fixture(GetSizeTestKernel<16>{}, static_cast(ws)); + } + else if(ws == 32) + { + return fixture(GetSizeTestKernel<32>{}, static_cast(ws)); + } + else if(ws == 64) + { + return fixture(GetSizeTestKernel<64>{}, static_cast(ws)); + } + return fixture(GetSizeTestKernel<0>{}, static_cast(ws)); })); } diff --git a/test/unit/warp/src/Shfl.cpp b/test/unit/warp/src/Shfl.cpp index 63f5b81a5f77..4a35ba4480a3 100644 --- a/test/unit/warp/src/Shfl.cpp +++ b/test/unit/warp/src/Shfl.cpp @@ -1,4 +1,4 @@ -/* Copyright 2023 David M. Rogers, Jan Stephan +/* Copyright 2023 David M. Rogers, Jan Stephan, Andrea Bocci, Aurora Perego * SPDX-License-Identifier: MPL-2.0 */ @@ -35,6 +35,7 @@ struct ShflSingleThreadWarpTestKernel } }; +template struct ShflMultipleThreadWarpTestKernel { ALPAKA_NO_HOST_ACC_WARNING @@ -87,6 +88,12 @@ struct ShflMultipleThreadWarpTestKernel } }; +template +struct alpaka::trait::WarpSize, TAcc> + : std::integral_constant +{ +}; + TEMPLATE_LIST_TEST_CASE("shfl", "[warp]", alpaka::test::TestAccs) { using Acc = TestType; @@ -114,8 +121,26 @@ TEMPLATE_LIST_TEST_CASE("shfl", "[warp]", alpaka::test::TestAccs) auto const threadElementExtent = alpaka::Vec::ones(); auto workDiv = typename ExecutionFixture::WorkDiv{gridBlockExtent, blockThreadExtent, threadElementExtent}; auto fixture = ExecutionFixture{workDiv}; - ShflMultipleThreadWarpTestKernel kernel; - REQUIRE(fixture(kernel)); + if(warpExtent == 4) + { + REQUIRE(fixture(ShflMultipleThreadWarpTestKernel<4>{})); + } + else if(warpExtent == 8) + { + REQUIRE(fixture(ShflMultipleThreadWarpTestKernel<8>{})); + } + else if(warpExtent == 16) + { + REQUIRE(fixture(ShflMultipleThreadWarpTestKernel<16>{})); + } + else if(warpExtent == 32) + { + REQUIRE(fixture(ShflMultipleThreadWarpTestKernel<32>{})); + } + else if(warpExtent == 64) + { + REQUIRE(fixture(ShflMultipleThreadWarpTestKernel<64>{})); + } } } } diff --git a/thirdParty/CMakeLists.txt b/thirdParty/CMakeLists.txt index b061c9b0678a..ae1347167e8e 100644 --- a/thirdParty/CMakeLists.txt +++ b/thirdParty/CMakeLists.txt @@ -1,5 +1,5 @@ # -# Copyright 2023 Jan Stephan +# Copyright 2023 Jan Stephan, Bernhard Manfred Gruber, Andrea Bocci # SPDX-License-Identifier: MPL-2.0 #