diff --git a/HeterogeneousCore/AlpakaInterface/interface/atomicMaxF.h b/HeterogeneousCore/AlpakaInterface/interface/atomicMaxF.h
index 726f769f70a49..cc61a405897fb 100644
--- a/HeterogeneousCore/AlpakaInterface/interface/atomicMaxF.h
+++ b/HeterogeneousCore/AlpakaInterface/interface/atomicMaxF.h
@@ -1,13 +1,16 @@
-#ifndef HeterogeneousCore_AlpakaCore_interface_atomicMaxF_h
-#define HeterogeneousCore_AlpakaCore_interface_atomicMaxF_h
+#ifndef HeterogeneousCore_AlpakaInterface_interface_atomicMaxF_h
+#define HeterogeneousCore_AlpakaInterface_interface_atomicMaxF_h
+
 #include <alpaka/alpaka.hpp>
 
 #include "FWCore/Utilities/interface/bit_cast.h"
-#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
 
-#if defined(__CUDA_ARCH__) or defined(__HIP_DEVICE_COMPILE__)
+// FIXME: this should be rewritten using the correct template specialisation for the different accelerator types
+
 template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
-static __device__ __forceinline__ float atomicMaxF(const TAcc& acc, float* address, float val) {
+ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE static float atomicMaxF(const TAcc& acc, float* address, float val) {
+#if defined(__CUDA_ARCH__) or defined(__HIP_DEVICE_COMPILE__)
+  // GPU implementation uses __float_as_int / __int_as_float
   int ret = __float_as_int(*address);
   while (val > __int_as_float(ret)) {
     int old = ret;
@@ -15,10 +18,7 @@ static __device__ __forceinline__ float atomicMaxF(const TAcc& acc, float* addre
       break;
   }
   return __int_as_float(ret);
-}
 #else
-template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
-ALPAKA_FN_ACC ALPAKA_FN_INLINE static float atomicMaxF(const TAcc& acc, float* address, float val) {
   // CPU implementation uses edm::bit_cast
   int ret = edm::bit_cast<int>(*address);
   while (val > edm::bit_cast<float>(ret)) {
@@ -27,7 +27,7 @@ ALPAKA_FN_ACC ALPAKA_FN_INLINE static float atomicMaxF(const TAcc& acc, float* a
       break;
   }
   return edm::bit_cast<float>(ret);
-}
 #endif  // __CUDA_ARCH__ or __HIP_DEVICE_COMPILE__
+}
 
-#endif  // HeterogeneousCore_AlpakaCore_interface_atomicMaxF_h
+#endif  // HeterogeneousCore_AlpakaInterface_interface_atomicMaxF_h