From 06d5eefaffca64e9b99f3a88b3f88f2203fa3243 Mon Sep 17 00:00:00 2001 From: Michael Klemm Date: Sun, 1 Oct 2023 17:28:22 +0200 Subject: [PATCH 01/10] Add stub functions and exports for omp_target_memset() --- openmp/libomptarget/include/omptarget.h | 1 + openmp/libomptarget/src/api.cpp | 11 +++++ openmp/libomptarget/src/exports | 2 + .../libomptarget/test/api/omp_target_memset.c | 45 +++++++++++++++++++ openmp/runtime/src/dllexports | 2 + openmp/runtime/src/include/omp.h.var | 5 +++ openmp/runtime/src/include/omp_lib.f90.var | 22 +++++++++ openmp/runtime/src/include/omp_lib.h.var | 22 +++++++++ openmp/runtime/src/kmp_ftn_os.h | 2 + 9 files changed, 112 insertions(+) create mode 100644 openmp/libomptarget/test/api/omp_target_memset.c diff --git a/openmp/libomptarget/include/omptarget.h b/openmp/libomptarget/include/omptarget.h index f87557a69eff27..e1f0f77849fa20 100644 --- a/openmp/libomptarget/include/omptarget.h +++ b/openmp/libomptarget/include/omptarget.h @@ -312,6 +312,7 @@ int omp_target_memcpy_rect(void *Dst, const void *Src, size_t ElementSize, const size_t *DstDimensions, const size_t *SrcDimensions, int DstDevice, int SrcDevice); +void *omp_target_memset(void *Ptr, int C, size_t N, int DeviceNum); int omp_target_associate_ptr(const void *HostPtr, const void *DevicePtr, size_t Size, size_t DeviceOffset, int DeviceNum); int omp_target_disassociate_ptr(const void *HostPtr, int DeviceNum); diff --git a/openmp/libomptarget/src/api.cpp b/openmp/libomptarget/src/api.cpp index 942df8fdb94d66..595b234d535c8e 100644 --- a/openmp/libomptarget/src/api.cpp +++ b/openmp/libomptarget/src/api.cpp @@ -241,6 +241,17 @@ static int libomp_target_memcpy_async_helper(kmp_int32 Gtid, kmp_task_t *Task) { return Rc; } +EXTERN void * omp_target_memset(void * Ptr, int C, size_t N, int DeviceNum) { + return nullptr; +} + +EXTERN void * omp_target_memset_async(void * Ptr, int C, size_t N, int DeviceNum, + int DepObjCount, + omp_depend_t * DepObjList) { + return nullptr; +} + + // Allocate and launch helper task static int libomp_helper_task_creation(TargetMemcpyArgsTy *Args, int DepObjCount, diff --git a/openmp/libomptarget/src/exports b/openmp/libomptarget/src/exports index c29c8d03fb1276..af882a26426472 100644 --- a/openmp/libomptarget/src/exports +++ b/openmp/libomptarget/src/exports @@ -44,6 +44,8 @@ VERS1.0 { omp_target_memcpy_rect; omp_target_memcpy_async; omp_target_memcpy_rect_async; + omp_target_memset; + omp_target_memset_async; omp_target_associate_ptr; omp_target_disassociate_ptr; llvm_omp_target_alloc_host; diff --git a/openmp/libomptarget/test/api/omp_target_memset.c b/openmp/libomptarget/test/api/omp_target_memset.c new file mode 100644 index 00000000000000..ea0c6d73fbdeef --- /dev/null +++ b/openmp/libomptarget/test/api/omp_target_memset.c @@ -0,0 +1,45 @@ +// RUN: %libomptarget-compile-and-run-generic + +#include "stdio.h" +#include +#include + +int main() { + int d = omp_get_default_device(); + int id = omp_get_initial_device(); + int q[128], i; + void *p; + void *result; + + if (d < 0 || d >= omp_get_num_devices()) + d = id; + + p = omp_target_alloc(130 * sizeof(int), d); + if (p == NULL) + return 0; + + for (i = 0; i < 128; i++) + q[i] = i; + + result = omp_target_memset(p, 0, 130 * sizeof(int), d); + if (result != p) { + abort(); + } + + int q2[128]; + for (i = 0; i < 128; ++i) + q2[i] = i; + if (omp_target_memcpy_async(q2, p, 128 * sizeof(int), 0, sizeof(int), id, d, + 0, NULL)) + abort(); + +#pragma omp taskwait + + for (i = 0; i < 128; ++i) + if (q2[i] != 0) + abort(); + + omp_target_free(p, d); + + return 0; +} diff --git a/openmp/runtime/src/dllexports b/openmp/runtime/src/dllexports index e69cf6670e8148..0d49643709e0a0 100644 --- a/openmp/runtime/src/dllexports +++ b/openmp/runtime/src/dllexports @@ -518,6 +518,8 @@ kmp_set_warnings_off 780 omp_target_memcpy_rect 887 omp_target_associate_ptr 888 omp_target_disassociate_ptr 889 + omp_target_memset 3000 + omp_target_memset_async 3001 %endif kmp_set_disp_num_buffers 890 diff --git a/openmp/runtime/src/include/omp.h.var b/openmp/runtime/src/include/omp.h.var index 1b2c467a2a12d8..f372402be37d39 100644 --- a/openmp/runtime/src/include/omp.h.var +++ b/openmp/runtime/src/include/omp.h.var @@ -236,6 +236,11 @@ extern int __KAI_KMPC_CONVENTION omp_target_memcpy_rect_async(void *, const void *, size_t, int, const size_t *, const size_t *, const size_t *, const size_t *, const size_t *, int, int, int, omp_depend_t *); + + /* OpenMP 6.0 device memory routines */ + extern void * __KAI_KMPC_CONVENTION omp_target_memset(void *, int, size_t, int); + extern void * __KAI_KMPC_CONVENTION omp_target_memset_async(void *, int, size_t, int, int, omp_depend_t *); + /*! * The `omp_get_mapped_ptr` routine returns the device pointer that is associated with a host pointer for a given device. */ diff --git a/openmp/runtime/src/include/omp_lib.f90.var b/openmp/runtime/src/include/omp_lib.f90.var index c72287422809ae..1ca542db3767ef 100644 --- a/openmp/runtime/src/include/omp_lib.f90.var +++ b/openmp/runtime/src/include/omp_lib.f90.var @@ -635,6 +635,28 @@ integer (omp_depend_kind), optional :: depobj_list(*) end function omp_target_memcpy_rect_async + function omp_target_memset(ptr, val, count, device_num) bind(c) + use, intrinsic :: iso_c_binding, only : c_ptr, c_int, c_size_t + type(c_ptr) :: omp_target_memset + type(c_ptr), value :: ptr + integer(c_int), value :: val + integer(c_size_t), value :: count + integer(c_int), value :: device_num + end function + + function omp_target_memset_async(ptr, val, count, device_num, & + depobj_count, depobj_list) bind(c) + use, intrinsic :: iso_c_binding, only : c_ptr, c_int, c_size_t + use omp_lib_kinds + type(c_ptr) :: omp_target_memset_async + type(c_ptr), value :: ptr + integer(c_int), value :: val + integer(c_size_t), value :: count + integer(c_int), value :: device_num + integer(c_int), value :: depobj_count + integer(omp_depend_kind), optional :: depobj_list(*) + end function + function omp_target_associate_ptr(host_ptr, device_ptr, size, & device_offset, device_num) bind(c) use omp_lib_kinds diff --git a/openmp/runtime/src/include/omp_lib.h.var b/openmp/runtime/src/include/omp_lib.h.var index 9f5e58515e7515..d20aade6ef8b32 100644 --- a/openmp/runtime/src/include/omp_lib.h.var +++ b/openmp/runtime/src/include/omp_lib.h.var @@ -732,6 +732,28 @@ integer(omp_depend_kind), optional :: depobj_list(*) end function omp_target_memcpy_rect_async + function omp_target_memset(ptr, val, count, device_num) bind(c) + use, intrinsic :: iso_c_binding, only : c_ptr, c_int, c_size_t + type(c_ptr) :: omp_target_memset + type(c_ptr), value :: ptr + integer(c_int), value :: val + integer(c_size_t), value :: count + integer(c_int), value :: device_num + end function + + function omp_target_memset_async(ptr, val, count, device_num, & + depobj_count, depobj_list) bind(c) + use, intrinsic :: iso_c_binding, only : c_ptr, c_int, c_size_t + use omp_lib_kinds + type(c_ptr) :: omp_target_memset_async + type(c_ptr), value :: ptr + integer(c_int), value :: val + integer(c_size_t), value :: count + integer(c_int), value :: device_num + integer(c_int), value :: depobj_count + integer(omp_depend_kind), optional :: depobj_list(*) + end function + function omp_target_associate_ptr(host_ptr, device_ptr, size, & & device_offset, device_num) bind(c) use, intrinsic :: iso_c_binding, only : c_ptr, c_size_t, c_int diff --git a/openmp/runtime/src/kmp_ftn_os.h b/openmp/runtime/src/kmp_ftn_os.h index d37c9c86028eb1..7d595b947f4a9d 100644 --- a/openmp/runtime/src/kmp_ftn_os.h +++ b/openmp/runtime/src/kmp_ftn_os.h @@ -116,6 +116,8 @@ #define FTN_TARGET_IS_PRESENT omp_target_is_present #define FTN_TARGET_MEMCPY omp_target_memcpy #define FTN_TARGET_MEMCPY_RECT omp_target_memcpy_rect +#define FTN_TARGET_MEMSET omp_target_memset +#define FTN_TARGET_MEMSET_ASYNC omp_target_memset_async #define FTN_TARGET_ASSOCIATE_PTR omp_target_associate_ptr #define FTN_TARGET_DISASSOCIATE_PTR omp_target_disassociate_ptr #endif From c1791d6b9509045b38f16b77959fdc0b4d5b7b78 Mon Sep 17 00:00:00 2001 From: Michael Klemm Date: Mon, 2 Oct 2023 16:14:55 +0200 Subject: [PATCH 02/10] Implement a slow-path version of omp_target_memset*() There is a TODO to implement a fast path that uses an on-device kernel instead of the host-based memory fill operation. This may require some additional plumbing to have kernels in libomptarget.so --- openmp/libomptarget/src/api.cpp | 116 +++++++++++++++++++++++++++--- openmp/libomptarget/src/private.h | 13 ++++ 2 files changed, 121 insertions(+), 8 deletions(-) diff --git a/openmp/libomptarget/src/api.cpp b/openmp/libomptarget/src/api.cpp index 595b234d535c8e..c8675b698544f3 100644 --- a/openmp/libomptarget/src/api.cpp +++ b/openmp/libomptarget/src/api.cpp @@ -241,21 +241,121 @@ static int libomp_target_memcpy_async_helper(kmp_int32 Gtid, kmp_task_t *Task) { return Rc; } +static int libomp_target_memset_async_helper(kmp_int32 Gtid, kmp_task_t *Task) { + if (!Task) { + return OFFLOAD_FAIL; + } + + auto *Args = reinterpret_cast(Task->shareds); + if (!Args) { + return OFFLOAD_FAIL; + } + + // call omp_target_memset() + omp_target_memset(Args->Ptr, Args->C, Args->N, Args->DeviceNum); + + delete Args; + + return OFFLOAD_SUCCESS; +} + +static int libomp_helper_memset_task_creation(TargetMemsetArgsTy *Args, + int DepObjCount, + omp_depend_t * DepObjList) { + // Create global thread ID + int Gtid = __kmpc_global_thread_num(nullptr); + int (*Fn)(kmp_int32, kmp_task_t *) = &libomp_target_memset_async_helper; + + // Setup the hidden helper flags + kmp_int32 Flags = 0; + kmp_tasking_flags_t *InputFlags = (kmp_tasking_flags_t *)&Flags; + InputFlags->hidden_helper = 1; + + // Alloc the helper task + kmp_task_t *Task = __kmpc_omp_target_task_alloc(nullptr, Gtid, Flags, + sizeof(kmp_task_t), 0, Fn, -1); + if (!Task) { + delete Args; + return OFFLOAD_FAIL; + } + + // Setup the arguments for the helper task + Task->shareds = Args; + + // Convert types of depend objects + llvm::SmallVector DepObjs; + for (int i = 0; i < DepObjCount; i++) { + omp_depend_t DepObj = DepObjList[i]; + DepObjs.push_back(*((kmp_depend_info_t *)DepObj)); + } + + // Launch the helper task + int Rc = __kmpc_omp_task_with_deps(nullptr, Gtid, Task, DepObjCount, + DepObjs.data(), 0, nullptr); + + return Rc; +} + EXTERN void * omp_target_memset(void * Ptr, int C, size_t N, int DeviceNum) { - return nullptr; + TIMESCOPE(); + DP("Call to omp_target_memset, device %d, device pointer %p, size %zu\n", + DeviceNum, Ptr, N); + + // Behave as a no-op if N==0 or if Ptr is nullptr (as a useful implementation + // of unspecified behavior, see OpenMP spec). + if (!Ptr || N == 0) { + return Ptr; + } + + if (DeviceNum == omp_get_initial_device()) { + DP("filling memory on host via memset"); + memset(Ptr, C, N); // ignore return value, memset() cannot fail + } + else { + // TODO: replace the omp_target_memset() slow path with the fast path. + // That will require the ability to execute a kernel from within + // libomptarget.so (which we do not have at the moment). + + // This is a very slow path: create a filled array on the host and upload + // it to the GPU device. + int InitialDevice = omp_get_initial_device(); + void * Shadow = omp_target_alloc(N, InitialDevice); + memset(Shadow, C, N); + omp_target_memcpy(Ptr, Shadow, N, 0, 0, DeviceNum, InitialDevice); + omp_target_free(Shadow, InitialDevice); + } + + DP("omp_target_memset returns %p\n", Ptr); + return Ptr; } + EXTERN void * omp_target_memset_async(void * Ptr, int C, size_t N, int DeviceNum, int DepObjCount, omp_depend_t * DepObjList) { - return nullptr; -} + DP("Call to omp_target_memset_async, device %d, device pointer %p, size %zu", + DeviceNum, Ptr, N); + + // Behave as a no-op if N==0 or if Ptr is nullptr (as a useful implementation + // of unspecified behavior, see OpenMP spec). + if (!Ptr || N == 0) { + return Ptr; + } + // Create the task object to deal with the async invocation + auto *Args = new TargetMemsetArgsTy{Ptr, C, N, DeviceNum}; + + // omp_target_memset_async() cannot fail via a return code, so ignore the + // return code of the helper function + (void) libomp_helper_memset_task_creation(Args, DepObjCount, DepObjList); + + return Ptr; +} // Allocate and launch helper task -static int libomp_helper_task_creation(TargetMemcpyArgsTy *Args, - int DepObjCount, - omp_depend_t *DepObjList) { +static int libomp_helper_memcpy_task_creation(TargetMemcpyArgsTy *Args, + int DepObjCount, + omp_depend_t *DepObjList) { // Create global thread ID int Gtid = __kmpc_global_thread_num(nullptr); int (*Fn)(kmp_int32, kmp_task_t *) = &libomp_target_memcpy_async_helper; @@ -313,7 +413,7 @@ EXTERN int omp_target_memcpy_async(void *Dst, const void *Src, size_t Length, Dst, Src, Length, DstOffset, SrcOffset, DstDevice, SrcDevice); // Create and launch helper task - int Rc = libomp_helper_task_creation(Args, DepObjCount, DepObjList); + int Rc = libomp_helper_memcpy_task_creation(Args, DepObjCount, DepObjList); DP("omp_target_memcpy_async returns %d\n", Rc); return Rc; @@ -410,7 +510,7 @@ EXTERN int omp_target_memcpy_rect_async( DstDimensions, SrcDimensions, DstDevice, SrcDevice); // Create and launch helper task - int Rc = libomp_helper_task_creation(Args, DepObjCount, DepObjList); + int Rc = libomp_helper_memcpy_task_creation(Args, DepObjCount, DepObjList); DP("omp_target_memcpy_rect_async returns %d\n", Rc); return Rc; diff --git a/openmp/libomptarget/src/private.h b/openmp/libomptarget/src/private.h index cbce15b63a3eba..ec1ecd5503e14f 100644 --- a/openmp/libomptarget/src/private.h +++ b/openmp/libomptarget/src/private.h @@ -253,6 +253,19 @@ struct TargetMemcpyArgsTy { DstOffsets(DstOffsets), SrcOffsets(SrcOffsets), DstDimensions(DstDimensions), SrcDimensions(SrcDimensions){}; }; + +struct TargetMemsetArgsTy { + /** + * Common attributes of a memset operation + */ + void *Ptr; + int C; + size_t N; + int DeviceNum; + + // no constructors defined, because this is a PoD +}; + // Invalid GTID as defined by libomp; keep in sync #define KMP_GTID_DNE (-2) #ifdef __cplusplus From 97dba336319777ee3c06ed76be2dc41ee87f01cf Mon Sep 17 00:00:00 2001 From: Michael Klemm Date: Wed, 4 Oct 2023 16:05:38 +0200 Subject: [PATCH 03/10] Refactor code a bit to reduce code duplication --- openmp/libomptarget/src/api.cpp | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/openmp/libomptarget/src/api.cpp b/openmp/libomptarget/src/api.cpp index c8675b698544f3..baf5878595b123 100644 --- a/openmp/libomptarget/src/api.cpp +++ b/openmp/libomptarget/src/api.cpp @@ -259,6 +259,16 @@ static int libomp_target_memset_async_helper(kmp_int32 Gtid, kmp_task_t *Task) { return OFFLOAD_SUCCESS; } +static inline +void ConvertDepObjVector(llvm::SmallVector &Vec, + int DepObjCount, omp_depend_t * DepObjList) { + for (int i = 0; i < DepObjCount; i++) { + omp_depend_t DepObj = DepObjList[i]; + Vec.push_back(*((kmp_depend_info_t *)DepObj)); + } +} + + static int libomp_helper_memset_task_creation(TargetMemsetArgsTy *Args, int DepObjCount, omp_depend_t * DepObjList) { @@ -284,10 +294,7 @@ static int libomp_helper_memset_task_creation(TargetMemsetArgsTy *Args, // Convert types of depend objects llvm::SmallVector DepObjs; - for (int i = 0; i < DepObjCount; i++) { - omp_depend_t DepObj = DepObjList[i]; - DepObjs.push_back(*((kmp_depend_info_t *)DepObj)); - } + ConvertDepObjVector(DepObjs, DepObjCount, DepObjList); // Launch the helper task int Rc = __kmpc_omp_task_with_deps(nullptr, Gtid, Task, DepObjCount, @@ -381,10 +388,7 @@ static int libomp_helper_memcpy_task_creation(TargetMemcpyArgsTy *Args, // Convert the type of depend objects llvm::SmallVector DepObjs; - for (int i = 0; i < DepObjCount; i++) { - omp_depend_t DepObj = DepObjList[i]; - DepObjs.push_back(*((kmp_depend_info_t *)DepObj)); - } + ConvertDepObjVector(DepObjs, DepObjCount, DepObjList); // Launch the helper task int Rc = __kmpc_omp_task_with_deps(nullptr, Gtid, Ptr, DepObjCount, From 33e7eb64f56a8e18eb6498a332b98fe8a89f70c2 Mon Sep 17 00:00:00 2001 From: Michael Klemm Date: Fri, 6 Oct 2023 14:44:42 +0200 Subject: [PATCH 04/10] Apply formatting rules --- openmp/libomptarget/src/api.cpp | 39 +++++++++++++++------------------ 1 file changed, 18 insertions(+), 21 deletions(-) diff --git a/openmp/libomptarget/src/api.cpp b/openmp/libomptarget/src/api.cpp index baf5878595b123..0b8a21652ee9b2 100644 --- a/openmp/libomptarget/src/api.cpp +++ b/openmp/libomptarget/src/api.cpp @@ -259,19 +259,18 @@ static int libomp_target_memset_async_helper(kmp_int32 Gtid, kmp_task_t *Task) { return OFFLOAD_SUCCESS; } -static inline -void ConvertDepObjVector(llvm::SmallVector &Vec, - int DepObjCount, omp_depend_t * DepObjList) { - for (int i = 0; i < DepObjCount; i++) { +static inline void +ConvertDepObjVector(llvm::SmallVector &Vec, int DepObjCount, + omp_depend_t *DepObjList) { + for (int i = 0; i < DepObjCount; ++i) { omp_depend_t DepObj = DepObjList[i]; Vec.push_back(*((kmp_depend_info_t *)DepObj)); } } - static int libomp_helper_memset_task_creation(TargetMemsetArgsTy *Args, int DepObjCount, - omp_depend_t * DepObjList) { + omp_depend_t *DepObjList) { // Create global thread ID int Gtid = __kmpc_global_thread_num(nullptr); int (*Fn)(kmp_int32, kmp_task_t *) = &libomp_target_memset_async_helper; @@ -282,8 +281,8 @@ static int libomp_helper_memset_task_creation(TargetMemsetArgsTy *Args, InputFlags->hidden_helper = 1; // Alloc the helper task - kmp_task_t *Task = __kmpc_omp_target_task_alloc(nullptr, Gtid, Flags, - sizeof(kmp_task_t), 0, Fn, -1); + kmp_task_t *Task = __kmpc_omp_target_task_alloc( + nullptr, Gtid, Flags, sizeof(kmp_task_t), 0, Fn, -1); if (!Task) { delete Args; return OFFLOAD_FAIL; @@ -303,7 +302,7 @@ static int libomp_helper_memset_task_creation(TargetMemsetArgsTy *Args, return Rc; } -EXTERN void * omp_target_memset(void * Ptr, int C, size_t N, int DeviceNum) { +EXTERN void *omp_target_memset(void *Ptr, int C, size_t N, int DeviceNum) { TIMESCOPE(); DP("Call to omp_target_memset, device %d, device pointer %p, size %zu\n", DeviceNum, Ptr, N); @@ -316,9 +315,8 @@ EXTERN void * omp_target_memset(void * Ptr, int C, size_t N, int DeviceNum) { if (DeviceNum == omp_get_initial_device()) { DP("filling memory on host via memset"); - memset(Ptr, C, N); // ignore return value, memset() cannot fail - } - else { + memset(Ptr, C, N); // ignore return value, memset() cannot fail + } else { // TODO: replace the omp_target_memset() slow path with the fast path. // That will require the ability to execute a kernel from within // libomptarget.so (which we do not have at the moment). @@ -326,20 +324,19 @@ EXTERN void * omp_target_memset(void * Ptr, int C, size_t N, int DeviceNum) { // This is a very slow path: create a filled array on the host and upload // it to the GPU device. int InitialDevice = omp_get_initial_device(); - void * Shadow = omp_target_alloc(N, InitialDevice); - memset(Shadow, C, N); - omp_target_memcpy(Ptr, Shadow, N, 0, 0, DeviceNum, InitialDevice); - omp_target_free(Shadow, InitialDevice); + void *Shadow = omp_target_alloc(N, InitialDevice); + (void)memset(Shadow, C, N); + (void)omp_target_memcpy(Ptr, Shadow, N, 0, 0, DeviceNum, InitialDevice); + (void)omp_target_free(Shadow, InitialDevice); } DP("omp_target_memset returns %p\n", Ptr); return Ptr; } - -EXTERN void * omp_target_memset_async(void * Ptr, int C, size_t N, int DeviceNum, - int DepObjCount, - omp_depend_t * DepObjList) { +EXTERN void *omp_target_memset_async(void *Ptr, int C, size_t N, int DeviceNum, + int DepObjCount, + omp_depend_t *DepObjList) { DP("Call to omp_target_memset_async, device %d, device pointer %p, size %zu", DeviceNum, Ptr, N); @@ -354,7 +351,7 @@ EXTERN void * omp_target_memset_async(void * Ptr, int C, size_t N, int DeviceNum // omp_target_memset_async() cannot fail via a return code, so ignore the // return code of the helper function - (void) libomp_helper_memset_task_creation(Args, DepObjCount, DepObjList); + (void)libomp_helper_memset_task_creation(Args, DepObjCount, DepObjList); return Ptr; } From 7933bfa051acaecd801eb95bddcdbae8456d06c3 Mon Sep 17 00:00:00 2001 From: Michael Klemm Date: Tue, 10 Oct 2023 17:16:01 +0200 Subject: [PATCH 05/10] (Partly) Address feedback by shiltian --- openmp/libomptarget/src/api.cpp | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/openmp/libomptarget/src/api.cpp b/openmp/libomptarget/src/api.cpp index 0b8a21652ee9b2..c8e74ee89bf142 100644 --- a/openmp/libomptarget/src/api.cpp +++ b/openmp/libomptarget/src/api.cpp @@ -242,9 +242,8 @@ static int libomp_target_memcpy_async_helper(kmp_int32 Gtid, kmp_task_t *Task) { } static int libomp_target_memset_async_helper(kmp_int32 Gtid, kmp_task_t *Task) { - if (!Task) { + if (!Task) return OFFLOAD_FAIL; - } auto *Args = reinterpret_cast(Task->shareds); if (!Args) { @@ -302,20 +301,20 @@ static int libomp_helper_memset_task_creation(TargetMemsetArgsTy *Args, return Rc; } -EXTERN void *omp_target_memset(void *Ptr, int C, size_t N, int DeviceNum) { +EXTERN void *omp_target_memset(void *Ptr, int ByteVal, size_t NumBytes, int DeviceNum) { TIMESCOPE(); DP("Call to omp_target_memset, device %d, device pointer %p, size %zu\n", - DeviceNum, Ptr, N); + DeviceNum, Ptr, NumBytes); // Behave as a no-op if N==0 or if Ptr is nullptr (as a useful implementation // of unspecified behavior, see OpenMP spec). - if (!Ptr || N == 0) { + if (!Ptr || NumBytes == 0) { return Ptr; } if (DeviceNum == omp_get_initial_device()) { DP("filling memory on host via memset"); - memset(Ptr, C, N); // ignore return value, memset() cannot fail + memset(Ptr, ByteVal, NumBytes); // ignore return value, memset() cannot fail } else { // TODO: replace the omp_target_memset() slow path with the fast path. // That will require the ability to execute a kernel from within @@ -324,9 +323,9 @@ EXTERN void *omp_target_memset(void *Ptr, int C, size_t N, int DeviceNum) { // This is a very slow path: create a filled array on the host and upload // it to the GPU device. int InitialDevice = omp_get_initial_device(); - void *Shadow = omp_target_alloc(N, InitialDevice); - (void)memset(Shadow, C, N); - (void)omp_target_memcpy(Ptr, Shadow, N, 0, 0, DeviceNum, InitialDevice); + void *Shadow = omp_target_alloc(NumBytes, InitialDevice); + (void)memset(Shadow, ByteVal, NumBytes); + (void)omp_target_memcpy(Ptr, Shadow, NumBytes, 0, 0, DeviceNum, InitialDevice); (void)omp_target_free(Shadow, InitialDevice); } @@ -334,20 +333,20 @@ EXTERN void *omp_target_memset(void *Ptr, int C, size_t N, int DeviceNum) { return Ptr; } -EXTERN void *omp_target_memset_async(void *Ptr, int C, size_t N, int DeviceNum, +EXTERN void *omp_target_memset_async(void *Ptr, int ByteVal, size_t NumBytes, int DeviceNum, int DepObjCount, omp_depend_t *DepObjList) { DP("Call to omp_target_memset_async, device %d, device pointer %p, size %zu", - DeviceNum, Ptr, N); + DeviceNum, Ptr, NumBytes); // Behave as a no-op if N==0 or if Ptr is nullptr (as a useful implementation // of unspecified behavior, see OpenMP spec). - if (!Ptr || N == 0) { + if (!Ptr || NumBytes == 0) { return Ptr; } // Create the task object to deal with the async invocation - auto *Args = new TargetMemsetArgsTy{Ptr, C, N, DeviceNum}; + auto *Args = new TargetMemsetArgsTy{Ptr, ByteVal, NumBytes, DeviceNum}; // omp_target_memset_async() cannot fail via a return code, so ignore the // return code of the helper function From 5e87a5708d4f7902a14893b7a5e5af38c311759b Mon Sep 17 00:00:00 2001 From: Michael Klemm Date: Tue, 10 Oct 2023 19:20:54 +0200 Subject: [PATCH 06/10] Slightly reduce code complexity, merge task helpers for memset/memcpy --- openmp/libomptarget/src/api.cpp | 72 +++++++++------------------------ 1 file changed, 19 insertions(+), 53 deletions(-) diff --git a/openmp/libomptarget/src/api.cpp b/openmp/libomptarget/src/api.cpp index c8e74ee89bf142..ff51c73b58da16 100644 --- a/openmp/libomptarget/src/api.cpp +++ b/openmp/libomptarget/src/api.cpp @@ -210,7 +210,7 @@ EXTERN int omp_target_memcpy(void *Dst, const void *Src, size_t Length, } // The helper function that calls omp_target_memcpy or omp_target_memcpy_rect -static int libomp_target_memcpy_async_helper(kmp_int32 Gtid, kmp_task_t *Task) { +static int libomp_target_memcpy_async_task(kmp_int32 Gtid, kmp_task_t *Task) { if (Task == nullptr) return OFFLOAD_FAIL; @@ -241,7 +241,7 @@ static int libomp_target_memcpy_async_helper(kmp_int32 Gtid, kmp_task_t *Task) { return Rc; } -static int libomp_target_memset_async_helper(kmp_int32 Gtid, kmp_task_t *Task) { +static int libomp_target_memset_async_task(kmp_int32 Gtid, kmp_task_t *Task) { if (!Task) return OFFLOAD_FAIL; @@ -267,12 +267,12 @@ ConvertDepObjVector(llvm::SmallVector &Vec, int DepObjCount, } } -static int libomp_helper_memset_task_creation(TargetMemsetArgsTy *Args, - int DepObjCount, - omp_depend_t *DepObjList) { +template +static inline int +libomp_helper_task_creation(T *Args, int (*Fn)(kmp_int32, kmp_task_t *), + int DepObjCount, omp_depend_t *DepObjList) { // Create global thread ID int Gtid = __kmpc_global_thread_num(nullptr); - int (*Fn)(kmp_int32, kmp_task_t *) = &libomp_target_memset_async_helper; // Setup the hidden helper flags kmp_int32 Flags = 0; @@ -301,7 +301,8 @@ static int libomp_helper_memset_task_creation(TargetMemsetArgsTy *Args, return Rc; } -EXTERN void *omp_target_memset(void *Ptr, int ByteVal, size_t NumBytes, int DeviceNum) { +EXTERN void *omp_target_memset(void *Ptr, int ByteVal, size_t NumBytes, + int DeviceNum) { TIMESCOPE(); DP("Call to omp_target_memset, device %d, device pointer %p, size %zu\n", DeviceNum, Ptr, NumBytes); @@ -325,7 +326,8 @@ EXTERN void *omp_target_memset(void *Ptr, int ByteVal, size_t NumBytes, int Devi int InitialDevice = omp_get_initial_device(); void *Shadow = omp_target_alloc(NumBytes, InitialDevice); (void)memset(Shadow, ByteVal, NumBytes); - (void)omp_target_memcpy(Ptr, Shadow, NumBytes, 0, 0, DeviceNum, InitialDevice); + (void)omp_target_memcpy(Ptr, Shadow, NumBytes, 0, 0, DeviceNum, + InitialDevice); (void)omp_target_free(Shadow, InitialDevice); } @@ -333,66 +335,28 @@ EXTERN void *omp_target_memset(void *Ptr, int ByteVal, size_t NumBytes, int Devi return Ptr; } -EXTERN void *omp_target_memset_async(void *Ptr, int ByteVal, size_t NumBytes, int DeviceNum, - int DepObjCount, +EXTERN void *omp_target_memset_async(void *Ptr, int ByteVal, size_t NumBytes, + int DeviceNum, int DepObjCount, omp_depend_t *DepObjList) { DP("Call to omp_target_memset_async, device %d, device pointer %p, size %zu", DeviceNum, Ptr, NumBytes); // Behave as a no-op if N==0 or if Ptr is nullptr (as a useful implementation // of unspecified behavior, see OpenMP spec). - if (!Ptr || NumBytes == 0) { + if (!Ptr || NumBytes == 0) return Ptr; - } // Create the task object to deal with the async invocation auto *Args = new TargetMemsetArgsTy{Ptr, ByteVal, NumBytes, DeviceNum}; // omp_target_memset_async() cannot fail via a return code, so ignore the // return code of the helper function - (void)libomp_helper_memset_task_creation(Args, DepObjCount, DepObjList); + (void)libomp_helper_task_creation(Args, &libomp_target_memset_async_task, + DepObjCount, DepObjList); return Ptr; } -// Allocate and launch helper task -static int libomp_helper_memcpy_task_creation(TargetMemcpyArgsTy *Args, - int DepObjCount, - omp_depend_t *DepObjList) { - // Create global thread ID - int Gtid = __kmpc_global_thread_num(nullptr); - int (*Fn)(kmp_int32, kmp_task_t *) = &libomp_target_memcpy_async_helper; - - // Setup the hidden helper flags; - kmp_int32 Flags = 0; - kmp_tasking_flags_t *InputFlags = (kmp_tasking_flags_t *)&Flags; - InputFlags->hidden_helper = 1; - - // Alloc helper task - kmp_task_t *Ptr = __kmpc_omp_target_task_alloc(nullptr, Gtid, Flags, - sizeof(kmp_task_t), 0, Fn, -1); - - if (Ptr == nullptr) { - // Task allocation failed, delete the argument object - delete Args; - - return OFFLOAD_FAIL; - } - - // Setup the arguments passed to helper task - Ptr->shareds = Args; - - // Convert the type of depend objects - llvm::SmallVector DepObjs; - ConvertDepObjVector(DepObjs, DepObjCount, DepObjList); - - // Launch the helper task - int Rc = __kmpc_omp_task_with_deps(nullptr, Gtid, Ptr, DepObjCount, - DepObjs.data(), 0, nullptr); - - return Rc; -} - EXTERN int omp_target_memcpy_async(void *Dst, const void *Src, size_t Length, size_t DstOffset, size_t SrcOffset, int DstDevice, int SrcDevice, @@ -413,7 +377,8 @@ EXTERN int omp_target_memcpy_async(void *Dst, const void *Src, size_t Length, Dst, Src, Length, DstOffset, SrcOffset, DstDevice, SrcDevice); // Create and launch helper task - int Rc = libomp_helper_memcpy_task_creation(Args, DepObjCount, DepObjList); + int Rc = libomp_helper_task_creation(Args, &libomp_target_memcpy_async_task, + DepObjCount, DepObjList); DP("omp_target_memcpy_async returns %d\n", Rc); return Rc; @@ -510,7 +475,8 @@ EXTERN int omp_target_memcpy_rect_async( DstDimensions, SrcDimensions, DstDevice, SrcDevice); // Create and launch helper task - int Rc = libomp_helper_memcpy_task_creation(Args, DepObjCount, DepObjList); + int Rc = libomp_helper_task_creation(Args, &libomp_target_memcpy_async_task, + DepObjCount, DepObjList); DP("omp_target_memcpy_rect_async returns %d\n", Rc); return Rc; From 012952ada089b623a7d361a00f74f787481ee9b0 Mon Sep 17 00:00:00 2001 From: Michael Klemm Date: Wed, 11 Oct 2023 16:43:57 +0200 Subject: [PATCH 07/10] Add a safety net against catastrophic failures of omp_target_memset --- openmp/libomptarget/src/api.cpp | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/openmp/libomptarget/src/api.cpp b/openmp/libomptarget/src/api.cpp index ff51c73b58da16..0e989375cbe45b 100644 --- a/openmp/libomptarget/src/api.cpp +++ b/openmp/libomptarget/src/api.cpp @@ -325,10 +325,18 @@ EXTERN void *omp_target_memset(void *Ptr, int ByteVal, size_t NumBytes, // it to the GPU device. int InitialDevice = omp_get_initial_device(); void *Shadow = omp_target_alloc(NumBytes, InitialDevice); - (void)memset(Shadow, ByteVal, NumBytes); - (void)omp_target_memcpy(Ptr, Shadow, NumBytes, 0, 0, DeviceNum, - InitialDevice); - (void)omp_target_free(Shadow, InitialDevice); + if (Shadow) { + (void)memset(Shadow, ByteVal, NumBytes); + (void)omp_target_memcpy(Ptr, Shadow, NumBytes, 0, 0, DeviceNum, + InitialDevice); + (void)omp_target_free(Shadow, InitialDevice); + } else { + // If the omp_target_alloc has failed, let's just not do anything. + // omp_target_memset does not have any good way to fail, so we + // simply avoid a catastrophic failure of the process for now. + DP("omp_target_memset failed to fill memory due to error with " + "omp_target_alloc"); + } } DP("omp_target_memset returns %p\n", Ptr); From 33475f49134dc43276175d77e81f3ab72817951f Mon Sep 17 00:00:00 2001 From: Michael Klemm Date: Wed, 18 Oct 2023 11:50:28 +0200 Subject: [PATCH 08/10] Improve code formatting --- openmp/libomptarget/src/api.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/openmp/libomptarget/src/api.cpp b/openmp/libomptarget/src/api.cpp index 0e989375cbe45b..6d5a91f1d262fe 100644 --- a/openmp/libomptarget/src/api.cpp +++ b/openmp/libomptarget/src/api.cpp @@ -246,9 +246,8 @@ static int libomp_target_memset_async_task(kmp_int32 Gtid, kmp_task_t *Task) { return OFFLOAD_FAIL; auto *Args = reinterpret_cast(Task->shareds); - if (!Args) { + if (!Args) return OFFLOAD_FAIL; - } // call omp_target_memset() omp_target_memset(Args->Ptr, Args->C, Args->N, Args->DeviceNum); From f2fd2802cd1cc876aff0c55c1e743ece3e382ecd Mon Sep 17 00:00:00 2001 From: Michael Klemm Date: Thu, 19 Oct 2023 01:20:13 +0200 Subject: [PATCH 09/10] Fix style --- openmp/libomptarget/src/private.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/openmp/libomptarget/src/private.h b/openmp/libomptarget/src/private.h index ec1ecd5503e14f..cf88f78696b2e6 100644 --- a/openmp/libomptarget/src/private.h +++ b/openmp/libomptarget/src/private.h @@ -255,9 +255,7 @@ struct TargetMemcpyArgsTy { }; struct TargetMemsetArgsTy { - /** - * Common attributes of a memset operation - */ + // Common attributes of a memset operation void *Ptr; int C; size_t N; From d63e703ce34a17bee5fbc2897a32fcc26e1c2676 Mon Sep 17 00:00:00 2001 From: Michael Klemm Date: Thu, 19 Oct 2023 15:04:23 +0200 Subject: [PATCH 10/10] Fix last style issue with the new code. Hopefully :-) --- openmp/libomptarget/src/api.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/openmp/libomptarget/src/api.cpp b/openmp/libomptarget/src/api.cpp index 6d5a91f1d262fe..ecef02c8a0d3d0 100644 --- a/openmp/libomptarget/src/api.cpp +++ b/openmp/libomptarget/src/api.cpp @@ -258,7 +258,7 @@ static int libomp_target_memset_async_task(kmp_int32 Gtid, kmp_task_t *Task) { } static inline void -ConvertDepObjVector(llvm::SmallVector &Vec, int DepObjCount, +convertDepObjVector(llvm::SmallVector &Vec, int DepObjCount, omp_depend_t *DepObjList) { for (int i = 0; i < DepObjCount; ++i) { omp_depend_t DepObj = DepObjList[i]; @@ -291,7 +291,7 @@ libomp_helper_task_creation(T *Args, int (*Fn)(kmp_int32, kmp_task_t *), // Convert types of depend objects llvm::SmallVector DepObjs; - ConvertDepObjVector(DepObjs, DepObjCount, DepObjList); + convertDepObjVector(DepObjs, DepObjCount, DepObjList); // Launch the helper task int Rc = __kmpc_omp_task_with_deps(nullptr, Gtid, Task, DepObjCount,