-
Notifications
You must be signed in to change notification settings - Fork 12.4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[libomptarget][OpenMP] Initial implementation of omp_target_memset() and omp_target_memset_async() #68706
Conversation
There is a TODO to implement a fast path that uses an on-device kernel instead of the host-based memory fill operation. This may require some additional plumbing to have kernels in libomptarget.so.
@llvm/pr-subscribers-openmp Author: Michael Klemm (mjklemm) Changes: This PR adds the newly landed OpenMP API routines omp_target_memset() and omp_target_memset_async() to fill memory in the device memory. This first implementation is based on a slow path that initializes memory on the host and then issues an H2D transfer to fill the memory on the target device. A future PR will introduce an offloaded kernel (or a driver-side API) to fill the memory on the target device. Full diff: https://github.com/llvm/llvm-project/pull/68706.diff 10 Files Affected:
diff --git a/openmp/libomptarget/include/omptarget.h b/openmp/libomptarget/include/omptarget.h
index f87557a69eff272..e1f0f77849fa206 100644
--- a/openmp/libomptarget/include/omptarget.h
+++ b/openmp/libomptarget/include/omptarget.h
@@ -312,6 +312,7 @@ int omp_target_memcpy_rect(void *Dst, const void *Src, size_t ElementSize,
const size_t *DstDimensions,
const size_t *SrcDimensions, int DstDevice,
int SrcDevice);
+void *omp_target_memset(void *Ptr, int C, size_t N, int DeviceNum);
int omp_target_associate_ptr(const void *HostPtr, const void *DevicePtr,
size_t Size, size_t DeviceOffset, int DeviceNum);
int omp_target_disassociate_ptr(const void *HostPtr, int DeviceNum);
diff --git a/openmp/libomptarget/src/api.cpp b/openmp/libomptarget/src/api.cpp
index 942df8fdb94d660..0b8a21652ee9b24 100644
--- a/openmp/libomptarget/src/api.cpp
+++ b/openmp/libomptarget/src/api.cpp
@@ -241,10 +241,125 @@ static int libomp_target_memcpy_async_helper(kmp_int32 Gtid, kmp_task_t *Task) {
return Rc;
}
+static int libomp_target_memset_async_helper(kmp_int32 Gtid, kmp_task_t *Task) {
+ if (!Task) {
+ return OFFLOAD_FAIL;
+ }
+
+ auto *Args = reinterpret_cast<TargetMemsetArgsTy *>(Task->shareds);
+ if (!Args) {
+ return OFFLOAD_FAIL;
+ }
+
+ // call omp_target_memset()
+ omp_target_memset(Args->Ptr, Args->C, Args->N, Args->DeviceNum);
+
+ delete Args;
+
+ return OFFLOAD_SUCCESS;
+}
+
+static inline void
+ConvertDepObjVector(llvm::SmallVector<kmp_depend_info_t> &Vec, int DepObjCount,
+ omp_depend_t *DepObjList) {
+ for (int i = 0; i < DepObjCount; ++i) {
+ omp_depend_t DepObj = DepObjList[i];
+ Vec.push_back(*((kmp_depend_info_t *)DepObj));
+ }
+}
+
+static int libomp_helper_memset_task_creation(TargetMemsetArgsTy *Args,
+ int DepObjCount,
+ omp_depend_t *DepObjList) {
+ // Create global thread ID
+ int Gtid = __kmpc_global_thread_num(nullptr);
+ int (*Fn)(kmp_int32, kmp_task_t *) = &libomp_target_memset_async_helper;
+
+ // Setup the hidden helper flags
+ kmp_int32 Flags = 0;
+ kmp_tasking_flags_t *InputFlags = (kmp_tasking_flags_t *)&Flags;
+ InputFlags->hidden_helper = 1;
+
+ // Alloc the helper task
+ kmp_task_t *Task = __kmpc_omp_target_task_alloc(
+ nullptr, Gtid, Flags, sizeof(kmp_task_t), 0, Fn, -1);
+ if (!Task) {
+ delete Args;
+ return OFFLOAD_FAIL;
+ }
+
+ // Setup the arguments for the helper task
+ Task->shareds = Args;
+
+ // Convert types of depend objects
+ llvm::SmallVector<kmp_depend_info_t> DepObjs;
+ ConvertDepObjVector(DepObjs, DepObjCount, DepObjList);
+
+ // Launch the helper task
+ int Rc = __kmpc_omp_task_with_deps(nullptr, Gtid, Task, DepObjCount,
+ DepObjs.data(), 0, nullptr);
+
+ return Rc;
+}
+
+EXTERN void *omp_target_memset(void *Ptr, int C, size_t N, int DeviceNum) {
+ TIMESCOPE();
+ DP("Call to omp_target_memset, device %d, device pointer %p, size %zu\n",
+ DeviceNum, Ptr, N);
+
+ // Behave as a no-op if N==0 or if Ptr is nullptr (as a useful implementation
+ // of unspecified behavior, see OpenMP spec).
+ if (!Ptr || N == 0) {
+ return Ptr;
+ }
+
+ if (DeviceNum == omp_get_initial_device()) {
+ DP("filling memory on host via memset");
+ memset(Ptr, C, N); // ignore return value, memset() cannot fail
+ } else {
+ // TODO: replace the omp_target_memset() slow path with the fast path.
+ // That will require the ability to execute a kernel from within
+ // libomptarget.so (which we do not have at the moment).
+
+ // This is a very slow path: create a filled array on the host and upload
+ // it to the GPU device.
+ int InitialDevice = omp_get_initial_device();
+ void *Shadow = omp_target_alloc(N, InitialDevice);
+ (void)memset(Shadow, C, N);
+ (void)omp_target_memcpy(Ptr, Shadow, N, 0, 0, DeviceNum, InitialDevice);
+ (void)omp_target_free(Shadow, InitialDevice);
+ }
+
+ DP("omp_target_memset returns %p\n", Ptr);
+ return Ptr;
+}
+
+EXTERN void *omp_target_memset_async(void *Ptr, int C, size_t N, int DeviceNum,
+ int DepObjCount,
+ omp_depend_t *DepObjList) {
+ DP("Call to omp_target_memset_async, device %d, device pointer %p, size %zu",
+ DeviceNum, Ptr, N);
+
+ // Behave as a no-op if N==0 or if Ptr is nullptr (as a useful implementation
+ // of unspecified behavior, see OpenMP spec).
+ if (!Ptr || N == 0) {
+ return Ptr;
+ }
+
+ // Create the task object to deal with the async invocation
+ auto *Args = new TargetMemsetArgsTy{Ptr, C, N, DeviceNum};
+
+ // omp_target_memset_async() cannot fail via a return code, so ignore the
+ // return code of the helper function
+ (void)libomp_helper_memset_task_creation(Args, DepObjCount, DepObjList);
+
+ return Ptr;
+}
+
// Allocate and launch helper task
-static int libomp_helper_task_creation(TargetMemcpyArgsTy *Args,
- int DepObjCount,
- omp_depend_t *DepObjList) {
+static int libomp_helper_memcpy_task_creation(TargetMemcpyArgsTy *Args,
+ int DepObjCount,
+ omp_depend_t *DepObjList) {
// Create global thread ID
int Gtid = __kmpc_global_thread_num(nullptr);
int (*Fn)(kmp_int32, kmp_task_t *) = &libomp_target_memcpy_async_helper;
@@ -270,10 +385,7 @@ static int libomp_helper_task_creation(TargetMemcpyArgsTy *Args,
// Convert the type of depend objects
llvm::SmallVector<kmp_depend_info_t> DepObjs;
- for (int i = 0; i < DepObjCount; i++) {
- omp_depend_t DepObj = DepObjList[i];
- DepObjs.push_back(*((kmp_depend_info_t *)DepObj));
- }
+ ConvertDepObjVector(DepObjs, DepObjCount, DepObjList);
// Launch the helper task
int Rc = __kmpc_omp_task_with_deps(nullptr, Gtid, Ptr, DepObjCount,
@@ -302,7 +414,7 @@ EXTERN int omp_target_memcpy_async(void *Dst, const void *Src, size_t Length,
Dst, Src, Length, DstOffset, SrcOffset, DstDevice, SrcDevice);
// Create and launch helper task
- int Rc = libomp_helper_task_creation(Args, DepObjCount, DepObjList);
+ int Rc = libomp_helper_memcpy_task_creation(Args, DepObjCount, DepObjList);
DP("omp_target_memcpy_async returns %d\n", Rc);
return Rc;
@@ -399,7 +511,7 @@ EXTERN int omp_target_memcpy_rect_async(
DstDimensions, SrcDimensions, DstDevice, SrcDevice);
// Create and launch helper task
- int Rc = libomp_helper_task_creation(Args, DepObjCount, DepObjList);
+ int Rc = libomp_helper_memcpy_task_creation(Args, DepObjCount, DepObjList);
DP("omp_target_memcpy_rect_async returns %d\n", Rc);
return Rc;
diff --git a/openmp/libomptarget/src/exports b/openmp/libomptarget/src/exports
index c29c8d03fb1276f..af882a264264725 100644
--- a/openmp/libomptarget/src/exports
+++ b/openmp/libomptarget/src/exports
@@ -44,6 +44,8 @@ VERS1.0 {
omp_target_memcpy_rect;
omp_target_memcpy_async;
omp_target_memcpy_rect_async;
+ omp_target_memset;
+ omp_target_memset_async;
omp_target_associate_ptr;
omp_target_disassociate_ptr;
llvm_omp_target_alloc_host;
diff --git a/openmp/libomptarget/src/private.h b/openmp/libomptarget/src/private.h
index cbce15b63a3eba2..ec1ecd5503e14fb 100644
--- a/openmp/libomptarget/src/private.h
+++ b/openmp/libomptarget/src/private.h
@@ -253,6 +253,19 @@ struct TargetMemcpyArgsTy {
DstOffsets(DstOffsets), SrcOffsets(SrcOffsets),
DstDimensions(DstDimensions), SrcDimensions(SrcDimensions){};
};
+
+struct TargetMemsetArgsTy {
+ /**
+ * Common attributes of a memset operation
+ */
+ void *Ptr;
+ int C;
+ size_t N;
+ int DeviceNum;
+
+ // no constructors defined, because this is a PoD
+};
+
// Invalid GTID as defined by libomp; keep in sync
#define KMP_GTID_DNE (-2)
#ifdef __cplusplus
diff --git a/openmp/libomptarget/test/api/omp_target_memset.c b/openmp/libomptarget/test/api/omp_target_memset.c
new file mode 100644
index 000000000000000..ea0c6d73fbdeefb
--- /dev/null
+++ b/openmp/libomptarget/test/api/omp_target_memset.c
@@ -0,0 +1,45 @@
+// RUN: %libomptarget-compile-and-run-generic
+
+#include "stdio.h"
+#include <omp.h>
+#include <stdlib.h>
+
+int main() {
+ int d = omp_get_default_device();
+ int id = omp_get_initial_device();
+ int q[128], i;
+ void *p;
+ void *result;
+
+ if (d < 0 || d >= omp_get_num_devices())
+ d = id;
+
+ p = omp_target_alloc(130 * sizeof(int), d);
+ if (p == NULL)
+ return 0;
+
+ for (i = 0; i < 128; i++)
+ q[i] = i;
+
+ result = omp_target_memset(p, 0, 130 * sizeof(int), d);
+ if (result != p) {
+ abort();
+ }
+
+ int q2[128];
+ for (i = 0; i < 128; ++i)
+ q2[i] = i;
+ if (omp_target_memcpy_async(q2, p, 128 * sizeof(int), 0, sizeof(int), id, d,
+ 0, NULL))
+ abort();
+
+#pragma omp taskwait
+
+ for (i = 0; i < 128; ++i)
+ if (q2[i] != 0)
+ abort();
+
+ omp_target_free(p, d);
+
+ return 0;
+}
diff --git a/openmp/runtime/src/dllexports b/openmp/runtime/src/dllexports
index e69cf6670e81489..0d49643709e0a05 100644
--- a/openmp/runtime/src/dllexports
+++ b/openmp/runtime/src/dllexports
@@ -518,6 +518,8 @@ kmp_set_warnings_off 780
omp_target_memcpy_rect 887
omp_target_associate_ptr 888
omp_target_disassociate_ptr 889
+ omp_target_memset 3000
+ omp_target_memset_async 3001
%endif
kmp_set_disp_num_buffers 890
diff --git a/openmp/runtime/src/include/omp.h.var b/openmp/runtime/src/include/omp.h.var
index 1b2c467a2a12d8a..f372402be37d39c 100644
--- a/openmp/runtime/src/include/omp.h.var
+++ b/openmp/runtime/src/include/omp.h.var
@@ -236,6 +236,11 @@
extern int __KAI_KMPC_CONVENTION omp_target_memcpy_rect_async(void *, const void *, size_t, int, const size_t *,
const size_t *, const size_t *, const size_t *, const size_t *, int, int,
int, omp_depend_t *);
+
+ /* OpenMP 6.0 device memory routines */
+ extern void * __KAI_KMPC_CONVENTION omp_target_memset(void *, int, size_t, int);
+ extern void * __KAI_KMPC_CONVENTION omp_target_memset_async(void *, int, size_t, int, int, omp_depend_t *);
+
/*!
* The `omp_get_mapped_ptr` routine returns the device pointer that is associated with a host pointer for a given device.
*/
diff --git a/openmp/runtime/src/include/omp_lib.f90.var b/openmp/runtime/src/include/omp_lib.f90.var
index c72287422809aef..1ca542db3767ef2 100644
--- a/openmp/runtime/src/include/omp_lib.f90.var
+++ b/openmp/runtime/src/include/omp_lib.f90.var
@@ -635,6 +635,28 @@
integer (omp_depend_kind), optional :: depobj_list(*)
end function omp_target_memcpy_rect_async
+ function omp_target_memset(ptr, val, count, device_num) bind(c)
+ use, intrinsic :: iso_c_binding, only : c_ptr, c_int, c_size_t
+ type(c_ptr) :: omp_target_memset
+ type(c_ptr), value :: ptr
+ integer(c_int), value :: val
+ integer(c_size_t), value :: count
+ integer(c_int), value :: device_num
+ end function
+
+ function omp_target_memset_async(ptr, val, count, device_num, &
+ depobj_count, depobj_list) bind(c)
+ use, intrinsic :: iso_c_binding, only : c_ptr, c_int, c_size_t
+ use omp_lib_kinds
+ type(c_ptr) :: omp_target_memset_async
+ type(c_ptr), value :: ptr
+ integer(c_int), value :: val
+ integer(c_size_t), value :: count
+ integer(c_int), value :: device_num
+ integer(c_int), value :: depobj_count
+ integer(omp_depend_kind), optional :: depobj_list(*)
+ end function
+
function omp_target_associate_ptr(host_ptr, device_ptr, size, &
device_offset, device_num) bind(c)
use omp_lib_kinds
diff --git a/openmp/runtime/src/include/omp_lib.h.var b/openmp/runtime/src/include/omp_lib.h.var
index 9f5e58515e75159..d20aade6ef8b327 100644
--- a/openmp/runtime/src/include/omp_lib.h.var
+++ b/openmp/runtime/src/include/omp_lib.h.var
@@ -732,6 +732,28 @@
integer(omp_depend_kind), optional :: depobj_list(*)
end function omp_target_memcpy_rect_async
+ function omp_target_memset(ptr, val, count, device_num) bind(c)
+ use, intrinsic :: iso_c_binding, only : c_ptr, c_int, c_size_t
+ type(c_ptr) :: omp_target_memset
+ type(c_ptr), value :: ptr
+ integer(c_int), value :: val
+ integer(c_size_t), value :: count
+ integer(c_int), value :: device_num
+ end function
+
+ function omp_target_memset_async(ptr, val, count, device_num, &
+ depobj_count, depobj_list) bind(c)
+ use, intrinsic :: iso_c_binding, only : c_ptr, c_int, c_size_t
+ use omp_lib_kinds
+ type(c_ptr) :: omp_target_memset_async
+ type(c_ptr), value :: ptr
+ integer(c_int), value :: val
+ integer(c_size_t), value :: count
+ integer(c_int), value :: device_num
+ integer(c_int), value :: depobj_count
+ integer(omp_depend_kind), optional :: depobj_list(*)
+ end function
+
function omp_target_associate_ptr(host_ptr, device_ptr, size, &
& device_offset, device_num) bind(c)
use, intrinsic :: iso_c_binding, only : c_ptr, c_size_t, c_int
diff --git a/openmp/runtime/src/kmp_ftn_os.h b/openmp/runtime/src/kmp_ftn_os.h
index d37c9c86028eb13..7d595b947f4a9de 100644
--- a/openmp/runtime/src/kmp_ftn_os.h
+++ b/openmp/runtime/src/kmp_ftn_os.h
@@ -116,6 +116,8 @@
#define FTN_TARGET_IS_PRESENT omp_target_is_present
#define FTN_TARGET_MEMCPY omp_target_memcpy
#define FTN_TARGET_MEMCPY_RECT omp_target_memcpy_rect
+#define FTN_TARGET_MEMSET omp_target_memset
+#define FTN_TARGET_MEMSET_ASYNC omp_target_memset_async
#define FTN_TARGET_ASSOCIATE_PTR omp_target_associate_ptr
#define FTN_TARGET_DISASSOCIATE_PTR omp_target_disassociate_ptr
#endif
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The rest of the patch looks good to me, though I'm not sure why I still see some old version of the code.
I have pushed the latest patch set now. Does it look better now? |
If you feel that this is good to go, please merge (I cannot, as I have not yet earned the power to do so) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM with one nit
openmp/libomptarget/src/api.cpp
Outdated
} | ||
|
||
static inline void | ||
ConvertDepObjVector(llvm::SmallVector<kmp_depend_info_t> &Vec, int DepObjCount, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Shouldn't this function name be lower case?
As mentioned in llvm#68706, it is useful to be able to call kernels from the runtime, e.g., to perform memset. This patch provides a kernel library that can be invoked from the offload runtime directly.
As mentioned in llvm#68706, it is useful to be able to call kernels from the runtime, e.g., to perform memset. This patch provides a kernel library that can be invoked from the offload runtime directly.
As mentioned in llvm#68706, it is useful to be able to call kernels from the runtime, e.g., to perform memset. This patch provides a kernel library that can be invoked from the offload runtime directly.
As mentioned in llvm#68706, it is useful to be able to call kernels from the runtime, e.g., to perform memset. This patch provides a kernel library that can be invoked from the offload runtime directly.
As mentioned in llvm#68706, it is useful to be able to call kernels from the runtime, e.g., to perform memset. This patch provides a kernel library that can be invoked from the offload runtime directly.
As mentioned in llvm#68706, it is useful to be able to call kernels from the runtime, e.g., to perform memset. This patch provides a kernel library that can be invoked from the offload runtime directly.
This PR adds the newly landed OpenMP API routines omp_target_memset() and omp_target_memset_async() to fill memory in the device memory. This first implementation is based on a slow path that initializes memory on the host and then issues an H2D transfer to fill the memory on the target device. A future PR will introduce an offloaded kernel (or a driver-side API) to fill the memory on the target device.