From ba80b5ee426e7ca6603ae92fe442716730e32544 Mon Sep 17 00:00:00 2001 From: Siva Rama Krishna Reddy B Date: Fri, 16 Dec 2022 14:11:47 +0530 Subject: [PATCH 1/2] [RUNTIME][OPENCL] OpenCL host pointer support to acheive zero copy OpenCL supports device memory access to host by memory mapping. OpenCL flag "CL_MEM_ALLOC_HOST_PTR" enable this while creating a memory object. We enable this feature via compilation setting "USE_OPENCL_ENABLE_HOST_PTR" followed by a new API "GetNativePtr" on OpenCLWorkSpace. This allows application directly use hardware allocated memory while preparing the input. From user side we allocate NDArray which same size as graph input, access native memory and finally call set_input_zero_copy to set the input. Psudo code looks like auto narr = tvm::runtime::NDArray::Empty(shape, {kDLFloat, 32, 1}, {kDLOpenCL, 0}); OpenCLWorkspace* workspace = OpenCLWorkspace::Global(); void *nptr = workspace->GetNativePtr(narr); ... access memory pointed by nptr up to the tensor size ... tvm::runtime::PackedFunc set_input = mod.GetFunction("set_input_zero_copy"); set_input(i, narr); --- CMakeLists.txt | 1 + cmake/config.cmake | 5 +++ cmake/modules/LibInfo.cmake | 1 + cmake/modules/OpenCL.cmake | 3 ++ cmake/modules/contrib/CLML.cmake | 4 ++- src/runtime/opencl/opencl_common.h | 5 +++ src/runtime/opencl/opencl_device_api.cc | 32 +++++++++++++++-- .../opencl/opencl_wrapper/opencl_wrapper.cc | 31 ++++++++++++++++ src/support/libinfo.cc | 5 +++ tests/cpp-runtime/opencl/opencl_nativeptr.cc | 36 +++++++++++++++++++ 10 files changed, 119 insertions(+), 4 deletions(-) create mode 100644 tests/cpp-runtime/opencl/opencl_nativeptr.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index 119bf8325c8c..ab9cbb8e9c66 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -26,6 +26,7 @@ endif() # Alernatively, use cmake -DOPTION=VALUE through command-line. tvm_option(USE_CUDA "Build with CUDA" OFF) tvm_option(USE_OPENCL "Build with OpenCL" OFF) +tvm_option(USE_OPENCL_ENABLE_HOST_PTR "Enable OpenCL memory object access to host" OFF) tvm_option(USE_OPENCL_GTEST "Path to OpenCL specific gtest version for runtime cpp tests." /path/to/opencl/gtest) tvm_option(USE_VULKAN "Build with Vulkan" OFF) diff --git a/cmake/config.cmake b/cmake/config.cmake index 679f5c459e87..e26f909ea277 100644 --- a/cmake/config.cmake +++ b/cmake/config.cmake @@ -71,6 +71,11 @@ set(USE_AOCL OFF) # - /path/to/opencl-sdk: use specific path to opencl-sdk set(USE_OPENCL OFF) +# Wheather to allow OPENCL cl_mem access to host +# cl_mem will be allocated with CL_MEM_ALLOC_HOST_PTR +# OpenCLWorkspace->GetHostPtr API returns the host accessible pointer +set(USE_OPENCL_ENABLE_HOST_PTR OFF) + # Whether enable Metal runtime set(USE_METAL OFF) diff --git a/cmake/modules/LibInfo.cmake b/cmake/modules/LibInfo.cmake index 7c24088c0ad2..042fa3c6ddd7 100644 --- a/cmake/modules/LibInfo.cmake +++ b/cmake/modules/LibInfo.cmake @@ -89,6 +89,7 @@ function(add_lib_info src_file) TVM_INFO_USE_MSVC_MT="${USE_MSVC_MT}" TVM_INFO_USE_NNPACK="${USE_NNPACK}" TVM_INFO_USE_OPENCL="${USE_OPENCL}" + TVM_INFO_USE_OPENCL_ENABLE_HOST_PTR="${USE_OPENCL_ENABLE_HOST_PTR}" TVM_INFO_USE_OPENCL_GTEST="${USE_OPENCL_GTEST}" TVM_INFO_USE_OPENMP="${USE_OPENMP}" TVM_INFO_USE_PAPI="${USE_PAPI}" diff --git a/cmake/modules/OpenCL.cmake b/cmake/modules/OpenCL.cmake index 1e1041efe386..ced2da2d17e3 100644 --- a/cmake/modules/OpenCL.cmake +++ b/cmake/modules/OpenCL.cmake @@ -74,6 +74,9 @@ if(USE_OPENCL) target_link_libraries(opencl-cpptest PRIVATE gtest_main tvm_runtime) endif() list(APPEND RUNTIME_SRCS ${RUNTIME_OPENCL_SRCS}) + if(USE_OPENCL_ENABLE_HOST_PTR) + add_definitions(-DOPENCL_ENABLE_HOST_PTR) + endif(USE_OPENCL_ENABLE_HOST_PTR) else() list(APPEND COMPILER_SRCS src/target/opt/build_opencl_off.cc) endif(USE_OPENCL) diff --git a/cmake/modules/contrib/CLML.cmake b/cmake/modules/contrib/CLML.cmake index 30e60423b03b..e86a7e1ae032 100644 --- a/cmake/modules/contrib/CLML.cmake +++ b/cmake/modules/contrib/CLML.cmake @@ -54,5 +54,7 @@ if(USE_CLML_GRAPH_EXECUTOR) file(GLOB RUNTIME_OPENCL_SRCS src/runtime/opencl/*.cc) list(APPEND RUNTIME_SRCS ${RUNTIME_OPENCL_SRCS}) set(USE_OPENCL ON) - + if(USE_OPENCL_ENABLE_HOST_PTR) + add_definitions(-DOPENCL_ENABLE_HOST_PTR) + endif(USE_OPENCL_ENABLE_HOST_PTR) endif() diff --git a/src/runtime/opencl/opencl_common.h b/src/runtime/opencl/opencl_common.h index f0a68864d724..ba321cd5fb33 100644 --- a/src/runtime/opencl/opencl_common.h +++ b/src/runtime/opencl/opencl_common.h @@ -212,6 +212,7 @@ inline cl_channel_type DTypeToOpenCLChannelType(DLDataType data_type) { } class OpenCLThreadEntry; +struct BufferDescriptor; /*! * \brief Process global OpenCL workspace. @@ -290,6 +291,7 @@ class OpenCLWorkspace : public DeviceAPI { void* AllocDataSpace(Device dev, size_t size, size_t alignment, DLDataType type_hint) final; void* AllocDataSpace(Device dev, int ndim, const int64_t* shape, DLDataType dtype, Optional mem_scope = NullOpt) final; + void* GetNativePtr(tvm::runtime::NDArray &narr); void FreeDataSpace(Device dev, void* ptr) final; void StreamSync(Device dev, TVMStreamHandle stream) final; void* AllocWorkspace(Device dev, size_t size, DLDataType type_hint) final; @@ -310,6 +312,8 @@ class OpenCLWorkspace : public DeviceAPI { void CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) final; + void* CreateHostPtrIfEnabled(BufferDescriptor *desc, Device dev, size_t size); + private: std::string GetError() { if (this->devices.size() == 0) return noDevicesErrorMsg; @@ -377,6 +381,7 @@ struct BufferDescriptor { static String ScopeFromMemoryLayout(MemoryLayout mem_scope); cl_mem buffer{nullptr}; + cl_uchar* host_ptr{nullptr}; MemoryLayout layout{MemoryLayout::kBuffer1D}; }; } // namespace cl diff --git a/src/runtime/opencl/opencl_device_api.cc b/src/runtime/opencl/opencl_device_api.cc index 1244fddf0983..8c9e07f3155d 100644 --- a/src/runtime/opencl/opencl_device_api.cc +++ b/src/runtime/opencl/opencl_device_api.cc @@ -29,6 +29,12 @@ #include "opencl_common.h" +#ifdef OPENCL_ENABLE_HOST_PTR +#define CL_MEM_CREATE_FLAGS CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR +#else +#define CL_MEM_CREATE_FLAGS CL_MEM_READ_WRITE +#endif + namespace tvm { namespace runtime { namespace cl { @@ -191,6 +197,17 @@ void OpenCLWorkspace::GetAttr(Device dev, DeviceAttrKind kind, TVMRetValue* rv) } } +void* OpenCLWorkspace::CreateHostPtrIfEnabled(cl::BufferDescriptor* desc, Device dev, size_t size) { +#if defined(OPENCL_ENABLE_HOST_PTR) + cl_int err_code; + desc->host_ptr = reinterpret_cast( + clEnqueueMapBuffer(this->GetQueue(dev), desc->buffer, CL_TRUE, CL_MAP_WRITE, 0, + sizeof(cl_uchar) * size, 0, NULL, NULL, &err_code)); + OPENCL_CHECK_ERROR(err_code); +#endif // OPENCL_ENABLE_HOST_PTR + return desc; +} + void* OpenCLWorkspace::AllocDataSpace(Device dev, size_t size, size_t alignment, DLDataType type_hint) { this->Init(); @@ -201,10 +218,10 @@ void* OpenCLWorkspace::AllocDataSpace(Device dev, size_t size, size_t alignment, if (size == 0) { size = 1; } - desc->buffer = clCreateBuffer(this->context, CL_MEM_READ_WRITE, size, nullptr, &err_code); + desc->buffer = clCreateBuffer(this->context, CL_MEM_CREATE_FLAGS, size, nullptr, &err_code); desc->layout = cl::BufferDescriptor::MemoryLayout::kBuffer1D; OPENCL_CHECK_ERROR(err_code); - return desc; + return CreateHostPtrIfEnabled(desc, dev, size); } void* OpenCLWorkspace::AllocDataSpace(Device dev, int ndim, const int64_t* shape, DLDataType dtype, @@ -226,12 +243,21 @@ void* OpenCLWorkspace::AllocDataSpace(Device dev, int ndim, const int64_t* shape return desc; } +void* OpenCLWorkspace::GetNativePtr(tvm::runtime::NDArray &narr) { + cl::BufferDescriptor* desc = static_cast(narr.operator->()->data); + return desc->host_ptr; +} + void OpenCLWorkspace::FreeDataSpace(Device dev, void* ptr) { // We have to make sure that the memory object is not in the command queue // for some OpenCL platforms. OPENCL_CALL(clFinish(this->GetQueue(dev))); cl::BufferDescriptor* desc = static_cast(ptr); + if (desc->host_ptr) { + clEnqueueUnmapMemObject(this->GetQueue(dev), desc->buffer, + reinterpret_cast(desc->host_ptr), 0, NULL, NULL); + } OPENCL_CALL(clReleaseMemObject(desc->buffer)); delete desc; } @@ -245,7 +271,7 @@ cl_mem OpenCLWorkspace::AllocTexture(Device dev, size_t width, size_t height, cl_image_format format = {CL_RGBA, cl_type}; cl_image_desc descriptor = {CL_MEM_OBJECT_IMAGE2D, width, height, 0, 0, 0, 0, 0, 0}; cl_mem mptr = - clCreateImage(this->context, CL_MEM_READ_WRITE, &format, &descriptor, nullptr, &err_code); + clCreateImage(this->context, CL_MEM_CREATE_FLAGS, &format, &descriptor, nullptr, &err_code); OPENCL_CHECK_ERROR(err_code); return mptr; } diff --git a/src/runtime/opencl/opencl_wrapper/opencl_wrapper.cc b/src/runtime/opencl/opencl_wrapper/opencl_wrapper.cc index c447ebcb5339..2c2768945424 100644 --- a/src/runtime/opencl/opencl_wrapper/opencl_wrapper.cc +++ b/src/runtime/opencl/opencl_wrapper/opencl_wrapper.cc @@ -173,6 +173,11 @@ using f_clEnqueueNDRangeKernel = cl_int (*)(cl_command_queue, cl_kernel, cl_uint cl_event*); using f_clCreateCommandQueue = cl_command_queue (*)(cl_context, cl_device_id, cl_command_queue_properties, cl_int*); +using f_clEnqueueUnmapMemObject = cl_int (*)(cl_command_queue, cl_mem, void*, cl_uint, + const cl_event*, cl_event*); +using f_clEnqueueMapBuffer = void* (*)(cl_command_queue, cl_mem, cl_bool, cl_map_flags, size_t, + size_t, cl_uint, const cl_event*, cl_event*, cl_int*); + } // namespace cl_int clGetPlatformIDs(cl_uint num_entries, cl_platform_id* platforms, cl_uint* num_platforms) { @@ -572,3 +577,29 @@ cl_command_queue clCreateCommandQueue(cl_context context, cl_device_id device, return nullptr; } } + +cl_int clEnqueueUnmapMemObject(cl_command_queue queue, cl_mem memobj, void* mapped_ptr, + cl_uint num_events_in_wait_list, const cl_event* event_wait_list, + cl_event* event) { + auto& lib = LibOpenCLWrapper::getInstance(); + auto func = (f_clEnqueueUnmapMemObject)lib.getOpenCLFunction("clEnqueueUnmapMemObject"); + if (func) { + return func(queue, memobj, mapped_ptr, num_events_in_wait_list, event_wait_list, event); + } else { + return CL_INVALID_PLATFORM; + } +} + +void* clEnqueueMapBuffer(cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_map, + cl_map_flags map_flags, size_t offset, size_t cb, + cl_uint num_events_in_wait_list, const cl_event* event_wait_list, + cl_event* event, cl_int* errcode_ret) { + auto& lib = LibOpenCLWrapper::getInstance(); + auto func = (f_clEnqueueMapBuffer)lib.getOpenCLFunction("clEnqueueMapBuffer"); + if (func) { + return func(command_queue, buffer, blocking_map, map_flags, offset, cb, num_events_in_wait_list, + event_wait_list, event, errcode_ret); + } else { + return nullptr; + } +} diff --git a/src/support/libinfo.cc b/src/support/libinfo.cc index c0fc9881b4f5..7bb1e04920fa 100644 --- a/src/support/libinfo.cc +++ b/src/support/libinfo.cc @@ -43,6 +43,10 @@ #define TVM_INFO_USE_OPENCL "NOT-FOUND" #endif +#ifndef TVM_INFO_USE_OPENCL_ENABLE_HOST_PTR +#define TVM_INFO_USE_OPENCL_ENABLE_HOST_PTR "NOT-FOUND" +#endif + #ifndef TVM_INFO_USE_OPENCL_GTEST #define TVM_INFO_USE_OPENCL_GTEST "NOT-FOUND" #endif @@ -294,6 +298,7 @@ TVM_DLL Map GetLibInfo() { {"USE_MSVC_MT", TVM_INFO_USE_MSVC_MT}, {"USE_NNPACK", TVM_INFO_USE_NNPACK}, {"USE_OPENCL", TVM_INFO_USE_OPENCL}, + {"USE_OPENCL_ENABLE_HOST_PTR", TVM_INFO_USE_OPENCL_ENABLE_HOST_PTR}, {"USE_OPENCL_GTEST", TVM_INFO_USE_OPENCL_GTEST}, {"USE_OPENMP", TVM_INFO_USE_OPENMP}, {"USE_PAPI", TVM_INFO_USE_PAPI}, diff --git a/tests/cpp-runtime/opencl/opencl_nativeptr.cc b/tests/cpp-runtime/opencl/opencl_nativeptr.cc new file mode 100644 index 000000000000..f96cd820937c --- /dev/null +++ b/tests/cpp-runtime/opencl/opencl_nativeptr.cc @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include +#include + +#include "../src/runtime/opencl/opencl_common.h" + +using namespace tvm::runtime; +using namespace tvm::runtime::cl; + +#if defined(OPENCL_ENABLE_HOST_PTR) +TEST(OpenCLNDArray, native_ptr) { + OpenCLWorkspace* workspace = OpenCLWorkspace::Global(); + + auto A = tvm::runtime::NDArray::Empty({128, 128}, {kDLFloat, 32, 1}, {kDLOpenCL, 0}); + void *nptr = workspace->GetNativePtr(A); + memset(nptr, 0x0, 128*128*4); +} +#endif From 901b189df336944a7598071ef8ffdb7775c61ec3 Mon Sep 17 00:00:00 2001 From: Siva Rama Krishna Reddy B Date: Mon, 19 Dec 2022 10:57:26 +0530 Subject: [PATCH 2/2] * Lint error. --- src/runtime/opencl/opencl_common.h | 4 ++-- src/runtime/opencl/opencl_device_api.cc | 2 +- tests/cpp-runtime/opencl/opencl_nativeptr.cc | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/runtime/opencl/opencl_common.h b/src/runtime/opencl/opencl_common.h index ba321cd5fb33..7bbb358f8f92 100644 --- a/src/runtime/opencl/opencl_common.h +++ b/src/runtime/opencl/opencl_common.h @@ -291,7 +291,7 @@ class OpenCLWorkspace : public DeviceAPI { void* AllocDataSpace(Device dev, size_t size, size_t alignment, DLDataType type_hint) final; void* AllocDataSpace(Device dev, int ndim, const int64_t* shape, DLDataType dtype, Optional mem_scope = NullOpt) final; - void* GetNativePtr(tvm::runtime::NDArray &narr); + void* GetNativePtr(const tvm::runtime::NDArray& narr); void FreeDataSpace(Device dev, void* ptr) final; void StreamSync(Device dev, TVMStreamHandle stream) final; void* AllocWorkspace(Device dev, size_t size, DLDataType type_hint) final; @@ -312,7 +312,7 @@ class OpenCLWorkspace : public DeviceAPI { void CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) final; - void* CreateHostPtrIfEnabled(BufferDescriptor *desc, Device dev, size_t size); + void* CreateHostPtrIfEnabled(BufferDescriptor* desc, Device dev, size_t size); private: std::string GetError() { diff --git a/src/runtime/opencl/opencl_device_api.cc b/src/runtime/opencl/opencl_device_api.cc index 8c9e07f3155d..aa31d80d6e8b 100644 --- a/src/runtime/opencl/opencl_device_api.cc +++ b/src/runtime/opencl/opencl_device_api.cc @@ -243,7 +243,7 @@ void* OpenCLWorkspace::AllocDataSpace(Device dev, int ndim, const int64_t* shape return desc; } -void* OpenCLWorkspace::GetNativePtr(tvm::runtime::NDArray &narr) { +void* OpenCLWorkspace::GetNativePtr(const tvm::runtime::NDArray& narr) { cl::BufferDescriptor* desc = static_cast(narr.operator->()->data); return desc->host_ptr; } diff --git a/tests/cpp-runtime/opencl/opencl_nativeptr.cc b/tests/cpp-runtime/opencl/opencl_nativeptr.cc index f96cd820937c..ebfb62e92069 100644 --- a/tests/cpp-runtime/opencl/opencl_nativeptr.cc +++ b/tests/cpp-runtime/opencl/opencl_nativeptr.cc @@ -30,7 +30,7 @@ TEST(OpenCLNDArray, native_ptr) { OpenCLWorkspace* workspace = OpenCLWorkspace::Global(); auto A = tvm::runtime::NDArray::Empty({128, 128}, {kDLFloat, 32, 1}, {kDLOpenCL, 0}); - void *nptr = workspace->GetNativePtr(A); - memset(nptr, 0x0, 128*128*4); + void* nptr = workspace->GetNativePtr(A); + memset(nptr, 0x0, 128 * 128 * 4); } #endif