From 9f1959d3fd82fa6e5cdd1cd062bcf621905e2eea Mon Sep 17 00:00:00 2001 From: Egor Churaev Date: Fri, 27 Jan 2023 11:02:54 +0300 Subject: [PATCH 1/6] [OpenCL] Implement save/load pre-compiled programs Using pre-compiled programs might significantly improve inference time of the first run. - Added methods `SupportPreCompiledPrograms` which reports if the module supports using pre-compiled programs. - Method `GetPreCompiledPrograms` returns string with bytes of pre-compiled programs. - Method `SetPreCompiledPrograms` allows user to pass pre-compiled programs to the module. --- apps/cpp_rtvm/README.md | 12 + apps/cpp_rtvm/main.cc | 1 + apps/cpp_rtvm/tvm_runner.cc | 37 +++- apps/cpp_rtvm/tvm_runner.h | 4 + include/tvm/runtime/module.h | 21 ++ src/runtime/library_module.cc | 6 + src/runtime/module.cc | 8 + src/runtime/opencl/opencl_common.h | 3 + src/runtime/opencl/opencl_device_api.cc | 4 +- src/runtime/opencl/opencl_module.cc | 69 ++++++ .../opencl/opencl_wrapper/opencl_wrapper.cc | 12 + .../opencl/opencl_compile_to_bin.cc | 205 ++++++++++++++++++ 12 files changed, 379 insertions(+), 3 deletions(-) create mode 100644 tests/cpp-runtime/opencl/opencl_compile_to_bin.cc diff --git a/apps/cpp_rtvm/README.md b/apps/cpp_rtvm/README.md index e6961532823d..db58388b2717 100644 --- a/apps/cpp_rtvm/README.md +++ b/apps/cpp_rtvm/README.md @@ -352,3 +352,15 @@ python3 -m tvm.driver.tvmc compile --cross-compiler ${ANDROID_NDK_HOME}/toolchai python3 -m tvm.driver.tvmc run --device="cl" keras-resnet50.tar --rpc-key ${TVM_RPC_KEY} --rpc-tracker {TVM_TRACKER_HOST}:{TVM_TRACKER_PORT} --print-time ``` + +# Use pre-compiled OpenCL kernels +Using pre-compiled programs might significantly improve inference time of the +first run. E.g. for topology with ~300 kernels compilation time on Adreno was +about 26 seconds. But after dumping compiled programs to binary files and reuse +them on the next runs, the compilation time was significantly decreased (more +than 1000 times) and starts to be around 25 ms. + +To use such functionality, the developer have to set directory where the +pre-compiled programs will be stored. To the `rtvm` application such example was +added. After method `Load`, method `UsePreCompiledProgram` is called. This +method passes directory where the pre-compiled kernels should be stored. diff --git a/apps/cpp_rtvm/main.cc b/apps/cpp_rtvm/main.cc index 31019ee0c9cf..332bfee7c5b6 100644 --- a/apps/cpp_rtvm/main.cc +++ b/apps/cpp_rtvm/main.cc @@ -190,6 +190,7 @@ int ExecuteModel(ToolArgs& args) { // Load the model runner.Load(); + runner.UsePreCompiledPrograms("pre_compiled"); // Query Model meta Information TVMMetaInfo mInfo = runner.GetMetaInfo(); diff --git a/apps/cpp_rtvm/tvm_runner.cc b/apps/cpp_rtvm/tvm_runner.cc index 74498e8170c4..dd89244e65d1 100644 --- a/apps/cpp_rtvm/tvm_runner.cc +++ b/apps/cpp_rtvm/tvm_runner.cc @@ -26,7 +26,9 @@ #include +#include #include +#include #include #include @@ -67,7 +69,8 @@ int GetTVMDevice(std::string device) { * \param path where the tfm compiler artifacts present. * \param device the target device where we need to load the compiled model. */ -TVMRunner::TVMRunner(std::string path, std::string device) : r_model_path(path), r_device(device) { +TVMRunner::TVMRunner(std::string path, std::string device) + : r_model_path(path), r_device(device), r_run_was_called(false) { LOG(INFO) << "TVMRunner Constructor:" << r_model_path << " Devices:" << r_device; } @@ -110,6 +113,37 @@ int TVMRunner::Load(void) { return 0; } +/*! + * \brief Specify if the run programs should be dumped to binary and reused in the next runs. + * \param pathToDir Path to the existed directory where pre-compiled programs should be stored. + */ +void TVMRunner::UsePreCompiledPrograms(std::string pathToDir) { + if (r_run_was_called) { + LOG(INFO) << "TVMRunner UsePreCompiledPrograms: should be called before first run"; + return; + } + if (!std::filesystem::exists(pathToDir)) + ICHECK(std::filesystem::create_directories(pathToDir) == true); + std::filesystem::path binary_path = pathToDir; + for (tvm::runtime::Module mod : r_mod_handle->imports()) { + if (mod->SupportPreCompiledPrograms()) { + std::string file_name = "pre_compiled_"; + file_name += mod->type_key(); + file_name += ".bin"; + auto file_path = binary_path / file_name; + if (!std::filesystem::exists(file_path)) { + auto bytes = mod->GetPreCompiledPrograms(); + std::ofstream fs(file_path.string(), std::ofstream::binary); + fs.write(bytes.c_str(), bytes.size()); + } else { + std::ifstream ifs(file_path.string(), std::ios::in | std::ios::binary); + std::string bytes((std::istreambuf_iterator(ifs)), std::istreambuf_iterator()); + mod->SetPreCompiledPrograms(bytes); + } + } + } +} + /*! * \brief Calculated the memory size for the NDArray. * \param NDArray object. @@ -242,6 +276,7 @@ int TVMRunner::GetOutput(std::string output_id, char* raw_output) { */ int TVMRunner::Run(void) { LOG(INFO) << "TVMRunner::Run"; + r_run_was_called = true; r_graph_handle.GetFunction("run")(); return 0; diff --git a/apps/cpp_rtvm/tvm_runner.h b/apps/cpp_rtvm/tvm_runner.h index 37ba53606ee8..926e009c4c2e 100644 --- a/apps/cpp_rtvm/tvm_runner.h +++ b/apps/cpp_rtvm/tvm_runner.h @@ -56,6 +56,8 @@ class TVMRunner { /*! \brief Initiates graph runtime and with the compiled model */ int Load(void); + /*! \brief Specify if the run programs should be dumped to binary and reused in the next runs */ + void UsePreCompiledPrograms(std::string); /*! \brief Executes one inference cycle */ int Run(void); /*! \brief To set the inputs from given npz file */ @@ -86,6 +88,8 @@ class TVMRunner { std::string r_device; /*! \brief Holds meta information queried from graph runtime */ TVMMetaInfo mInfo; + /*! \brief Mark if the run method was called */ + bool r_run_was_called; }; } // namespace runtime diff --git a/include/tvm/runtime/module.h b/include/tvm/runtime/module.h index a54f98a558f3..ba1a740c7a06 100644 --- a/include/tvm/runtime/module.h +++ b/include/tvm/runtime/module.h @@ -192,6 +192,27 @@ class TVM_DLL ModuleNode : public Object { /*! \return The module it imports from */ const std::vector& imports() const { return imports_; } + /*! + * \brief Returns true if this module supports building from pre-compiled programs. + * + * The default implementation returns false. + */ + virtual bool SupportPreCompiledPrograms() const { return false; } + + /*! + * \brief Pass pre-compiled programs which module will use to speed up compilation time. + * \param bytes string with bytes of pre-compiled programs. + */ + virtual void SetPreCompiledPrograms(const std::string& bytes); + + /*! + * \brief Get a pre-compiled programs for a module. + * If there are no compiled programs then first the programs will be compiled. + * + * \return The string with bytes of pre-compiled programs. + */ + virtual std::string GetPreCompiledPrograms(); + /*! * \brief Returns true if this module is 'DSO exportable'. * diff --git a/src/runtime/library_module.cc b/src/runtime/library_module.cc index 54fd362387c5..042fc637c627 100644 --- a/src/runtime/library_module.cc +++ b/src/runtime/library_module.cc @@ -195,6 +195,12 @@ void ProcessModuleBlob(const char* mblob, ObjectPtr lib, // The module order is collected via DFS *root_module = modules[0]; } + // Add all other modules to the import of the root_module + for (size_t i = 1; i < modules.size(); ++i) { + std::string tkey = modules[i]->type_key(); + if (tkey != "_lib" && tkey != "_import_tree") + root_module->Import(modules[i]); + } } Module CreateModuleFromLibrary(ObjectPtr lib, PackedFuncWrapper packed_func_wrapper) { diff --git a/src/runtime/module.cc b/src/runtime/module.cc index 9ef57e905324..1f6a7347e2f1 100644 --- a/src/runtime/module.cc +++ b/src/runtime/module.cc @@ -128,6 +128,14 @@ const PackedFunc* ModuleNode::GetFuncFromEnv(const std::string& name) { } } +void ModuleNode::SetPreCompiledPrograms(const std::string& bytes) { + LOG(FATAL) << "Module[" << type_key() << "] does not support using pre-compiled programs"; +} + +std::string ModuleNode::GetPreCompiledPrograms() { + LOG(FATAL) << "Module[" << type_key() << "] does not support using pre-compiled programs"; +} + std::string ModuleNode::GetFormat() { LOG(FATAL) << "Module[" << type_key() << "] does not support GetFormat"; } diff --git a/src/runtime/opencl/opencl_common.h b/src/runtime/opencl/opencl_common.h index c172a0f94539..aad355d343a7 100644 --- a/src/runtime/opencl/opencl_common.h +++ b/src/runtime/opencl/opencl_common.h @@ -438,6 +438,9 @@ class OpenCLModuleNode : public ModuleNode { // install a new kernel to thread local entry cl_kernel InstallKernel(cl::OpenCLWorkspace* w, cl::OpenCLThreadEntry* t, const std::string& func_name, const KTRefEntry& e); + bool SupportPreCompiledPrograms() const final { return true; } + void SetPreCompiledPrograms(const std::string& bytes) final; + std::string GetPreCompiledPrograms() final; private: // The workspace, need to keep reference to use it in destructor. diff --git a/src/runtime/opencl/opencl_device_api.cc b/src/runtime/opencl/opencl_device_api.cc index aa31d80d6e8b..c53523267d66 100644 --- a/src/runtime/opencl/opencl_device_api.cc +++ b/src/runtime/opencl/opencl_device_api.cc @@ -202,7 +202,7 @@ void* OpenCLWorkspace::CreateHostPtrIfEnabled(cl::BufferDescriptor* desc, Device cl_int err_code; desc->host_ptr = reinterpret_cast( clEnqueueMapBuffer(this->GetQueue(dev), desc->buffer, CL_TRUE, CL_MAP_WRITE, 0, - sizeof(cl_uchar) * size, 0, NULL, NULL, &err_code)); + sizeof(cl_uchar) * size, 0, nullptr, nullptr, &err_code)); OPENCL_CHECK_ERROR(err_code); #endif // OPENCL_ENABLE_HOST_PTR return desc; @@ -256,7 +256,7 @@ void OpenCLWorkspace::FreeDataSpace(Device dev, void* ptr) { cl::BufferDescriptor* desc = static_cast(ptr); if (desc->host_ptr) { clEnqueueUnmapMemObject(this->GetQueue(dev), desc->buffer, - reinterpret_cast(desc->host_ptr), 0, NULL, NULL); + reinterpret_cast(desc->host_ptr), 0, nullptr, nullptr); } OPENCL_CALL(clReleaseMemObject(desc->buffer)); delete desc; diff --git a/src/runtime/opencl/opencl_module.cc b/src/runtime/opencl/opencl_module.cc index 2fb157aac6af..4b37a0dc7d93 100644 --- a/src/runtime/opencl/opencl_module.cc +++ b/src/runtime/opencl/opencl_module.cc @@ -25,6 +25,7 @@ #include #include +#include #include #include #include @@ -262,6 +263,74 @@ cl_kernel OpenCLModuleNode::InstallKernel(cl::OpenCLWorkspace* w, cl::OpenCLThre return kernel; } +void OpenCLModuleNode::SetPreCompiledPrograms(const std::string& bytes) { + std::string data = bytes; + dmlc::MemoryStringStream reader(&data); + dmlc::Stream* strm = &reader; + uint64_t kernels_num; + strm->Read(&kernels_num); + cl::OpenCLThreadEntry* t = workspace_->GetThreadEntry(); + int device_id = t->device.device_id; + for (size_t i = 0; i < kernels_num; ++i) { + std::string name; + std::vector bin_vector; + strm->Read(&name); + strm->Read(&bin_vector); + if (programs_[name][device_id] == nullptr) { + cl_int err = 0; + cl_int binaryStatus; + size_t binarySize = bin_vector.size(); + const unsigned char* programBinary = bin_vector.data(); + + cl_device_id dev = workspace_->devices[device_id]; + programs_[name][device_id] = + clCreateProgramWithBinary(workspace_->context, 1, &dev, &binarySize, + &programBinary, &binaryStatus, &err); + OPENCL_CHECK_ERROR(err); + OPENCL_CHECK_ERROR(binaryStatus); + + err = clBuildProgram(programs_[name][device_id], 0, nullptr, nullptr, nullptr, nullptr); + if (err != CL_SUCCESS) { + size_t len; + std::string log; + clGetProgramBuildInfo(programs_[name][device_id], dev, CL_PROGRAM_BUILD_LOG, 0, nullptr, + &len); + log.resize(len); + clGetProgramBuildInfo(programs_[name][device_id], dev, CL_PROGRAM_BUILD_LOG, len, &log[0], + nullptr); + LOG(FATAL) << "OpenCL build error for device=" << dev << "\n" << log; + } + } + } +} + +std::string OpenCLModuleNode::GetPreCompiledPrograms() { + std::string data; + dmlc::MemoryStringStream writer(&data); + dmlc::Stream* strm = &writer; + strm->Write(static_cast(parsed_kernels_.size())); + for (auto& [name, source] : parsed_kernels_) { + cl::OpenCLThreadEntry* t = workspace_->GetThreadEntry(); + int device_id = t->device.device_id; + t->kernel_table.resize(workspace_->num_registered_kernels); + if (programs_[std::string(name)][device_id] == nullptr) { + InstallKernel(workspace_, t, name, kid_map_[name]); + } + size_t size; + clGetProgramInfo(programs_[name][device_id], CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, + nullptr); + ICHECK(size > 0) << "Size of binary is 0"; + std::vector bin_vector(size); + unsigned char* binary = bin_vector.data(); + clGetProgramInfo(programs_[name][device_id], CL_PROGRAM_BINARIES, sizeof(unsigned char*), + &binary, nullptr); + + strm->Write(name); + strm->Write(bin_vector); + } + return data; +} + Module OpenCLModuleCreate(std::string data, std::string fmt, std::unordered_map fmap, std::string source) { auto n = make_object(data, fmt, fmap, source); diff --git a/src/runtime/opencl/opencl_wrapper/opencl_wrapper.cc b/src/runtime/opencl/opencl_wrapper/opencl_wrapper.cc index 2c2768945424..13b7d9470646 100644 --- a/src/runtime/opencl/opencl_wrapper/opencl_wrapper.cc +++ b/src/runtime/opencl/opencl_wrapper/opencl_wrapper.cc @@ -137,6 +137,7 @@ using f_clCreateProgramWithBinary = cl_program (*)(cl_context, cl_uint, const cl using f_clReleaseProgram = cl_int (*)(cl_program); using f_clBuildProgram = cl_int (*)(cl_program, cl_uint, const cl_device_id*, const char*, void (*pfn_notify)(cl_program program, void* user_data), void*); +using f_clGetProgramInfo = cl_int (*)(cl_program, cl_program_info, size_t, void*, size_t*); using f_clGetProgramBuildInfo = cl_int (*)(cl_program, cl_device_id, cl_program_build_info, size_t, void*, size_t*); using f_clCreateKernel = cl_kernel (*)(cl_program, const char*, cl_int*); @@ -347,6 +348,17 @@ cl_int clBuildProgram(cl_program program, cl_uint num_devices, const cl_device_i } } +cl_int clGetProgramInfo(cl_program program, cl_program_info param_name, size_t param_value_size, + void* param_value, size_t* param_value_size_ret) { + auto& lib = LibOpenCLWrapper::getInstance(); + auto func = (f_clGetProgramInfo)lib.getOpenCLFunction("clGetProgramInfo"); + if (func) { + return func(program, param_name, param_value_size, param_value, param_value_size_ret); + } else { + return CL_INVALID_PLATFORM; + } +} + cl_int clGetProgramBuildInfo(cl_program program, cl_device_id device, cl_program_build_info param_name, size_t param_value_size, void* param_value, size_t* param_value_size_ret) { diff --git a/tests/cpp-runtime/opencl/opencl_compile_to_bin.cc b/tests/cpp-runtime/opencl/opencl_compile_to_bin.cc new file mode 100644 index 000000000000..676003ab023f --- /dev/null +++ b/tests/cpp-runtime/opencl/opencl_compile_to_bin.cc @@ -0,0 +1,205 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include +#include +#include + +#include +#include + +#include "../src/runtime/opencl/opencl_common.h" + +using namespace tvm::runtime; +using namespace tvm::runtime::cl; + +namespace { +// This kernel was generated by TVM for conv2d operation +const std::string kernelTemplate = R"( +// Function: kernel_name_placeholder0 +__kernel void kernel_name_placeholder0(__write_only image2d_t pad_temp_texture, __read_only image2d_t placeholder0) { + const sampler_t image_sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + float4 _1 = read_imagef(placeholder0, image_sampler, (int2)(((((((int)get_group_id(0)) * 32) + ((int)get_local_id(0))) % 9) - 1), ((((((((int)get_group_id(0)) * 32) + ((int)get_local_id(0))) / 81) * 7) + ((((((int)get_group_id(0)) * 32) + ((int)get_local_id(0))) % 81) / 9)) - 1))); + (void)write_imagef(pad_temp_texture, (int2)((((((int)get_group_id(0)) * 32) + ((int)get_local_id(0))) % 9), (((((int)get_group_id(0)) * 32) + ((int)get_local_id(0))) / 9)), (((((9 <= (((((int)get_group_id(0)) * 32) + ((int)get_local_id(0))) % 81)) && ((((((int)get_group_id(0)) * 32) + ((int)get_local_id(0))) % 81) < 72)) && (1 <= (((((int)get_group_id(0)) * 32) + ((int)get_local_id(0))) % 9))) && ((((((int)get_group_id(0)) * 32) + ((int)get_local_id(0))) % 9) < 8)) ? _1 : ((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f)))); +} + +// Function: kernel_name_placeholder1 +__kernel void kernel_name_placeholder1(__read_only image2d_t pad_temp_texture, __read_only image2d_t placeholder1, __write_only image2d_t compute, __read_only image2d_t placeholder2, __read_only image2d_t placeholder3) { + const sampler_t image_sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + float4 compute1[14]; + vstore4(((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f)), 0, (float*)compute1 + 0); + vstore4(((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f)), 0, (float*)compute1 + 28); + vstore4(((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f)), 0, (float*)compute1 + 4); + vstore4(((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f)), 0, (float*)compute1 + 32); + vstore4(((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f)), 0, (float*)compute1 + 8); + vstore4(((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f)), 0, (float*)compute1 + 36); + vstore4(((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f)), 0, (float*)compute1 + 12); + vstore4(((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f)), 0, (float*)compute1 + 40); + vstore4(((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f)), 0, (float*)compute1 + 16); + vstore4(((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f)), 0, (float*)compute1 + 44); + vstore4(((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f)), 0, (float*)compute1 + 20); + vstore4(((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f)), 0, (float*)compute1 + 48); + vstore4(((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f)), 0, (float*)compute1 + 24); + vstore4(((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f)), 0, (float*)compute1 + 52); + for (int rc_inner = 0; rc_inner < 128; ++rc_inner) { + for (int ry_inner = 0; ry_inner < 3; ++ry_inner) { + for (int rx_inner = 0; rx_inner < 3; ++rx_inner) { + for (int rc = 0; rc < 4; ++rc) { + float4 _1 = read_imagef(pad_temp_texture, image_sampler, (int2)((((int)get_local_id(0)) + rx_inner), ((rc_inner * 9) + ry_inner))); + float4 _2 = read_imagef(placeholder1, image_sampler, (int2)(((((rc_inner * 36) + (rc * 9)) + (ry_inner * 3)) + rx_inner), ((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))))); + vstore4((vload4(0, (float*)compute1 + 0) + (((float*)&_1)[rc] * _2)), 0, (float*)compute1 + 0); + float4 _3 = read_imagef(placeholder1, image_sampler, (int2)(((((rc_inner * 36) + (rc * 9)) + (ry_inner * 3)) + rx_inner), (((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))) + 8))); + vstore4((vload4(0, (float*)compute1 + 28) + (((float*)&_1)[rc] * _3)), 0, (float*)compute1 + 28); + float4 _4 = read_imagef(pad_temp_texture, image_sampler, (int2)((((int)get_local_id(0)) + rx_inner), (((rc_inner * 9) + ry_inner) + 1))); + vstore4((vload4(0, (float*)compute1 + 4) + (((float*)&_4)[rc] * _2)), 0, (float*)compute1 + 4); + vstore4((vload4(0, (float*)compute1 + 32) + (((float*)&_4)[rc] * _3)), 0, (float*)compute1 + 32); + float4 _5 = read_imagef(pad_temp_texture, image_sampler, (int2)((((int)get_local_id(0)) + rx_inner), (((rc_inner * 9) + ry_inner) + 2))); + vstore4((vload4(0, (float*)compute1 + 8) + (((float*)&_5)[rc] * _2)), 0, (float*)compute1 + 8); + vstore4((vload4(0, (float*)compute1 + 36) + (((float*)&_5)[rc] * _3)), 0, (float*)compute1 + 36); + float4 _6 = read_imagef(pad_temp_texture, image_sampler, (int2)((((int)get_local_id(0)) + rx_inner), (((rc_inner * 9) + ry_inner) + 3))); + vstore4((vload4(0, (float*)compute1 + 12) + (((float*)&_6)[rc] * _2)), 0, (float*)compute1 + 12); + vstore4((vload4(0, (float*)compute1 + 40) + (((float*)&_6)[rc] * _3)), 0, (float*)compute1 + 40); + float4 _7 = read_imagef(pad_temp_texture, image_sampler, (int2)((((int)get_local_id(0)) + rx_inner), (((rc_inner * 9) + ry_inner) + 4))); + vstore4((vload4(0, (float*)compute1 + 16) + (((float*)&_7)[rc] * _2)), 0, (float*)compute1 + 16); + vstore4((vload4(0, (float*)compute1 + 44) + (((float*)&_7)[rc] * _3)), 0, (float*)compute1 + 44); + float4 _8 = read_imagef(pad_temp_texture, image_sampler, (int2)((((int)get_local_id(0)) + rx_inner), (((rc_inner * 9) + ry_inner) + 5))); + vstore4((vload4(0, (float*)compute1 + 20) + (((float*)&_8)[rc] * _2)), 0, (float*)compute1 + 20); + vstore4((vload4(0, (float*)compute1 + 48) + (((float*)&_8)[rc] * _3)), 0, (float*)compute1 + 48); + float4 _9 = read_imagef(pad_temp_texture, image_sampler, (int2)((((int)get_local_id(0)) + rx_inner), (((rc_inner * 9) + ry_inner) + 6))); + vstore4((vload4(0, (float*)compute1 + 24) + (((float*)&_9)[rc] * _2)), 0, (float*)compute1 + 24); + vstore4((vload4(0, (float*)compute1 + 52) + (((float*)&_9)[rc] * _3)), 0, (float*)compute1 + 52); + } + } + } + } + float4 _10 = read_imagef(placeholder2, image_sampler, (int2)(((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))), 0)); + float4 _11 = read_imagef(placeholder3, image_sampler, (int2)(((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))), 0)); + (void)write_imagef(compute, (int2)(((int)get_local_id(0)), ((((int)get_group_id(2)) * 112) + (((int)get_local_id(2)) * 7))), max(((vload4(0, (float*)compute1 + 0) * _10) + _11), ((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f)))); + float4 _12 = read_imagef(placeholder2, image_sampler, (int2)((((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))) + 8), 0)); + float4 _13 = read_imagef(placeholder3, image_sampler, (int2)((((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))) + 8), 0)); + (void)write_imagef(compute, (int2)(((int)get_local_id(0)), (((((int)get_group_id(2)) * 112) + (((int)get_local_id(2)) * 7)) + 56)), max(((vload4(0, (float*)compute1 + 28) * _12) + _13), ((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f)))); + float4 _14 = read_imagef(placeholder2, image_sampler, (int2)(((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))), 0)); + float4 _15 = read_imagef(placeholder3, image_sampler, (int2)(((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))), 0)); + (void)write_imagef(compute, (int2)(((int)get_local_id(0)), (((((int)get_group_id(2)) * 112) + (((int)get_local_id(2)) * 7)) + 1)), max(((vload4(0, (float*)compute1 + 4) * _14) + _15), ((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f)))); + float4 _16 = read_imagef(placeholder2, image_sampler, (int2)((((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))) + 8), 0)); + float4 _17 = read_imagef(placeholder3, image_sampler, (int2)((((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))) + 8), 0)); + (void)write_imagef(compute, (int2)(((int)get_local_id(0)), (((((int)get_group_id(2)) * 112) + (((int)get_local_id(2)) * 7)) + 57)), max(((vload4(0, (float*)compute1 + 32) * _16) + _17), ((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f)))); + float4 _18 = read_imagef(placeholder2, image_sampler, (int2)(((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))), 0)); + float4 _19 = read_imagef(placeholder3, image_sampler, (int2)(((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))), 0)); + (void)write_imagef(compute, (int2)(((int)get_local_id(0)), (((((int)get_group_id(2)) * 112) + (((int)get_local_id(2)) * 7)) + 2)), max(((vload4(0, (float*)compute1 + 8) * _18) + _19), ((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f)))); + float4 _20 = read_imagef(placeholder2, image_sampler, (int2)((((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))) + 8), 0)); + float4 _21 = read_imagef(placeholder3, image_sampler, (int2)((((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))) + 8), 0)); + (void)write_imagef(compute, (int2)(((int)get_local_id(0)), (((((int)get_group_id(2)) * 112) + (((int)get_local_id(2)) * 7)) + 58)), max(((vload4(0, (float*)compute1 + 36) * _20) + _21), ((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f)))); + float4 _22 = read_imagef(placeholder2, image_sampler, (int2)(((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))), 0)); + float4 _23 = read_imagef(placeholder3, image_sampler, (int2)(((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))), 0)); + (void)write_imagef(compute, (int2)(((int)get_local_id(0)), (((((int)get_group_id(2)) * 112) + (((int)get_local_id(2)) * 7)) + 3)), max(((vload4(0, (float*)compute1 + 12) * _22) + _23), ((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f)))); + float4 _24 = read_imagef(placeholder2, image_sampler, (int2)((((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))) + 8), 0)); + float4 _25 = read_imagef(placeholder3, image_sampler, (int2)((((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))) + 8), 0)); + (void)write_imagef(compute, (int2)(((int)get_local_id(0)), (((((int)get_group_id(2)) * 112) + (((int)get_local_id(2)) * 7)) + 59)), max(((vload4(0, (float*)compute1 + 40) * _24) + _25), ((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f)))); + float4 _26 = read_imagef(placeholder2, image_sampler, (int2)(((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))), 0)); + float4 _27 = read_imagef(placeholder3, image_sampler, (int2)(((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))), 0)); + (void)write_imagef(compute, (int2)(((int)get_local_id(0)), (((((int)get_group_id(2)) * 112) + (((int)get_local_id(2)) * 7)) + 4)), max(((vload4(0, (float*)compute1 + 16) * _26) + _27), ((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f)))); + float4 _28 = read_imagef(placeholder2, image_sampler, (int2)((((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))) + 8), 0)); + float4 _29 = read_imagef(placeholder3, image_sampler, (int2)((((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))) + 8), 0)); + (void)write_imagef(compute, (int2)(((int)get_local_id(0)), (((((int)get_group_id(2)) * 112) + (((int)get_local_id(2)) * 7)) + 60)), max(((vload4(0, (float*)compute1 + 44) * _28) + _29), ((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f)))); + float4 _30 = read_imagef(placeholder2, image_sampler, (int2)(((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))), 0)); + float4 _31 = read_imagef(placeholder3, image_sampler, (int2)(((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))), 0)); + (void)write_imagef(compute, (int2)(((int)get_local_id(0)), (((((int)get_group_id(2)) * 112) + (((int)get_local_id(2)) * 7)) + 5)), max(((vload4(0, (float*)compute1 + 20) * _30) + _31), ((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f)))); + float4 _32 = read_imagef(placeholder2, image_sampler, (int2)((((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))) + 8), 0)); + float4 _33 = read_imagef(placeholder3, image_sampler, (int2)((((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))) + 8), 0)); + (void)write_imagef(compute, (int2)(((int)get_local_id(0)), (((((int)get_group_id(2)) * 112) + (((int)get_local_id(2)) * 7)) + 61)), max(((vload4(0, (float*)compute1 + 48) * _32) + _33), ((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f)))); + float4 _34 = read_imagef(placeholder2, image_sampler, (int2)(((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))), 0)); + float4 _35 = read_imagef(placeholder3, image_sampler, (int2)(((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))), 0)); + (void)write_imagef(compute, (int2)(((int)get_local_id(0)), (((((int)get_group_id(2)) * 112) + (((int)get_local_id(2)) * 7)) + 6)), max(((vload4(0, (float*)compute1 + 24) * _34) + _35), ((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f)))); + float4 _36 = read_imagef(placeholder2, image_sampler, (int2)((((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))) + 8), 0)); + float4 _37 = read_imagef(placeholder3, image_sampler, (int2)((((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))) + 8), 0)); + (void)write_imagef(compute, (int2)(((int)get_local_id(0)), (((((int)get_group_id(2)) * 112) + (((int)get_local_id(2)) * 7)) + 62)), max(((vload4(0, (float*)compute1 + 52) * _36) + _37), ((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f)))); +} + + )"; +} + +using Timestamp = std::chrono::time_point; + +class OpenCLCompileBin : public ::testing::Test { +protected: + virtual void SetUp() override { + m_workspace = OpenCLWorkspace::Global(); + OpenCLThreadEntry* t = m_workspace->GetThreadEntry(); + t->kernel_table.resize(m_kernelsNum * 2); + m_kernelNames.resize(m_kernelsNum * 2); + m_dataSrc = ""; + m_fmap.clear(); + for (size_t i = 0; i < m_kernelsNum; ++i) { + std::string kernel_name = "generated_kernel_" + std::to_string(i) + "_"; + std::string kernelSource = std::regex_replace(kernelTemplate, std::regex("kernel_name_placeholder"), kernel_name); + FunctionInfo fi1 = {kernel_name + "0"}; + FunctionInfo fi2 = {kernel_name + "1"}; + m_fmap[fi1.name] = fi1; + m_fmap[fi2.name] = fi2; + m_kernelNames[i * 2] = fi1.name; + m_kernelNames[i * 2 + 1] = fi2.name; + m_dataSrc += kernelSource; + } + } + +protected: + const size_t m_kernelsNum = 100; + const std::string m_tmpDirName = "OpenCLCompileBin_dir"; + OpenCLWorkspace* m_workspace; + std::string m_dataSrc; + std::unordered_map m_fmap; + std::vector m_kernelNames; +}; + +TEST_F(OpenCLCompileBin, SourceVsBinaryCompilationPerf) { + double compileFromSourceTimeMS, compileFromBinTimeMS; + std::string bytes; + { + OpenCLModuleNode module(m_dataSrc, "cl", m_fmap, std::string()); + module.Init(); + EXPECT_TRUE(module.SupportPreCompiledPrograms()); + Timestamp comp_start = std::chrono::high_resolution_clock::now(); + for (size_t i = 0; i < m_kernelNames.size(); ++i) { + OpenCLModuleNode::KTRefEntry e = {i, 1}; + module.InstallKernel(m_workspace, m_workspace->GetThreadEntry(), m_kernelNames[i], e); + } + Timestamp comp_end = std::chrono::high_resolution_clock::now(); + bytes = module.GetPreCompiledPrograms(); + std::chrono::duration duration = std::chrono::duration_cast(comp_end - comp_start); + compileFromSourceTimeMS = duration.count() * 1e-6; + std::cout << "Compile time from source: " << compileFromSourceTimeMS << " ms." << std::endl; + } + { + OpenCLModuleNode module(m_dataSrc, "cl", m_fmap, std::string()); + module.Init(); + EXPECT_TRUE(module.SupportPreCompiledPrograms()); + module.SetPreCompiledPrograms(bytes); + Timestamp comp_start = std::chrono::high_resolution_clock::now(); + for (size_t i = 0; i < m_kernelNames.size(); ++i) { + OpenCLModuleNode::KTRefEntry e = {i, 1}; + module.InstallKernel(m_workspace, m_workspace->GetThreadEntry(), m_kernelNames[i], e); + } + Timestamp comp_end = std::chrono::high_resolution_clock::now(); + std::chrono::duration duration = std::chrono::duration_cast(comp_end - comp_start); + compileFromBinTimeMS = duration.count() * 1e-6; + std::cout << "Compile time from bin: " << compileFromBinTimeMS << " ms." << std::endl; + } + ASSERT_LT(compileFromBinTimeMS, compileFromSourceTimeMS); +} From bea1c170c5b69310bece54219b455600e32c0733 Mon Sep 17 00:00:00 2001 From: Egor Churaev Date: Mon, 30 Jan 2023 13:11:50 +0300 Subject: [PATCH 2/6] Fix lint --- src/runtime/library_module.cc | 3 +- src/runtime/opencl/opencl_module.cc | 5 +- .../opencl/opencl_compile_to_bin.cc | 126 +++++++++--------- 3 files changed, 67 insertions(+), 67 deletions(-) diff --git a/src/runtime/library_module.cc b/src/runtime/library_module.cc index 042fc637c627..351432cbe95d 100644 --- a/src/runtime/library_module.cc +++ b/src/runtime/library_module.cc @@ -198,8 +198,7 @@ void ProcessModuleBlob(const char* mblob, ObjectPtr lib, // Add all other modules to the import of the root_module for (size_t i = 1; i < modules.size(); ++i) { std::string tkey = modules[i]->type_key(); - if (tkey != "_lib" && tkey != "_import_tree") - root_module->Import(modules[i]); + if (tkey != "_lib" && tkey != "_import_tree") root_module->Import(modules[i]); } } diff --git a/src/runtime/opencl/opencl_module.cc b/src/runtime/opencl/opencl_module.cc index 4b37a0dc7d93..cc75d47244c4 100644 --- a/src/runtime/opencl/opencl_module.cc +++ b/src/runtime/opencl/opencl_module.cc @@ -283,9 +283,8 @@ void OpenCLModuleNode::SetPreCompiledPrograms(const std::string& bytes) { const unsigned char* programBinary = bin_vector.data(); cl_device_id dev = workspace_->devices[device_id]; - programs_[name][device_id] = - clCreateProgramWithBinary(workspace_->context, 1, &dev, &binarySize, - &programBinary, &binaryStatus, &err); + programs_[name][device_id] = clCreateProgramWithBinary( + workspace_->context, 1, &dev, &binarySize, &programBinary, &binaryStatus, &err); OPENCL_CHECK_ERROR(err); OPENCL_CHECK_ERROR(binaryStatus); diff --git a/tests/cpp-runtime/opencl/opencl_compile_to_bin.cc b/tests/cpp-runtime/opencl/opencl_compile_to_bin.cc index 676003ab023f..09c4fc48ec0d 100644 --- a/tests/cpp-runtime/opencl/opencl_compile_to_bin.cc +++ b/tests/cpp-runtime/opencl/opencl_compile_to_bin.cc @@ -17,13 +17,12 @@ * under the License. */ -#include -#include -#include - #include #include +#include +#include + #include "../src/runtime/opencl/opencl_common.h" using namespace tvm::runtime; @@ -133,73 +132,76 @@ __kernel void kernel_name_placeholder1(__read_only image2d_t pad_temp_texture, _ } )"; -} +} // namespace using Timestamp = std::chrono::time_point; class OpenCLCompileBin : public ::testing::Test { -protected: - virtual void SetUp() override { - m_workspace = OpenCLWorkspace::Global(); - OpenCLThreadEntry* t = m_workspace->GetThreadEntry(); - t->kernel_table.resize(m_kernelsNum * 2); - m_kernelNames.resize(m_kernelsNum * 2); - m_dataSrc = ""; - m_fmap.clear(); - for (size_t i = 0; i < m_kernelsNum; ++i) { - std::string kernel_name = "generated_kernel_" + std::to_string(i) + "_"; - std::string kernelSource = std::regex_replace(kernelTemplate, std::regex("kernel_name_placeholder"), kernel_name); - FunctionInfo fi1 = {kernel_name + "0"}; - FunctionInfo fi2 = {kernel_name + "1"}; - m_fmap[fi1.name] = fi1; - m_fmap[fi2.name] = fi2; - m_kernelNames[i * 2] = fi1.name; - m_kernelNames[i * 2 + 1] = fi2.name; - m_dataSrc += kernelSource; - } + protected: + virtual void SetUp() override { + m_workspace = OpenCLWorkspace::Global(); + OpenCLThreadEntry* t = m_workspace->GetThreadEntry(); + t->kernel_table.resize(m_kernelsNum * 2); + m_kernelNames.resize(m_kernelsNum * 2); + m_dataSrc = ""; + m_fmap.clear(); + for (size_t i = 0; i < m_kernelsNum; ++i) { + std::string kernel_name = "generated_kernel_" + std::to_string(i) + "_"; + std::string kernelSource = + std::regex_replace(kernelTemplate, std::regex("kernel_name_placeholder"), kernel_name); + FunctionInfo fi1 = {kernel_name + "0"}; + FunctionInfo fi2 = {kernel_name + "1"}; + m_fmap[fi1.name] = fi1; + m_fmap[fi2.name] = fi2; + m_kernelNames[i * 2] = fi1.name; + m_kernelNames[i * 2 + 1] = fi2.name; + m_dataSrc += kernelSource; } + } -protected: - const size_t m_kernelsNum = 100; - const std::string m_tmpDirName = "OpenCLCompileBin_dir"; - OpenCLWorkspace* m_workspace; - std::string m_dataSrc; - std::unordered_map m_fmap; - std::vector m_kernelNames; + protected: + const size_t m_kernelsNum = 100; + const std::string m_tmpDirName = "OpenCLCompileBin_dir"; + OpenCLWorkspace* m_workspace; + std::string m_dataSrc; + std::unordered_map m_fmap; + std::vector m_kernelNames; }; TEST_F(OpenCLCompileBin, SourceVsBinaryCompilationPerf) { - double compileFromSourceTimeMS, compileFromBinTimeMS; - std::string bytes; - { - OpenCLModuleNode module(m_dataSrc, "cl", m_fmap, std::string()); - module.Init(); - EXPECT_TRUE(module.SupportPreCompiledPrograms()); - Timestamp comp_start = std::chrono::high_resolution_clock::now(); - for (size_t i = 0; i < m_kernelNames.size(); ++i) { - OpenCLModuleNode::KTRefEntry e = {i, 1}; - module.InstallKernel(m_workspace, m_workspace->GetThreadEntry(), m_kernelNames[i], e); - } - Timestamp comp_end = std::chrono::high_resolution_clock::now(); - bytes = module.GetPreCompiledPrograms(); - std::chrono::duration duration = std::chrono::duration_cast(comp_end - comp_start); - compileFromSourceTimeMS = duration.count() * 1e-6; - std::cout << "Compile time from source: " << compileFromSourceTimeMS << " ms." << std::endl; + double compileFromSourceTimeMS, compileFromBinTimeMS; + std::string bytes; + { + OpenCLModuleNode module(m_dataSrc, "cl", m_fmap, std::string()); + module.Init(); + EXPECT_TRUE(module.SupportPreCompiledPrograms()); + Timestamp comp_start = std::chrono::high_resolution_clock::now(); + for (size_t i = 0; i < m_kernelNames.size(); ++i) { + OpenCLModuleNode::KTRefEntry e = {i, 1}; + module.InstallKernel(m_workspace, m_workspace->GetThreadEntry(), m_kernelNames[i], e); } - { - OpenCLModuleNode module(m_dataSrc, "cl", m_fmap, std::string()); - module.Init(); - EXPECT_TRUE(module.SupportPreCompiledPrograms()); - module.SetPreCompiledPrograms(bytes); - Timestamp comp_start = std::chrono::high_resolution_clock::now(); - for (size_t i = 0; i < m_kernelNames.size(); ++i) { - OpenCLModuleNode::KTRefEntry e = {i, 1}; - module.InstallKernel(m_workspace, m_workspace->GetThreadEntry(), m_kernelNames[i], e); - } - Timestamp comp_end = std::chrono::high_resolution_clock::now(); - std::chrono::duration duration = std::chrono::duration_cast(comp_end - comp_start); - compileFromBinTimeMS = duration.count() * 1e-6; - std::cout << "Compile time from bin: " << compileFromBinTimeMS << " ms." << std::endl; + Timestamp comp_end = std::chrono::high_resolution_clock::now(); + bytes = module.GetPreCompiledPrograms(); + std::chrono::duration duration = + std::chrono::duration_cast(comp_end - comp_start); + compileFromSourceTimeMS = duration.count() * 1e-6; + std::cout << "Compile time from source: " << compileFromSourceTimeMS << " ms." << std::endl; + } + { + OpenCLModuleNode module(m_dataSrc, "cl", m_fmap, std::string()); + module.Init(); + EXPECT_TRUE(module.SupportPreCompiledPrograms()); + module.SetPreCompiledPrograms(bytes); + Timestamp comp_start = std::chrono::high_resolution_clock::now(); + for (size_t i = 0; i < m_kernelNames.size(); ++i) { + OpenCLModuleNode::KTRefEntry e = {i, 1}; + module.InstallKernel(m_workspace, m_workspace->GetThreadEntry(), m_kernelNames[i], e); } - ASSERT_LT(compileFromBinTimeMS, compileFromSourceTimeMS); + Timestamp comp_end = std::chrono::high_resolution_clock::now(); + std::chrono::duration duration = + std::chrono::duration_cast(comp_end - comp_start); + compileFromBinTimeMS = duration.count() * 1e-6; + std::cout << "Compile time from bin: " << compileFromBinTimeMS << " ms." << std::endl; + } + ASSERT_LT(compileFromBinTimeMS, compileFromSourceTimeMS); } From a35988fee57bbee9cc87736cfa8c7ea945ee45e5 Mon Sep 17 00:00:00 2001 From: Egor Churaev Date: Wed, 1 Feb 2023 12:55:37 +0300 Subject: [PATCH 3/6] Apply comment: PackedFunc is used --- apps/cpp_rtvm/tvm_runner.cc | 8 ++++--- include/tvm/runtime/module.h | 21 ------------------- src/runtime/module.cc | 8 ------- src/runtime/opencl/opencl_common.h | 5 ++--- src/runtime/opencl/opencl_module.cc | 7 +++++++ .../opencl/opencl_compile_to_bin.cc | 7 +++---- 6 files changed, 17 insertions(+), 39 deletions(-) diff --git a/apps/cpp_rtvm/tvm_runner.cc b/apps/cpp_rtvm/tvm_runner.cc index dd89244e65d1..965c793a1535 100644 --- a/apps/cpp_rtvm/tvm_runner.cc +++ b/apps/cpp_rtvm/tvm_runner.cc @@ -126,19 +126,21 @@ void TVMRunner::UsePreCompiledPrograms(std::string pathToDir) { ICHECK(std::filesystem::create_directories(pathToDir) == true); std::filesystem::path binary_path = pathToDir; for (tvm::runtime::Module mod : r_mod_handle->imports()) { - if (mod->SupportPreCompiledPrograms()) { + auto f_get = mod->GetFunction("__GetPreCompiledPrograms"); + auto f_set = mod->GetFunction("__SetPreCompiledPrograms"); + if (f_get != nullptr && f_set != nullptr) { std::string file_name = "pre_compiled_"; file_name += mod->type_key(); file_name += ".bin"; auto file_path = binary_path / file_name; if (!std::filesystem::exists(file_path)) { - auto bytes = mod->GetPreCompiledPrograms(); + auto bytes = String(f_get()); std::ofstream fs(file_path.string(), std::ofstream::binary); fs.write(bytes.c_str(), bytes.size()); } else { std::ifstream ifs(file_path.string(), std::ios::in | std::ios::binary); std::string bytes((std::istreambuf_iterator(ifs)), std::istreambuf_iterator()); - mod->SetPreCompiledPrograms(bytes); + f_set(String(bytes)); } } } diff --git a/include/tvm/runtime/module.h b/include/tvm/runtime/module.h index ba1a740c7a06..a54f98a558f3 100644 --- a/include/tvm/runtime/module.h +++ b/include/tvm/runtime/module.h @@ -192,27 +192,6 @@ class TVM_DLL ModuleNode : public Object { /*! \return The module it imports from */ const std::vector& imports() const { return imports_; } - /*! - * \brief Returns true if this module supports building from pre-compiled programs. - * - * The default implementation returns false. - */ - virtual bool SupportPreCompiledPrograms() const { return false; } - - /*! - * \brief Pass pre-compiled programs which module will use to speed up compilation time. - * \param bytes string with bytes of pre-compiled programs. - */ - virtual void SetPreCompiledPrograms(const std::string& bytes); - - /*! - * \brief Get a pre-compiled programs for a module. - * If there are no compiled programs then first the programs will be compiled. - * - * \return The string with bytes of pre-compiled programs. - */ - virtual std::string GetPreCompiledPrograms(); - /*! * \brief Returns true if this module is 'DSO exportable'. * diff --git a/src/runtime/module.cc b/src/runtime/module.cc index 1f6a7347e2f1..9ef57e905324 100644 --- a/src/runtime/module.cc +++ b/src/runtime/module.cc @@ -128,14 +128,6 @@ const PackedFunc* ModuleNode::GetFuncFromEnv(const std::string& name) { } } -void ModuleNode::SetPreCompiledPrograms(const std::string& bytes) { - LOG(FATAL) << "Module[" << type_key() << "] does not support using pre-compiled programs"; -} - -std::string ModuleNode::GetPreCompiledPrograms() { - LOG(FATAL) << "Module[" << type_key() << "] does not support using pre-compiled programs"; -} - std::string ModuleNode::GetFormat() { LOG(FATAL) << "Module[" << type_key() << "] does not support GetFormat"; } diff --git a/src/runtime/opencl/opencl_common.h b/src/runtime/opencl/opencl_common.h index aad355d343a7..a295ea396cd0 100644 --- a/src/runtime/opencl/opencl_common.h +++ b/src/runtime/opencl/opencl_common.h @@ -438,9 +438,8 @@ class OpenCLModuleNode : public ModuleNode { // install a new kernel to thread local entry cl_kernel InstallKernel(cl::OpenCLWorkspace* w, cl::OpenCLThreadEntry* t, const std::string& func_name, const KTRefEntry& e); - bool SupportPreCompiledPrograms() const final { return true; } - void SetPreCompiledPrograms(const std::string& bytes) final; - std::string GetPreCompiledPrograms() final; + void SetPreCompiledPrograms(const std::string& bytes); + std::string GetPreCompiledPrograms(); private: // The workspace, need to keep reference to use it in destructor. diff --git a/src/runtime/opencl/opencl_module.cc b/src/runtime/opencl/opencl_module.cc index cc75d47244c4..33e3128d3fcf 100644 --- a/src/runtime/opencl/opencl_module.cc +++ b/src/runtime/opencl/opencl_module.cc @@ -138,6 +138,13 @@ cl::OpenCLWorkspace* OpenCLModuleNode::GetGlobalWorkspace() { PackedFunc OpenCLModuleNode::GetFunction(const std::string& name, const ObjectPtr& sptr_to_self) { ICHECK_EQ(sptr_to_self.get(), this); + if (name == "__GetPreCompiledPrograms") { + return PackedFunc( + [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->GetPreCompiledPrograms(); }); + } else if (name == "__SetPreCompiledPrograms") { + return PackedFunc( + [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { this->SetPreCompiledPrograms(args[0]); }); + } ICHECK_NE(name, symbol::tvm_module_main) << "Device function do not have main"; auto it = fmap_.find(name); if (it == fmap_.end()) return PackedFunc(); diff --git a/tests/cpp-runtime/opencl/opencl_compile_to_bin.cc b/tests/cpp-runtime/opencl/opencl_compile_to_bin.cc index 09c4fc48ec0d..75e96aff8db1 100644 --- a/tests/cpp-runtime/opencl/opencl_compile_to_bin.cc +++ b/tests/cpp-runtime/opencl/opencl_compile_to_bin.cc @@ -174,14 +174,14 @@ TEST_F(OpenCLCompileBin, SourceVsBinaryCompilationPerf) { { OpenCLModuleNode module(m_dataSrc, "cl", m_fmap, std::string()); module.Init(); - EXPECT_TRUE(module.SupportPreCompiledPrograms()); Timestamp comp_start = std::chrono::high_resolution_clock::now(); for (size_t i = 0; i < m_kernelNames.size(); ++i) { OpenCLModuleNode::KTRefEntry e = {i, 1}; module.InstallKernel(m_workspace, m_workspace->GetThreadEntry(), m_kernelNames[i], e); } Timestamp comp_end = std::chrono::high_resolution_clock::now(); - bytes = module.GetPreCompiledPrograms(); + auto get_pre_compiled_f = module.GetFunction("__GetPreCompiledPrograms", GetObjectPtr(&module)); + bytes = String(get_pre_compiled_f()); std::chrono::duration duration = std::chrono::duration_cast(comp_end - comp_start); compileFromSourceTimeMS = duration.count() * 1e-6; @@ -190,8 +190,7 @@ TEST_F(OpenCLCompileBin, SourceVsBinaryCompilationPerf) { { OpenCLModuleNode module(m_dataSrc, "cl", m_fmap, std::string()); module.Init(); - EXPECT_TRUE(module.SupportPreCompiledPrograms()); - module.SetPreCompiledPrograms(bytes); + module.GetFunction("__SetPreCompiledPrograms", GetObjectPtr(&module))(String(bytes)); Timestamp comp_start = std::chrono::high_resolution_clock::now(); for (size_t i = 0; i < m_kernelNames.size(); ++i) { OpenCLModuleNode::KTRefEntry e = {i, 1}; From 611a320d6f07192f5ea91d5f44a0a45e1f72a188 Mon Sep 17 00:00:00 2001 From: Egor Churaev Date: Wed, 1 Feb 2023 13:15:18 +0300 Subject: [PATCH 4/6] Fix build --- src/runtime/opencl/opencl_module.cc | 14 ++++++++------ tests/cpp-runtime/opencl/opencl_compile_to_bin.cc | 3 ++- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/src/runtime/opencl/opencl_module.cc b/src/runtime/opencl/opencl_module.cc index 33e3128d3fcf..3ada6bee03c9 100644 --- a/src/runtime/opencl/opencl_module.cc +++ b/src/runtime/opencl/opencl_module.cc @@ -25,7 +25,6 @@ #include #include -#include #include #include #include @@ -139,11 +138,13 @@ PackedFunc OpenCLModuleNode::GetFunction(const std::string& name, const ObjectPtr& sptr_to_self) { ICHECK_EQ(sptr_to_self.get(), this); if (name == "__GetPreCompiledPrograms") { - return PackedFunc( - [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->GetPreCompiledPrograms(); }); + return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { + *rv = this->GetPreCompiledPrograms(); + }); } else if (name == "__SetPreCompiledPrograms") { - return PackedFunc( - [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { this->SetPreCompiledPrograms(args[0]); }); + return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { + this->SetPreCompiledPrograms(args[0]); + }); } ICHECK_NE(name, symbol::tvm_module_main) << "Device function do not have main"; auto it = fmap_.find(name); @@ -315,7 +316,8 @@ std::string OpenCLModuleNode::GetPreCompiledPrograms() { dmlc::MemoryStringStream writer(&data); dmlc::Stream* strm = &writer; strm->Write(static_cast(parsed_kernels_.size())); - for (auto& [name, source] : parsed_kernels_) { + for (auto& it : parsed_kernels_) { + std::string name = it.first; cl::OpenCLThreadEntry* t = workspace_->GetThreadEntry(); int device_id = t->device.device_id; t->kernel_table.resize(workspace_->num_registered_kernels); diff --git a/tests/cpp-runtime/opencl/opencl_compile_to_bin.cc b/tests/cpp-runtime/opencl/opencl_compile_to_bin.cc index 75e96aff8db1..62a2208cddb8 100644 --- a/tests/cpp-runtime/opencl/opencl_compile_to_bin.cc +++ b/tests/cpp-runtime/opencl/opencl_compile_to_bin.cc @@ -180,7 +180,8 @@ TEST_F(OpenCLCompileBin, SourceVsBinaryCompilationPerf) { module.InstallKernel(m_workspace, m_workspace->GetThreadEntry(), m_kernelNames[i], e); } Timestamp comp_end = std::chrono::high_resolution_clock::now(); - auto get_pre_compiled_f = module.GetFunction("__GetPreCompiledPrograms", GetObjectPtr(&module)); + auto get_pre_compiled_f = + module.GetFunction("__GetPreCompiledPrograms", GetObjectPtr(&module)); bytes = String(get_pre_compiled_f()); std::chrono::duration duration = std::chrono::duration_cast(comp_end - comp_start); From 499e3587a9eb9af8599398d7e7840ab4ee769586 Mon Sep 17 00:00:00 2001 From: Egor Churaev Date: Wed, 1 Feb 2023 16:05:51 +0300 Subject: [PATCH 5/6] Fix CI and rename functions --- apps/cpp_rtvm/tvm_runner.cc | 30 ++++++++----------- src/runtime/library_module.cc | 5 ---- src/runtime/opencl/opencl_module.cc | 4 +-- .../opencl/opencl_compile_to_bin.cc | 5 ++-- 4 files changed, 18 insertions(+), 26 deletions(-) diff --git a/apps/cpp_rtvm/tvm_runner.cc b/apps/cpp_rtvm/tvm_runner.cc index 965c793a1535..c61ed9ec7f2c 100644 --- a/apps/cpp_rtvm/tvm_runner.cc +++ b/apps/cpp_rtvm/tvm_runner.cc @@ -125,23 +125,19 @@ void TVMRunner::UsePreCompiledPrograms(std::string pathToDir) { if (!std::filesystem::exists(pathToDir)) ICHECK(std::filesystem::create_directories(pathToDir) == true); std::filesystem::path binary_path = pathToDir; - for (tvm::runtime::Module mod : r_mod_handle->imports()) { - auto f_get = mod->GetFunction("__GetPreCompiledPrograms"); - auto f_set = mod->GetFunction("__SetPreCompiledPrograms"); - if (f_get != nullptr && f_set != nullptr) { - std::string file_name = "pre_compiled_"; - file_name += mod->type_key(); - file_name += ".bin"; - auto file_path = binary_path / file_name; - if (!std::filesystem::exists(file_path)) { - auto bytes = String(f_get()); - std::ofstream fs(file_path.string(), std::ofstream::binary); - fs.write(bytes.c_str(), bytes.size()); - } else { - std::ifstream ifs(file_path.string(), std::ios::in | std::ios::binary); - std::string bytes((std::istreambuf_iterator(ifs)), std::istreambuf_iterator()); - f_set(String(bytes)); - } + auto f_get = r_mod_handle->GetFunction("opencl.GetPreCompiledPrograms", true); + auto f_set = r_mod_handle->GetFunction("opencl.SetPreCompiledPrograms", true); + if (f_get != nullptr && f_set != nullptr) { + std::string file_name = "pre_compiled.bin"; + auto file_path = binary_path / file_name; + if (!std::filesystem::exists(file_path)) { + auto bytes = String(f_get()); + std::ofstream fs(file_path.string(), std::ofstream::binary); + fs.write(bytes.c_str(), bytes.size()); + } else { + std::ifstream ifs(file_path.string(), std::ios::in | std::ios::binary); + std::string bytes((std::istreambuf_iterator(ifs)), std::istreambuf_iterator()); + f_set(String(bytes)); } } } diff --git a/src/runtime/library_module.cc b/src/runtime/library_module.cc index 351432cbe95d..54fd362387c5 100644 --- a/src/runtime/library_module.cc +++ b/src/runtime/library_module.cc @@ -195,11 +195,6 @@ void ProcessModuleBlob(const char* mblob, ObjectPtr lib, // The module order is collected via DFS *root_module = modules[0]; } - // Add all other modules to the import of the root_module - for (size_t i = 1; i < modules.size(); ++i) { - std::string tkey = modules[i]->type_key(); - if (tkey != "_lib" && tkey != "_import_tree") root_module->Import(modules[i]); - } } Module CreateModuleFromLibrary(ObjectPtr lib, PackedFuncWrapper packed_func_wrapper) { diff --git a/src/runtime/opencl/opencl_module.cc b/src/runtime/opencl/opencl_module.cc index 3ada6bee03c9..ad41a34dde4e 100644 --- a/src/runtime/opencl/opencl_module.cc +++ b/src/runtime/opencl/opencl_module.cc @@ -137,11 +137,11 @@ cl::OpenCLWorkspace* OpenCLModuleNode::GetGlobalWorkspace() { PackedFunc OpenCLModuleNode::GetFunction(const std::string& name, const ObjectPtr& sptr_to_self) { ICHECK_EQ(sptr_to_self.get(), this); - if (name == "__GetPreCompiledPrograms") { + if (name == "opencl.GetPreCompiledPrograms") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->GetPreCompiledPrograms(); }); - } else if (name == "__SetPreCompiledPrograms") { + } else if (name == "opencl.SetPreCompiledPrograms") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { this->SetPreCompiledPrograms(args[0]); }); diff --git a/tests/cpp-runtime/opencl/opencl_compile_to_bin.cc b/tests/cpp-runtime/opencl/opencl_compile_to_bin.cc index 62a2208cddb8..a1bdeb9c1408 100644 --- a/tests/cpp-runtime/opencl/opencl_compile_to_bin.cc +++ b/tests/cpp-runtime/opencl/opencl_compile_to_bin.cc @@ -181,7 +181,7 @@ TEST_F(OpenCLCompileBin, SourceVsBinaryCompilationPerf) { } Timestamp comp_end = std::chrono::high_resolution_clock::now(); auto get_pre_compiled_f = - module.GetFunction("__GetPreCompiledPrograms", GetObjectPtr(&module)); + module.GetFunction("opencl.GetPreCompiledPrograms", GetObjectPtr(&module)); bytes = String(get_pre_compiled_f()); std::chrono::duration duration = std::chrono::duration_cast(comp_end - comp_start); @@ -191,7 +191,8 @@ TEST_F(OpenCLCompileBin, SourceVsBinaryCompilationPerf) { { OpenCLModuleNode module(m_dataSrc, "cl", m_fmap, std::string()); module.Init(); - module.GetFunction("__SetPreCompiledPrograms", GetObjectPtr(&module))(String(bytes)); + module.GetFunction("opencl.SetPreCompiledPrograms", + GetObjectPtr(&module))(String(bytes)); Timestamp comp_start = std::chrono::high_resolution_clock::now(); for (size_t i = 0; i < m_kernelNames.size(); ++i) { OpenCLModuleNode::KTRefEntry e = {i, 1}; From d41fb1222426dc52d7f6ac5bd2160cd077e118de Mon Sep 17 00:00:00 2001 From: Egor Churaev Date: Thu, 2 Feb 2023 10:52:21 +0300 Subject: [PATCH 6/6] Apply comments --- apps/cpp_rtvm/README.md | 10 ++++++---- apps/cpp_rtvm/main.cc | 10 +++++++++- apps/cpp_rtvm/tvm_runner.cc | 16 +++++----------- 3 files changed, 20 insertions(+), 16 deletions(-) diff --git a/apps/cpp_rtvm/README.md b/apps/cpp_rtvm/README.md index db58388b2717..c60a7b0e12f5 100644 --- a/apps/cpp_rtvm/README.md +++ b/apps/cpp_rtvm/README.md @@ -360,7 +360,9 @@ about 26 seconds. But after dumping compiled programs to binary files and reuse them on the next runs, the compilation time was significantly decreased (more than 1000 times) and starts to be around 25 ms. -To use such functionality, the developer have to set directory where the -pre-compiled programs will be stored. To the `rtvm` application such example was -added. After method `Load`, method `UsePreCompiledProgram` is called. This -method passes directory where the pre-compiled kernels should be stored. +To use such functionality, the developer have to pass parameter `--pre-compiled` +to the `rtvm` and specify the file name where pre-compiled programs will be +stored. If the pre-compiled file name was passed to the `rtvm` then After method +`Load`, method `UsePreCompiledProgram` is called. This method loads pre-compiled +programs if the file exists. In opposite case the file will be created and +pre-compiled programs will be saved to this file. diff --git a/apps/cpp_rtvm/main.cc b/apps/cpp_rtvm/main.cc index 332bfee7c5b6..c38a5f62bd9a 100644 --- a/apps/cpp_rtvm/main.cc +++ b/apps/cpp_rtvm/main.cc @@ -54,6 +54,7 @@ static const string kUsage = "--input - Numpy file for the model input (optional and we use random of not given)\n" "--output - Numpy file name to dump the model output as numpy\n" "--dump-meta - Dump model meta information\n" + "--pre-compiled - The file name of a file where pre-compiled programs should be stored" "\n" " Example\n" " ./rtvm --model=keras-resnet50 --device=\"opencl\" --dump-meta\n" @@ -66,12 +67,14 @@ static const string kUsage = * \arg device The target device to use {llvm, cl, ...etc.} * \arg input Numpy file for the model input * \arg output Numpy file name to dump the model output as numpy + * \arg pre_compiled File name where pre-compiled programs should be stored */ struct ToolArgs { string model; string device; string input; string output; + string pre_compiled; bool dump_meta = false; }; @@ -84,6 +87,7 @@ void PrintArgs(const ToolArgs& args) { LOG(INFO) << "Device = " << args.device; LOG(INFO) << "Input = " << args.input; LOG(INFO) << "Output = " << args.output; + LOG(INFO) << "Pre-compiled = " << args.pre_compiled; LOG(INFO) << "Dump Metadata = " << ((args.dump_meta) ? ("True") : ("False")); } @@ -172,6 +176,8 @@ void ParseCmdArgs(int argc, char* argv[], struct ToolArgs& args) { if (!pmeta.empty()) { args.dump_meta = true; } + + args.pre_compiled = GetCmdOption(argc, argv, "--pre-compiled="); } /*! @@ -190,7 +196,9 @@ int ExecuteModel(ToolArgs& args) { // Load the model runner.Load(); - runner.UsePreCompiledPrograms("pre_compiled"); + if (!args.pre_compiled.empty()) { + runner.UsePreCompiledPrograms(args.pre_compiled); + } // Query Model meta Information TVMMetaInfo mInfo = runner.GetMetaInfo(); diff --git a/apps/cpp_rtvm/tvm_runner.cc b/apps/cpp_rtvm/tvm_runner.cc index c61ed9ec7f2c..2fd4f2281e01 100644 --- a/apps/cpp_rtvm/tvm_runner.cc +++ b/apps/cpp_rtvm/tvm_runner.cc @@ -26,7 +26,6 @@ #include -#include #include #include #include @@ -115,27 +114,22 @@ int TVMRunner::Load(void) { /*! * \brief Specify if the run programs should be dumped to binary and reused in the next runs. - * \param pathToDir Path to the existed directory where pre-compiled programs should be stored. + * \param file_name File name where pre-compiled programs should be stored. */ -void TVMRunner::UsePreCompiledPrograms(std::string pathToDir) { +void TVMRunner::UsePreCompiledPrograms(std::string file_name) { if (r_run_was_called) { LOG(INFO) << "TVMRunner UsePreCompiledPrograms: should be called before first run"; return; } - if (!std::filesystem::exists(pathToDir)) - ICHECK(std::filesystem::create_directories(pathToDir) == true); - std::filesystem::path binary_path = pathToDir; auto f_get = r_mod_handle->GetFunction("opencl.GetPreCompiledPrograms", true); auto f_set = r_mod_handle->GetFunction("opencl.SetPreCompiledPrograms", true); if (f_get != nullptr && f_set != nullptr) { - std::string file_name = "pre_compiled.bin"; - auto file_path = binary_path / file_name; - if (!std::filesystem::exists(file_path)) { + std::ifstream ifs(file_name, std::ios::in | std::ios::binary); + if (ifs.fail()) { auto bytes = String(f_get()); - std::ofstream fs(file_path.string(), std::ofstream::binary); + std::ofstream fs(file_name, std::ofstream::binary); fs.write(bytes.c_str(), bytes.size()); } else { - std::ifstream ifs(file_path.string(), std::ios::in | std::ios::binary); std::string bytes((std::istreambuf_iterator(ifs)), std::istreambuf_iterator()); f_set(String(bytes)); }