Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[OpenCL] Implement save/load pre-compiled programs #13868

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions apps/cpp_rtvm/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -352,3 +352,17 @@ python3 -m tvm.driver.tvmc compile --cross-compiler ${ANDROID_NDK_HOME}/toolchai
python3 -m tvm.driver.tvmc run --device="cl" keras-resnet50.tar --rpc-key ${TVM_RPC_KEY} --rpc-tracker {TVM_TRACKER_HOST}:{TVM_TRACKER_PORT} --print-time

```

# Use pre-compiled OpenCL kernels
Using pre-compiled programs can significantly improve the inference time of the
first run. E.g. for a topology with ~300 kernels, compilation time on Adreno was
about 26 seconds. But after dumping the compiled programs to binary files and
reusing them on subsequent runs, the compilation time decreased dramatically
(by more than 1000 times), down to around 25 ms.

To use this functionality, the developer has to pass the parameter `--pre-compiled`
to `rtvm` and specify the file name where pre-compiled programs will be
stored. If the pre-compiled file name was passed to `rtvm`, then the
`UsePreCompiledPrograms` method is called after the `Load` method. This method
loads pre-compiled programs if the file exists; otherwise, the file will be
created and the pre-compiled programs will be saved to it.
9 changes: 9 additions & 0 deletions apps/cpp_rtvm/main.cc
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ static const string kUsage =
"--input - Numpy file for the model input (optional and we use random of not given)\n"
"--output - Numpy file name to dump the model output as numpy\n"
"--dump-meta - Dump model meta information\n"
"--pre-compiled - The file name of a file where pre-compiled programs should be stored"
"\n"
" Example\n"
" ./rtvm --model=keras-resnet50 --device=\"opencl\" --dump-meta\n"
Expand All @@ -66,12 +67,14 @@ static const string kUsage =
* \arg device The target device to use {llvm, cl, ...etc.}
* \arg input Numpy file for the model input
* \arg output Numpy file name to dump the model output as numpy
* \arg pre_compiled File name where pre-compiled programs should be stored
*/
struct ToolArgs {
string model;
string device;
string input;
string output;
string pre_compiled;
bool dump_meta = false;
};

Expand All @@ -84,6 +87,7 @@ void PrintArgs(const ToolArgs& args) {
LOG(INFO) << "Device = " << args.device;
LOG(INFO) << "Input = " << args.input;
LOG(INFO) << "Output = " << args.output;
LOG(INFO) << "Pre-compiled = " << args.pre_compiled;
LOG(INFO) << "Dump Metadata = " << ((args.dump_meta) ? ("True") : ("False"));
}

Expand Down Expand Up @@ -172,6 +176,8 @@ void ParseCmdArgs(int argc, char* argv[], struct ToolArgs& args) {
if (!pmeta.empty()) {
args.dump_meta = true;
}

args.pre_compiled = GetCmdOption(argc, argv, "--pre-compiled=");
}

/*!
Expand All @@ -190,6 +196,9 @@ int ExecuteModel(ToolArgs& args) {

// Load the model
runner.Load();
if (!args.pre_compiled.empty()) {
runner.UsePreCompiledPrograms(args.pre_compiled);
}

// Query Model meta Information
TVMMetaInfo mInfo = runner.GetMetaInfo();
Expand Down
29 changes: 28 additions & 1 deletion apps/cpp_rtvm/tvm_runner.cc
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
#include <cnpy.h>

#include <fstream>
#include <iterator>
#include <streambuf>
#include <string>

Expand Down Expand Up @@ -67,7 +68,8 @@ int GetTVMDevice(std::string device) {
* \param path where the tfm compiler artifacts present.
* \param device the target device where we need to load the compiled model.
*/
TVMRunner::TVMRunner(std::string path, std::string device) : r_model_path(path), r_device(device) {
TVMRunner::TVMRunner(std::string path, std::string device)
    : r_model_path(path), r_device(device), r_run_was_called(false) {
  // r_run_was_called starts false; UsePreCompiledPrograms() checks it so that
  // pre-compiled programs can only be installed before the first Run().
  LOG(INFO) << "TVMRunner Constructor:" << r_model_path << " Devices:" << r_device;
}

Expand Down Expand Up @@ -110,6 +112,30 @@ int TVMRunner::Load(void) {
return 0;
}

/*!
* \brief Specify if the run programs should be dumped to binary and reused in the next runs.
* \param file_name File name where pre-compiled programs should be stored.
*/
void TVMRunner::UsePreCompiledPrograms(std::string file_name) {
if (r_run_was_called) {
LOG(INFO) << "TVMRunner UsePreCompiledPrograms: should be called before first run";
return;
}
auto f_get = r_mod_handle->GetFunction("opencl.GetPreCompiledPrograms", true);
auto f_set = r_mod_handle->GetFunction("opencl.SetPreCompiledPrograms", true);
if (f_get != nullptr && f_set != nullptr) {
std::ifstream ifs(file_name, std::ios::in | std::ios::binary);
if (ifs.fail()) {
auto bytes = String(f_get());
std::ofstream fs(file_name, std::ofstream::binary);
fs.write(bytes.c_str(), bytes.size());
} else {
std::string bytes((std::istreambuf_iterator<char>(ifs)), std::istreambuf_iterator<char>());
f_set(String(bytes));
}
}
}

/*!
* \brief Calculated the memory size for the NDArray.
* \param NDArray object.
Expand Down Expand Up @@ -242,6 +268,7 @@ int TVMRunner::GetOutput(std::string output_id, char* raw_output) {
*/
int TVMRunner::Run(void) {
LOG(INFO) << "TVMRunner::Run";
r_run_was_called = true;

r_graph_handle.GetFunction("run")();
return 0;
Expand Down
4 changes: 4 additions & 0 deletions apps/cpp_rtvm/tvm_runner.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,8 @@ class TVMRunner {

/*! \brief Initiates graph runtime and with the compiled model */
int Load(void);
/*! \brief Specify if the run programs should be dumped to binary and reused in the next runs */
void UsePreCompiledPrograms(std::string);
/*! \brief Executes one inference cycle */
int Run(void);
/*! \brief To set the inputs from given npz file */
Expand Down Expand Up @@ -86,6 +88,8 @@ class TVMRunner {
std::string r_device;
/*! \brief Holds meta information queried from graph runtime */
TVMMetaInfo mInfo;
/*! \brief Mark if the run method was called */
bool r_run_was_called;
};

} // namespace runtime
Expand Down
2 changes: 2 additions & 0 deletions src/runtime/opencl/opencl_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -438,6 +438,8 @@ class OpenCLModuleNode : public ModuleNode {
// install a new kernel to thread local entry
cl_kernel InstallKernel(cl::OpenCLWorkspace* w, cl::OpenCLThreadEntry* t,
const std::string& func_name, const KTRefEntry& e);
void SetPreCompiledPrograms(const std::string& bytes);
std::string GetPreCompiledPrograms();

private:
// The workspace, need to keep reference to use it in destructor.
Expand Down
4 changes: 2 additions & 2 deletions src/runtime/opencl/opencl_device_api.cc
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,7 @@ void* OpenCLWorkspace::CreateHostPtrIfEnabled(cl::BufferDescriptor* desc, Device
cl_int err_code;
desc->host_ptr = reinterpret_cast<cl_uchar*>(
clEnqueueMapBuffer(this->GetQueue(dev), desc->buffer, CL_TRUE, CL_MAP_WRITE, 0,
sizeof(cl_uchar) * size, 0, NULL, NULL, &err_code));
sizeof(cl_uchar) * size, 0, nullptr, nullptr, &err_code));
OPENCL_CHECK_ERROR(err_code);
#endif // OPENCL_ENABLE_HOST_PTR
return desc;
Expand Down Expand Up @@ -256,7 +256,7 @@ void OpenCLWorkspace::FreeDataSpace(Device dev, void* ptr) {
cl::BufferDescriptor* desc = static_cast<cl::BufferDescriptor*>(ptr);
if (desc->host_ptr) {
clEnqueueUnmapMemObject(this->GetQueue(dev), desc->buffer,
reinterpret_cast<void*>(desc->host_ptr), 0, NULL, NULL);
reinterpret_cast<void*>(desc->host_ptr), 0, nullptr, nullptr);
}
OPENCL_CALL(clReleaseMemObject(desc->buffer));
delete desc;
Expand Down
77 changes: 77 additions & 0 deletions src/runtime/opencl/opencl_module.cc
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,15 @@ cl::OpenCLWorkspace* OpenCLModuleNode::GetGlobalWorkspace() {
PackedFunc OpenCLModuleNode::GetFunction(const std::string& name,
const ObjectPtr<Object>& sptr_to_self) {
ICHECK_EQ(sptr_to_self.get(), this);
if (name == "opencl.GetPreCompiledPrograms") {
return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
*rv = this->GetPreCompiledPrograms();
});
} else if (name == "opencl.SetPreCompiledPrograms") {
return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
this->SetPreCompiledPrograms(args[0]);
});
}
ICHECK_NE(name, symbol::tvm_module_main) << "Device function do not have main";
auto it = fmap_.find(name);
if (it == fmap_.end()) return PackedFunc();
Expand Down Expand Up @@ -262,6 +271,74 @@ cl_kernel OpenCLModuleNode::InstallKernel(cl::OpenCLWorkspace* w, cl::OpenCLThre
return kernel;
}

void OpenCLModuleNode::SetPreCompiledPrograms(const std::string& bytes) {
std::string data = bytes;
dmlc::MemoryStringStream reader(&data);
dmlc::Stream* strm = &reader;
uint64_t kernels_num;
strm->Read(&kernels_num);
cl::OpenCLThreadEntry* t = workspace_->GetThreadEntry();
int device_id = t->device.device_id;
for (size_t i = 0; i < kernels_num; ++i) {
std::string name;
std::vector<unsigned char> bin_vector;
strm->Read(&name);
strm->Read(&bin_vector);
if (programs_[name][device_id] == nullptr) {
cl_int err = 0;
cl_int binaryStatus;
size_t binarySize = bin_vector.size();
const unsigned char* programBinary = bin_vector.data();

cl_device_id dev = workspace_->devices[device_id];
programs_[name][device_id] = clCreateProgramWithBinary(
workspace_->context, 1, &dev, &binarySize, &programBinary, &binaryStatus, &err);
OPENCL_CHECK_ERROR(err);
OPENCL_CHECK_ERROR(binaryStatus);

err = clBuildProgram(programs_[name][device_id], 0, nullptr, nullptr, nullptr, nullptr);
if (err != CL_SUCCESS) {
size_t len;
std::string log;
clGetProgramBuildInfo(programs_[name][device_id], dev, CL_PROGRAM_BUILD_LOG, 0, nullptr,
&len);
log.resize(len);
clGetProgramBuildInfo(programs_[name][device_id], dev, CL_PROGRAM_BUILD_LOG, len, &log[0],
nullptr);
LOG(FATAL) << "OpenCL build error for device=" << dev << "\n" << log;
}
}
}
}

std::string OpenCLModuleNode::GetPreCompiledPrograms() {
std::string data;
dmlc::MemoryStringStream writer(&data);
dmlc::Stream* strm = &writer;
strm->Write(static_cast<uint64_t>(parsed_kernels_.size()));
for (auto& it : parsed_kernels_) {
std::string name = it.first;
cl::OpenCLThreadEntry* t = workspace_->GetThreadEntry();
int device_id = t->device.device_id;
t->kernel_table.resize(workspace_->num_registered_kernels);
if (programs_[std::string(name)][device_id] == nullptr) {
InstallKernel(workspace_, t, name, kid_map_[name]);
}
size_t size;
clGetProgramInfo(programs_[name][device_id], CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size,
nullptr);
ICHECK(size > 0) << "Size of binary is 0";
std::vector<unsigned char> bin_vector(size);
unsigned char* binary = bin_vector.data();
clGetProgramInfo(programs_[name][device_id], CL_PROGRAM_BINARIES, sizeof(unsigned char*),
&binary, nullptr);

strm->Write(name);
strm->Write(bin_vector);
}
return data;
}

Module OpenCLModuleCreate(std::string data, std::string fmt,
std::unordered_map<std::string, FunctionInfo> fmap, std::string source) {
auto n = make_object<OpenCLModuleNode>(data, fmt, fmap, source);
Expand Down
12 changes: 12 additions & 0 deletions src/runtime/opencl/opencl_wrapper/opencl_wrapper.cc
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,7 @@ using f_clCreateProgramWithBinary = cl_program (*)(cl_context, cl_uint, const cl
using f_clReleaseProgram = cl_int (*)(cl_program);
using f_clBuildProgram = cl_int (*)(cl_program, cl_uint, const cl_device_id*, const char*,
void (*pfn_notify)(cl_program program, void* user_data), void*);
using f_clGetProgramInfo = cl_int (*)(cl_program, cl_program_info, size_t, void*, size_t*);
using f_clGetProgramBuildInfo = cl_int (*)(cl_program, cl_device_id, cl_program_build_info, size_t,
void*, size_t*);
using f_clCreateKernel = cl_kernel (*)(cl_program, const char*, cl_int*);
Expand Down Expand Up @@ -347,6 +348,17 @@ cl_int clBuildProgram(cl_program program, cl_uint num_devices, const cl_device_i
}
}

// Trampoline for clGetProgramInfo: resolves the symbol from the dynamically
// loaded OpenCL library and forwards the call. Reports CL_INVALID_PLATFORM
// when the library does not provide the symbol.
cl_int clGetProgramInfo(cl_program program, cl_program_info param_name, size_t param_value_size,
                        void* param_value, size_t* param_value_size_ret) {
  auto func = reinterpret_cast<f_clGetProgramInfo>(
      LibOpenCLWrapper::getInstance().getOpenCLFunction("clGetProgramInfo"));
  if (func == nullptr) {
    return CL_INVALID_PLATFORM;
  }
  return func(program, param_name, param_value_size, param_value, param_value_size_ret);
}

cl_int clGetProgramBuildInfo(cl_program program, cl_device_id device,
cl_program_build_info param_name, size_t param_value_size,
void* param_value, size_t* param_value_size_ret) {
Expand Down
Loading