TensorRT EP engine cache serialization/deserialization refactor (#11045)
* Code refactor

* fix bug

* modify comment

* modify test for the new ORT TRT cache behavior

* update comment

* rename variable

* fix bug for not having trt context

* Custom parameters (#10964)

* get inputs independently for trtexec

* track one process only

* remove engine and profile files

* change time to commit time

* add runtime option for io binding

* move to commit date

* fixes

* add option for graph optimization

* cleanup docker script

* note second time creation

* allow for parameters to be configured from pipeline at runtime

* uncomment

* include optional arguments at runtime

* post second session creation

* update cmake version

* Revert "update cmake version"

This reverts commit 09a1364.

* Move data format import

* Perf FasterRCNN + MaskRCNN (#11102)

* add faster mask

* fix paths

* add a test scenario: if an engine cache is present, TRT EP should load the engine cache and run inference

* Revert "Merge branch 'trt_cache_refactor' of https://github.com/microsoft/onnxruntime into trt_cache_refactor"

This reverts commit 8edc574, reversing
changes made to 0c92e5b.

Co-authored-by: Olivia Jain <[email protected]>
chilo-ms and oliviajain authored Apr 26, 2022
1 parent 81d7870 commit 0292356
Showing 3 changed files with 287 additions and 217 deletions.
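At a high level, this refactor splits the cache work in two: Compile() now loads and deserializes an existing engine cache up front (for both static and dynamic shape graphs), while serialization of a new or updated engine is deferred to the function-state release callback via the new update_engine_cache flag. For context, here is a minimal sketch of how an application turns this caching on; it assumes the OrtTensorRTProviderOptions fields trt_engine_cache_enable and trt_engine_cache_path from the public C API of this ORT generation, which are not part of this diff.

```cpp
// Minimal sketch (not from this commit): enable TensorRT EP engine caching.
// Assumes the legacy OrtTensorRTProviderOptions struct and its
// trt_engine_cache_* fields from the public C API.
#include <onnxruntime_cxx_api.h>

int main() {
  Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "trt_cache_demo");
  Ort::SessionOptions so;

  OrtTensorRTProviderOptions trt_options{};
  trt_options.device_id = 0;
  trt_options.trt_engine_cache_enable = 1;            // persist built engines
  trt_options.trt_engine_cache_path = "./trt_cache";  // *.engine / *.profile files land here
  so.AppendExecutionProvider_TensorRT(trt_options);

  // First session: builds the engine; with this refactor the cache file is
  // written when the function state is released rather than at build time.
  // Later sessions: deserialize the cached engine instead of rebuilding.
  Ort::Session session(env, "model.onnx", so);  // use an ORTCHAR_T path on Windows
  return 0;
}
```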
200 changes: 85 additions & 115 deletions onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
@@ -1074,6 +1074,7 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<Node*>& fuse
std::unordered_map<std::string, std::unordered_map<size_t, std::pair<int64_t, int64_t>>> input_shape_ranges;
std::unordered_map<std::string, size_t> output_indexes(num_outputs);
std::unordered_map<std::string, size_t> output_types(num_outputs);
bool update_engine_cache = false;

// Initialize shape range for dynamic shape tensors
bool has_dynamic_shape = false;
@@ -1158,15 +1159,15 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<Node*>& fuse
}
}

// Build TRT engine here if the graph doesn't have dynamic shape input. Otherwise engine will
// be built at runtime
// If engine caching is enabled, load and deserialize
// the TRT engine cache regardless of whether the graph has dynamic shape inputs
tensorrt_ptr::unique_pointer<nvinfer1::ICudaEngine> trt_engine;
tensorrt_ptr::unique_pointer<nvinfer1::IExecutionContext> trt_context;
if (!has_dynamic_shape) {
if (engine_cache_enable_) {
const std::string cache_path = GetCachePath(cache_path_, trt_node_name_with_precision);
const std::string engine_cache_path = cache_path + ".engine";
std::ifstream engine_file(engine_cache_path, std::ios::binary | std::ios::in);
if (engine_cache_enable_ && engine_file) {
if (engine_file) {
engine_file.seekg(0, std::ios::end);
size_t engine_size = engine_file.tellg();
engine_file.seekg(0, std::ios::beg);
@@ -1178,7 +1179,7 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<Node*>& fuse
return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
"TensorRT EP could not deserialize engine from cache: " + engine_cache_path);
}
} else if (engine_decryption_enable_ && engine_cache_enable_ && !engine_file) {
} else if (engine_decryption_enable_ && !engine_file) {
// Decrypt engine
size_t engine_size = 0;
if (!engine_decryption_(engine_cache_path.c_str(), nullptr, &engine_size)) {
@@ -1197,7 +1198,34 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<Node*>& fuse
return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
"TensorRT EP could not deserialize engine from encrypted cache: " + engine_cache_path);
}
} else {
}

if (trt_engine != nullptr) {
// Build context
trt_context = tensorrt_ptr::unique_pointer<nvinfer1::IExecutionContext>(trt_engine->createExecutionContext());
if (trt_context == nullptr) {
return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
"TensorRT EP could not build execution context for fused node: " + fused_node->Name());
}
}

// If the graph has dynamic shape inputs,
// load and deserialize the TRT engine profile cache
if (has_dynamic_shape) {
const std::string profile_cache_path = cache_path + ".profile";
std::ifstream profile_file(profile_cache_path, std::ios::binary | std::ios::in);
if (profile_file) {
input_shape_ranges = DeserializeProfile(profile_file);
}
}
}
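The DeserializeProfile call above, and the SerializeProfile counterpart in the release callback further down, persist the shape ranges observed for dynamic inputs. Their implementation is not part of this diff, so the following is only an illustrative sketch that round-trips the same shape-range map type declared at the top of Compile(); the EP's real on-disk format may differ.

```cpp
// Illustrative sketch only: round-trip the map the profile cache conceptually
// stores (input name -> dim index -> [min, max]). The EP's actual
// SerializeProfile/DeserializeProfile define the real format.
#include <cstdint>
#include <fstream>
#include <string>
#include <unordered_map>
#include <utility>

using ShapeRanges =
    std::unordered_map<std::string,
                       std::unordered_map<size_t, std::pair<int64_t, int64_t>>>;

void WriteProfileSketch(const std::string& path, const ShapeRanges& ranges) {
  std::ofstream out(path);
  for (const auto& input : ranges) {
    for (const auto& dim : input.second) {
      // One whitespace-separated record per dynamic dimension
      // (assumes input names contain no spaces).
      out << input.first << ' ' << dim.first << ' '
          << dim.second.first << ' ' << dim.second.second << '\n';
    }
  }
}

ShapeRanges ReadProfileSketch(const std::string& path) {
  ShapeRanges ranges;
  std::ifstream in(path);
  std::string name;
  size_t dim_index = 0;
  int64_t min_val = 0, max_val = 0;
  while (in >> name >> dim_index >> min_val >> max_val) {
    ranges[name][dim_index] = {min_val, max_val};
  }
  return ranges;
}
```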


// If (1) engine caching is not enabled, or (2) engine caching was just enabled and no engine
// cache is present yet, build the TRT engine here when the graph has no dynamic shape inputs.
// Otherwise the engine will be built at runtime.
if (!has_dynamic_shape) {
if (trt_engine == nullptr) {
// Set INT8 per tensor dynamic range
if (int8_enable_ && trt_builder->platformHasFastInt8() && int8_calibration_cache_available_) {
trt_config->setInt8Calibrator(nullptr);
@@ -1216,29 +1244,16 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<Node*>& fuse
return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
"TensorRT EP could not build engine for fused node: " + fused_node->Name());
}
if (engine_cache_enable_) {
nvinfer1::IHostMemory* serializedModel = trt_engine->serialize();
size_t engine_size = serializedModel->size();
if (engine_decryption_enable_) {
// Encrypt engine
if (!engine_encryption_(engine_cache_path.c_str(), reinterpret_cast<char*>(serializedModel->data()), engine_size)) {
return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
"TensorRT EP could not call engine encryption function encrypt");
}
} else {
std::ofstream file(engine_cache_path, std::ios::binary | std::ios::out);
file.write(reinterpret_cast<char*>(serializedModel->data()), engine_size);
}
serializedModel->destroy();
LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Serialized " + engine_cache_path;
}
}

// Build context
trt_context = tensorrt_ptr::unique_pointer<nvinfer1::IExecutionContext>(trt_engine->createExecutionContext());
if (trt_context == nullptr) {
return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
"TensorRT EP could not build execution context for fused node: " + fused_node->Name());
if (engine_cache_enable_)
update_engine_cache = true;

// Build context
trt_context = tensorrt_ptr::unique_pointer<nvinfer1::IExecutionContext>(trt_engine->createExecutionContext());
if (trt_context == nullptr) {
return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
"TensorRT EP could not build execution context for fused node: " + fused_node->Name());
}
}
}
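For the INT8 path above, the calibrator is dropped (setInt8Calibrator(nullptr)) and per-tensor dynamic ranges are applied from the calibration cache instead; the EP's actual helper for this lives elsewhere in the file. A hedged sketch of what applying such a range map to a TensorRT network looks like, using the public nvinfer1 API:

```cpp
// Sketch: apply per-tensor dynamic ranges to a TRT network in lieu of a
// calibrator. Uses the public nvinfer1 API; the EP's real helper differs.
#include <string>
#include <unordered_map>
#include <NvInfer.h>

bool ApplyDynamicRangesSketch(
    nvinfer1::INetworkDefinition& network,
    const std::unordered_map<std::string, float>& dynamic_range_map) {
  for (int i = 0; i < network.getNbInputs(); ++i) {
    nvinfer1::ITensor* t = network.getInput(i);
    auto it = dynamic_range_map.find(t->getName());
    if (it != dynamic_range_map.end()) {
      // Symmetric range [-max, max], as INT8 quantization expects.
      if (!t->setDynamicRange(-it->second, it->second)) return false;
    }
  }
  for (int i = 0; i < network.getNbLayers(); ++i) {
    nvinfer1::ILayer* layer = network.getLayer(i);
    for (int j = 0; j < layer->getNbOutputs(); ++j) {
      nvinfer1::ITensor* t = layer->getOutput(j);
      auto it = dynamic_range_map.find(t->getName());
      if (it != dynamic_range_map.end()) {
        if (!t->setDynamicRange(-it->second, it->second)) return false;
      }
    }
  }
  return true;
}
```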

@@ -1285,16 +1300,52 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<Node*>& fuse
&networks_[context->node_name], input_info_[context->node_name], output_info_[context->node_name],
input_shape_ranges_[context->node_name], &tensorrt_mu_, fp16_enable_, int8_enable_, int8_calibration_cache_available_,
dla_enable_, dla_core_, &max_workspace_size_, trt_node_name_with_precision, engine_cache_enable_, cache_path_,
runtime_.get(), nullptr, allocator_, dynamic_range_map, engine_decryption_enable_, engine_decryption_, engine_encryption_};
runtime_.get(), nullptr, allocator_, dynamic_range_map, engine_decryption_enable_, engine_decryption_, engine_encryption_,
update_engine_cache};
*state = p.release();
return 0;
};

// Release function state
compute_info.release_state_func = [](FunctionState state) {
if (state)
if (state) {
// Serialize and save engine to cache
//
// Note: only save the engine to file if engine caching is enabled and the engine was updated,
// either because an input shape range changed or because no engine cache file existed before
TensorrtFuncState* trt_state = reinterpret_cast<TensorrtFuncState*>(state);
if (trt_state->update_engine_cache) {
// Serialize engine
const std::string cache_path = GetCachePath(trt_state->engine_cache_path, trt_state->trt_node_name_with_precision);
const std::string engine_cache_path = cache_path + ".engine";
nvinfer1::IHostMemory* serializedModel = trt_state->engine->get()->serialize();
size_t engine_size = serializedModel->size();
if (trt_state->engine_decryption_enable) {
// Encrypt engine
if (!trt_state->engine_encryption(engine_cache_path.c_str(), reinterpret_cast<char*>(serializedModel->data()), engine_size)) {
delete static_cast<TensorrtFuncState*>(state);
ORT_THROW_IF_ERROR(ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
"TensorRT EP could not call engine encryption function encrypt"));
}
} else {
std::ofstream file(engine_cache_path, std::ios::binary | std::ios::out);
file.write(reinterpret_cast<char*>(serializedModel->data()), engine_size);
}
serializedModel->destroy();
LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Serialized " + engine_cache_path;

// Serialize engine profile if needed
if (!trt_state->input_shape_ranges.empty()) {
const std::string profile_cache_path = cache_path + ".profile";
SerializeProfile(profile_cache_path, trt_state->input_shape_ranges);
LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Serialized " + profile_cache_path;
}
}

delete static_cast<TensorrtFuncState*>(state);
}
};
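This is the core design change: instead of serializing inside compute_func every time an engine is (re)built, the release callback writes the cache once, at state teardown, when update_engine_cache is set. The encryption branch goes through user-supplied callbacks whose signatures appear in the header change below. A pass-through sketch matching those signatures follows; it only copies raw bytes, so a real implementation would substitute actual encryption and decryption.

```cpp
// Pass-through sketch of the engine encryption/decryption callbacks
// (signatures match the TensorrtFuncState members in the header diff below;
// nonzero return indicates success, mirroring how the EP checks them).
#include <cstddef>
#include <fstream>

// Write `size` bytes of `data` to `path` ("encrypt").
extern "C" int engine_encryption(const char* path, char* data, size_t size) {
  std::ofstream out(path, std::ios::binary);
  if (!out) return 0;
  out.write(data, static_cast<std::streamsize>(size));
  return out.good() ? 1 : 0;
}

// Two-phase "decrypt": when data == nullptr, report the required buffer
// size (the EP's first call); otherwise fill `data` with the engine bytes.
extern "C" int engine_decryption(const char* path, char* data, size_t* size) {
  std::ifstream in(path, std::ios::binary | std::ios::ate);
  if (!in) return 0;
  const size_t file_size = static_cast<size_t>(in.tellg());
  if (data == nullptr) {
    *size = file_size;
    return 1;
  }
  in.seekg(0, std::ios::beg);
  in.read(data, static_cast<std::streamsize>(file_size));
  return in.good() ? 1 : 0;
}
```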

// Create compute function
compute_info.compute_func = [this](FunctionState state, const OrtCustomOpApi* api, OrtKernelContext* context) {
Ort::CustomOpApi ort{*api};
@@ -1317,71 +1368,6 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<Node*>& fuse

cudaStream_t stream = static_cast<cudaStream_t>(this->GetComputeStream());

// Load serialized engine
const std::string cache_path = GetCachePath(trt_state->engine_cache_path, trt_state->trt_node_name_with_precision);
const std::string engine_cache_path = cache_path + ".engine";
const std::string profile_cache_path = cache_path + ".profile";
if (trt_state->engine_cache_enable && trt_engine == nullptr) {
std::ifstream engine_file(engine_cache_path, std::ios::binary | std::ios::in);
std::ifstream profile_file(profile_cache_path, std::ios::binary | std::ios::in);
if (engine_file && profile_file) {
// Deserialize profile
shape_ranges = DeserializeProfile(profile_file);
LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] DeSerialized " + profile_cache_path;
// Deserialize engine
trt_state->context->reset();
trt_state->engine->reset();
engine_file.seekg(0, std::ios::end);
size_t engine_size = engine_file.tellg();
engine_file.seekg(0, std::ios::beg);
std::unique_ptr<char[]> engine_buf{new char[engine_size]};
engine_file.read((char*)engine_buf.get(), engine_size);
*(trt_state->engine) = tensorrt_ptr::unique_pointer<nvinfer1::ICudaEngine>(
trt_state->runtime->deserializeCudaEngine(engine_buf.get(), engine_size, nullptr));
if (trt_state->engine == nullptr) {
return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "TensorRT EP Failed to Build Engine.");
}
LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] DeSerialized " + engine_cache_path;
trt_engine = trt_state->engine->get();
*(trt_state->context) = tensorrt_ptr::unique_pointer<nvinfer1::IExecutionContext>(
trt_state->engine->get()->createExecutionContext());
if (trt_state->context == nullptr) {
return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "TensorRT EP failed to create context.");
}
trt_context = trt_state->context->get();
} else if (trt_state->engine_decryption_enable && !engine_file && profile_file) {
shape_ranges = DeserializeProfile(profile_file);
LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] DeSerialized " + profile_cache_path;
// Decrypt engine
size_t engine_size = 0;
if (!trt_state->engine_decryption(engine_cache_path.c_str(), nullptr, &engine_size)) {
return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
"TensorRT EP could not get engine buffer size");
}
std::unique_ptr<char[]> engine_buf{new char[engine_size]};
if (!trt_state->engine_decryption(engine_cache_path.c_str(), &engine_buf[0], &engine_size)) {
return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
"TensorRT EP could not call engine decryption function decrypt");
}
// Deserialize engine
trt_state->context->reset();
trt_state->engine->reset();
*(trt_state->engine) = tensorrt_ptr::unique_pointer<nvinfer1::ICudaEngine>(trt_state->runtime->deserializeCudaEngine(engine_buf.get(), engine_size, nullptr));
if (trt_state->engine == nullptr) {
return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
"TensorRT EP could not deserialize engine from encrypted cache: " + engine_cache_path);
}
LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] DeSerialized " + engine_cache_path;
trt_engine = trt_state->engine->get();
*(trt_state->context) = tensorrt_ptr::unique_pointer<nvinfer1::IExecutionContext>(
trt_state->engine->get()->createExecutionContext());
if (trt_state->context == nullptr) {
return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "TensorRT EP failed to create context.");
}
trt_context = trt_state->context->get();
}
}

for (int i = 0, end = num_inputs; i < end; ++i) {
auto input = trt_state->network->get()->getInput(i);
const std::string& input_name = input->getName();
Expand Down Expand Up @@ -1558,26 +1544,6 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<Node*>& fuse
return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "TensorRT EP Failed to Build Engine.");
}
trt_engine = trt_state->engine->get();
if (trt_state->engine_cache_enable) {
// Serialize engine profile
SerializeProfile(profile_cache_path, shape_ranges);
LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Serialized " + profile_cache_path;

// Serialize engine
nvinfer1::IHostMemory* serializedModel = trt_engine->serialize();
size_t engine_size = serializedModel->size();
if (trt_state->engine_decryption_enable) {
// Encrypt engine
if (!trt_state->engine_encryption(engine_cache_path.c_str(), reinterpret_cast<char*>(serializedModel->data()), engine_size)) {
return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
"TensorRT EP could not call engine encryption function encrypt");
}
} else {
std::ofstream file(engine_cache_path, std::ios::binary | std::ios::out);
file.write(reinterpret_cast<char*>(serializedModel->data()), engine_size);
}
serializedModel->destroy();
}

// Build context
*(trt_state->context) = tensorrt_ptr::unique_pointer<nvinfer1::IExecutionContext>(
@@ -1586,6 +1552,10 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<Node*>& fuse
return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "TensorRT EP failed to create context.");
}
trt_context = trt_state->context->get();

if (trt_state->engine_cache_enable)
trt_state->update_engine_cache = true;
trt_state->input_shape_ranges = shape_ranges;
}

// Get input and output binding names
onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h
@@ -107,6 +107,9 @@ struct TensorrtFuncState {
bool engine_decryption_enable;
int (*engine_decryption)(const char*, char*, size_t*);
int (*engine_encryption)(const char*, char*, size_t);
// Set to true when the engine cache needs to be saved: either the sub-graph has dynamic input shapes and a shape range changed, or the engine cache is being written out for the first time. Otherwise false.
// Note: for dynamic input shapes, if update_engine_cache is true, the profile cache is saved as well.
bool update_engine_cache;
};
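A minimal sketch of the deferred-write pattern this flag enables: compute marks the state dirty and release flushes it exactly once. The names below are simplified stand-ins; the real TensorrtFuncState carries many more members.

```cpp
// Simplified stand-in for the deferred cache write that update_engine_cache
// gates: compute_func sets the flag, release_state_func performs one write.
#include <fstream>
#include <string>

struct FuncStateSketch {
  std::string engine_cache_path;
  std::string engine_bytes;          // stand-in for the serialized TRT engine
  bool update_engine_cache = false;
};

void ComputeSketch(FuncStateSketch& s, bool engine_was_rebuilt) {
  if (engine_was_rebuilt) {
    s.update_engine_cache = true;    // defer the file write to release time
  }
}

void ReleaseSketch(FuncStateSketch& s) {
  if (s.update_engine_cache) {
    std::ofstream out(s.engine_cache_path, std::ios::binary);
    out.write(s.engine_bytes.data(),
              static_cast<std::streamsize>(s.engine_bytes.size()));
  }
}
```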

// Logical device representation.