Add TensorRT support for pruned transformer models; fix the bug that TensorRT does not support DeletePass #28517

Merged
5 commits merged on Nov 12, 2020
2 changes: 1 addition & 1 deletion cmake/operators.cmake
@@ -127,7 +127,7 @@ function(op_library TARGET)
"tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op"
"fusion_transpose_flatten_concat_op" "fusion_conv_inception_op"
"sync_batch_norm_op" "dgc_op" "fused_fc_elementwise_layernorm_op"
"multihead_matmul_op" "fusion_group_op" "fused_bn_activation_op" "fused_embedding_eltwise_layernorm_op" "fusion_gru_op"
"skip_layernorm_op" "multihead_matmul_op" "fusion_group_op" "fused_bn_activation_op" "fused_embedding_eltwise_layernorm_op" "fusion_gru_op"
"fused_bn_add_activation_op")
if ("${TARGET}" STREQUAL "${manual_pybind_op}")
set(pybind_flag 1)
@@ -326,6 +326,9 @@ static int BuildFusion(Graph* graph, const std::string& name_scope
void EmbeddingEltwiseLayerNormFusePass::ApplyImpl(Graph* graph) const {
FusePassBase::Init(name_scope_, graph);
int fusion_count = patterns::BuildFusion(graph, name_scope_);
if (fusion_count > 0) {
graph->Set(kEmbEltwiseLayernormPass, new bool(true));
}
AddStatis(fusion_count);
}

6 changes: 5 additions & 1 deletion paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc
@@ -696,7 +696,11 @@ void MultiHeadMatmulV2FusePass::ApplyImpl(Graph* graph) const {
platform::errors::Fatal(
"During the multiheadMatmul pass, The scope should not be null."));

patterns::BuildFusionV2(graph, name_scope_, scope);
int fusion_count = patterns::BuildFusionV2(graph, name_scope_, scope);
if (fusion_count > 0) {
graph->Set(kMultiheadMatmulPass, new bool(true));
}
AddStatis(fusion_count);
}

} // namespace ir
3 changes: 3 additions & 0 deletions paddle/fluid/framework/ir/pass.h
@@ -36,6 +36,9 @@ struct PassRegistrar;

typedef std::unordered_set<std::string> PassRecorder;
constexpr char kPassRecorder[] = "pass_recorder";
constexpr char kEmbEltwiseLayernormPass[] =
"embedding_eltwise_layernorm_fuse_pass_flag";
constexpr char kMultiheadMatmulPass[] = "multihead_matmul_fuse_pass_flag";

class Pass {
public:
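The two constants above are the link between the fuse-pass changes and the TensorRT changes in this PR: a fuse pass that actually fires records a flag on the graph, and downstream passes query the flag instead of re-scanning ops. A minimal sketch of the pattern with hypothetical pass names (the real producers are the embedding_eltwise_layernorm and multihead_matmul fuse passes shown above; the real consumers are skip_layernorm_fuse_pass and tensorrt_subgraph_pass below):

// Producer side: record that the fusion matched at least once.
void SomeProducerFusePass::ApplyImpl(Graph* graph) const {
  int fusion_count = BuildFusion(graph, name_scope_);
  if (fusion_count > 0) {
    // Graph::Set takes ownership of the heap-allocated flag.
    graph->Set(kEmbEltwiseLayernormPass, new bool(true));
  }
  AddStatis(fusion_count);
}

// Consumer side: only act when both producer passes have fired.
void SomeConsumerPass::ApplyImpl(Graph* graph) const {
  if (!graph->Has(kEmbEltwiseLayernormPass) ||
      !graph->Has(kMultiheadMatmulPass)) {
    return;  // not an Ernie/BERT-style graph, nothing to do
  }
  // ... Ernie/BERT-specific rewrite ...
}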
8 changes: 8 additions & 0 deletions paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc
@@ -134,6 +134,14 @@ void SkipLayerNormFusePass::ApplyImpl(ir::Graph *graph) const {
GET_IR_NODE_FROM_SUBGRAPH(layer_norm_variance, layer_norm_variance,
fused_pattern);

// Check whether the graph comes from an Ernie/BERT model.
if (!graph->Has(kEmbEltwiseLayernormPass) ||
!graph->Has(kMultiheadMatmulPass)) {
LOG(INFO) << "The skip_layernorm_fuse_pass is only supported for "
<< "Ernie/BERT models. Skipping this pass.";
return;
}

std::unordered_set<const Node *> del_node_set;

// Create an SkipLayerNorm op node
2 changes: 2 additions & 0 deletions paddle/fluid/framework/ir/skip_layernorm_fuse_pass_tester.cc
@@ -36,6 +36,8 @@ TEST(SkipLayerNormFusePass, basic) {
layers.layer_norm(elementwise_out, scale, bias);

std::unique_ptr<ir::Graph> graph(new ir::Graph(layers.main_program()));
graph->Set(kEmbEltwiseLayernormPass, new bool(true));
graph->Set(kMultiheadMatmulPass, new bool(true));
auto pass = PassRegistry::Instance().Get("skip_layernorm_fuse_pass");
int num_nodes_before = graph->Nodes().size();
VLOG(3) << DebugString(graph);
@@ -117,20 +117,11 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
block_desc.Proto()->set_idx(0);
LOG(INFO) << "--- detect a sub-graph with " << subgraph.size() << " nodes";

bool has_fused_embedding_eltwise_layernorm = false;
bool has_multihead_matmul = false;
for (auto *node : subgraph) {
auto *new_block_op = new_block->AppendOp();
auto *op = block_desc.AppendOp();
*new_block_op->Proto() = *node->Op()->Proto();
*op->Proto() = *node->Op()->Proto();
if (!has_fused_embedding_eltwise_layernorm
&& op->Type() == "fused_embedding_eltwise_layernorm") {
has_fused_embedding_eltwise_layernorm = true;
}
if (!has_multihead_matmul && op->Type() == "multihead_matmul") {
has_multihead_matmul = true;
}
}

// Then, we will use the input_names_with_id and output_names_with_id to
@@ -319,7 +310,8 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
disable_trt_plugin_fp16);
trt_engine->SetUseOSS(Get<bool>("use_oss"));
trt_engine->SetWithErnie(
has_multihead_matmul && has_fused_embedding_eltwise_layernorm);
graph->Has(framework::ir::kEmbEltwiseLayernormPass) &&
graph->Has(framework::ir::kMultiheadMatmulPass));

bool need_serialize = (use_static_engine && !load_from_memory);
if (need_serialize) {
12 changes: 8 additions & 4 deletions paddle/fluid/inference/api/analysis_config.cc
@@ -175,7 +175,13 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {

#undef CP_MEMBER

Update();
// Update();
// Update() would reset all the passes: if a TensorRT pass had been deleted in
// other.pass_builder(), it would be added back, so just copy the passes.
pass_builder_->ClearPasses();
for (const std::string &pass : other.pass_builder()->AllPasses()) {
pass_builder_->AppendPass(pass);
}
}

void AnalysisConfig::EnableCUDNN() {
@@ -281,9 +287,7 @@ void AnalysisConfig::SetTRTDynamicShapeInfo(
disable_trt_plugin_fp16_ = disable_trt_plugin_fp16;
}

void AnalysisConfig::EnableTensorRtOSS() {
trt_use_oss_ = true;
}
void AnalysisConfig::EnableTensorRtOSS() { trt_use_oss_ = true; }

// TODO(Superjomn) refactor this, buggy.
void AnalysisConfig::Update() {
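This copy-constructor change is the DeletePass fix from the PR title: the old trailing Update() rebuilt the default pass list, so a TensorRT pass that the user had removed reappeared whenever the config was copied (predictor creation copies the config internally). A hedged usage sketch of the scenario the fix targets; the model path and the deleted pass name are illustrative only:

AnalysisConfig config;
config.SetModel("./transformer_prune");  // hypothetical model directory
config.EnableUseGpu(100, 0);
config.EnableTensorRtEngine(1 << 30, 1, 12,
                            AnalysisConfig::Precision::kFloat32, false, false);
// Remove a pass that does not apply to this particular model.
config.pass_builder()->DeletePass("skip_layernorm_fuse_pass");

// Before this fix, copying re-ran Update() and silently restored the deleted
// pass; now the pass list is cloned verbatim, so the deletion survives.
AnalysisConfig copied(config);
auto predictor = CreatePaddlePredictor(copied);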
10 changes: 5 additions & 5 deletions paddle/fluid/inference/tensorrt/convert/slice_op.cc
@@ -78,6 +78,7 @@ class SliceOpConverter : public OpConverter {

nvinfer1::ILayer* layer = nullptr;
if (engine_->with_dynamic_shape()) {
#if IS_TRT_VERSION_GE(6000)
if (engine_->use_oss() && engine_->with_ernie()) {
std::vector<nvinfer1::ITensor*> plugin_inputs;
// plugin_inputs.emplace_back(trans_layer->getOutput(0));
@@ -92,17 +93,16 @@
layer = engine_->AddPluginV2(plugin_inputs.data(), plugin_inputs.size(),
plugin);
} else {
#if IS_TRT_VERSION_GE(6000)
bool ban_fp16 = engine_->disable_trt_plugin_fp16();
plugin::SlicePluginDynamic* plugin =
new plugin::SlicePluginDynamic(starts, ends, axes, ban_fp16);
layer = engine_->AddPluginV2(&input, 1, plugin);
}
#else
PADDLE_THROW(platform::errors::Fatal(
"You are running the TRT Dynamic Shape mode, need to confirm that "
"your TRT version is no less than 6.0"));
PADDLE_THROW(platform::errors::Fatal(
"You are running the TRT Dynamic Shape mode, need to confirm that "
"your TRT version is no less than 6.0"));
#endif
}
} else {
bool ban_fp16 = engine_->disable_trt_plugin_fp16();
plugin::SlicePlugin* plugin =
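The slice converter change only relocates the IS_TRT_VERSION_GE(6000) guard so that it wraps the entire dynamic-shape branch, the OSS/Ernie plugin path included, rather than only the generic path as before. A simplified view of the resulting control flow (plugin construction elided):

if (engine_->with_dynamic_shape()) {
#if IS_TRT_VERSION_GE(6000)
  if (engine_->use_oss() && engine_->with_ernie()) {
    // OSS/Ernie path: special slice plugin driven by plugin_inputs
  } else {
    // generic path: SlicePluginDynamic, honoring disable_trt_plugin_fp16
  }
#else
  // Dynamic shape needs TRT >= 6.0 for either path.
  PADDLE_THROW(platform::errors::Fatal(
      "You are running the TRT Dynamic Shape mode, need to confirm that "
      "your TRT version is no less than 6.0"));
#endif
} else {
  // static-shape path: SlicePlugin
}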
9 changes: 9 additions & 0 deletions paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -529,6 +529,15 @@ if(WITH_GPU AND TENSORRT_FOUND)
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
ARGS --infer_model=${TEST_TRT_ERNIE_MODEL}/ernie_model_4)

set(TEST_TRT_TRANSFORMER_PRUNE_MODEL "${TRT_MODEL_INSTALL_DIR}/transformer_prune")
if (NOT EXISTS ${TEST_TRT_TRANSFORMER_PRUNE_MODEL}/transformer_prune.tar.gz)
inference_download_and_uncompress(${TEST_TRT_TRANSFORMER_PRUNE_MODEL} ${INFERENCE_URL}/tensorrt_test "transformer_prune.tar.gz")
endif()

inference_analysis_test(test_trt_dynamic_shape_transformer_prune SRCS trt_dynamic_shape_transformer_prune_test.cc
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
ARGS --infer_model=${TEST_TRT_TRANSFORMER_PRUNE_MODEL}/transformer_prune)

set(TEST_TRT_ERNIE_UNSER_MODEL "${TRT_MODEL_INSTALL_DIR}/ernie_test/ernie_model_4_unserialized/")
if (NOT EXISTS ${TEST_TRT_ERNIE_UNSER_MODEL}/ernie_model_4_unserialized.tgz)
inference_download_and_uncompress(${TEST_TRT_ERNIE_MODEL} ${INFERENCE_URL}/tensorrt_test "ernie_model_4_unserialized.tgz")
@@ -0,0 +1,139 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <gflags/gflags.h>
#include <glog/logging.h>
#include <gtest/gtest.h>

#include "paddle/fluid/inference/tests/api/trt_test_helper.h"

namespace paddle {
namespace inference {

void run(const AnalysisConfig& config, std::vector<float>* out_data) {
auto predictor = CreatePaddlePredictor(config);
auto input_names = predictor->GetInputNames();

int run_batch = 1;
const int run_seq_len = 128;

std::vector<int64_t> tmp_input;
std::vector<float> tmp_four_input;
tmp_input.reserve(run_batch * run_seq_len);
tmp_four_input.reserve(run_batch * run_seq_len);

int64_t i0[run_seq_len] = {
1, 3558, 4, 75, 491, 89, 340, 313, 93, 4, 255, 10, 75, 321,
4095, 1902, 4, 134, 49, 75, 311, 14, 44, 178, 543, 15, 12043, 2,
75, 201, 340, 9, 14, 44, 486, 218, 1140, 279, 12043, 2};
int64_t i1[run_seq_len] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int64_t i2[run_seq_len] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
30, 31, 32, 33, 34, 35, 36, 37, 38, 39};
float i3[run_seq_len] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};

// first input
auto input_t = predictor->GetInputTensor(input_names[0]);
input_t->Reshape({run_batch, run_seq_len, 1});
input_t->copy_from_cpu(i0);

// second input
auto input_t2 = predictor->GetInputTensor(input_names[1]);
input_t2->Reshape({run_batch, run_seq_len, 1});
input_t2->copy_from_cpu(i1);

// third input.
auto input_t3 = predictor->GetInputTensor(input_names[2]);
input_t3->Reshape({run_batch, run_seq_len, 1});
input_t3->copy_from_cpu(i2);

auto input_t4 = predictor->GetInputTensor(input_names[3]);
input_t4->Reshape({run_batch, run_seq_len, 1});
input_t4->copy_from_cpu(i3);

ASSERT_TRUE(predictor->ZeroCopyRun());

auto output_names = predictor->GetOutputNames();
auto output_t = predictor->GetOutputTensor(output_names[0]);
std::vector<int> output_shape = output_t->shape();
int out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1,
std::multiplies<int>());
out_data->resize(out_num);
output_t->copy_to_cpu(out_data->data());
}

void trt_ernie(bool with_fp16, std::vector<float> result) {
AnalysisConfig config;
std::string model_dir = FLAGS_infer_model;
SetConfig(&config, model_dir, true);

config.SwitchUseFeedFetchOps(false);

int batch = 32;
int min_seq_len = 1;
int max_seq_len = 128;
int opt_seq_len = 128;

std::vector<int> min_shape = {1, min_seq_len, 1};
std::vector<int> max_shape = {batch, max_seq_len, 1};
std::vector<int> opt_shape = {batch, opt_seq_len, 1};
// Set the input's min, max, opt shape
std::map<std::string, std::vector<int>> min_input_shape = {
{"read_file_0.tmp_0", min_shape},
{"read_file_0.tmp_1", min_shape},
{"read_file_0.tmp_2", min_shape},
{"read_file_0.tmp_3", min_shape}};
std::map<std::string, std::vector<int>> max_input_shape = {
{"read_file_0.tmp_0", max_shape},
{"read_file_0.tmp_1", max_shape},
{"read_file_0.tmp_2", max_shape},
{"read_file_0.tmp_3", max_shape}};
std::map<std::string, std::vector<int>> opt_input_shape = {
{"read_file_0.tmp_0", opt_shape},
{"read_file_0.tmp_1", opt_shape},
{"read_file_0.tmp_2", opt_shape},
{"read_file_0.tmp_3", opt_shape}};

auto precision = AnalysisConfig::Precision::kFloat32;
if (with_fp16) {
precision = AnalysisConfig::Precision::kHalf;
}
config.EnableTensorRtEngine(1 << 30, 1, 12, precision, false, false);
config.SetTRTDynamicShapeInfo(min_input_shape, max_input_shape,
opt_input_shape);
std::vector<float> out_data;
run(config, &out_data);

for (size_t i = 0; i < out_data.size(); i++) {
EXPECT_NEAR(result[i], out_data[i], 1e-5);
}
}

TEST(AnalysisPredictor, no_fp16) {
std::vector<float> result = {0.498667, 0.501333};
trt_ernie(false, result);
}

} // namespace inference
} // namespace paddle
3 changes: 3 additions & 0 deletions paddle/fluid/operators/fused/CMakeLists.txt
@@ -6,6 +6,7 @@ register_operators(EXCLUDES
fusion_conv_inception_op
fused_fc_elementwise_layernorm_op
multihead_matmul_op
skip_layernorm_op
fused_embedding_eltwise_layernorm_op
fusion_group_op
fusion_gru_op
@@ -40,6 +41,8 @@ if (WITH_GPU)
# multihead_matmul_op
op_library(multihead_matmul_op)
file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(multihead_matmul);\n")
op_library(skip_layernorm_op)
file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(skip_layernorm);\n")
op_library(fused_embedding_eltwise_layernorm_op)
file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(fused_embedding_eltwise_layernorm);\n")
# fusion_group