10 changes: 8 additions & 2 deletions paddle/fluid/framework/ir/remove_padding_recover_padding_pass.cc
@@ -359,6 +359,7 @@ void RemovePaddingRecoverPaddingPass::ApplyImpl(ir::Graph* graph) const {

std::vector<int64_t> skip_layernorm_x_shape =
skip_layernorm_x->Var()->GetShape();
check_flag = true;
if (skip_layernorm_x_shape.size() != multihead_matmul_input_shape.size()) {
check_flag = false;
VLOG(3) << "Transformer model remove_padding shape check failed, return "
@@ -395,6 +396,7 @@ void RemovePaddingRecoverPaddingPass::ApplyImpl(ir::Graph* graph) const {
GET_IR_NODE_FROM_SUBGRAPH(fc_op, fc_op, fc);

std::vector<int64_t> fc_input_shape = fc_input->Var()->GetShape();
check_flag = true;
if ((fc_input_shape.size() != multihead_matmul_input_shape.size()) ||
(fc_input_shape.size() != 3)) {
check_flag = false;
@@ -446,11 +448,13 @@ void RemovePaddingRecoverPaddingPass::ApplyImpl(ir::Graph* graph) const {

std::vector<int64_t> activation_input_shape =
activation_input->Var()->GetShape();
check_flag = true;
if ((activation_input_shape.size() !=
multihead_matmul_input_shape.size()) ||
(activation_input_shape.size() != 3)) {
check_flag = false;
VLOG(3) << "Transformer model remove_padding shape check failed, return "
VLOG(3) << "Activation: Transformer model remove_padding "
"shape(activation_input_shape.size()) check failed, return "
"remove_padding pass.";
return;
}
@@ -465,7 +469,8 @@ void RemovePaddingRecoverPaddingPass::ApplyImpl(ir::Graph* graph) const {
check_flag = false;
}
if (!check_flag) {
VLOG(3) << "Transformer model remove_padding shape check failed, return "
VLOG(3) << "Activation: Transformer model remove_padding "
"shape(activation_input_shape[i]) check failed, return "
"remove_padding pass.";
return;
}
@@ -530,6 +535,7 @@ void RemovePaddingRecoverPaddingPass::ApplyImpl(ir::Graph* graph) const {

std::vector<int64_t> skip_layernorm_x_shape =
preln_skip_layernorm_x->Var()->GetShape();
check_flag = true;
if (skip_layernorm_x_shape.size() != multihead_matmul_input_shape.size()) {
check_flag = false;
VLOG(3) << "Transformer model remove_padding shape check failed, return "
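The hunks above all apply the same fix: check_flag is shared across the fused patterns, so it is re-initialized to true before each pattern's shape check instead of inheriting a stale false from an earlier subgraph. A minimal sketch of the pattern, using a hypothetical helper rather than the pass's own code; the names and the exact per-dim rule are assumptions.

#include <cstdint>
#include <vector>

// Illustrative helper only: the flag is reset at the start of every check,
// mirroring the `check_flag = true;` lines added in each handler above.
bool ShapesCompatible(const std::vector<int64_t>& lhs,
                      const std::vector<int64_t>& rhs) {
  bool check_flag = true;  // reset so an earlier pattern's failure cannot leak
  if (lhs.size() != rhs.size()) {
    check_flag = false;  // rank mismatch fails this pattern only
  }
  for (size_t i = 1; check_flag && i < lhs.size(); ++i) {
    if (lhs[i] != rhs[i]) {
      check_flag = false;  // non-batch dims must agree
    }
  }
  return check_flag;  // caller returns early from the pass when false
}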
60 changes: 46 additions & 14 deletions paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc
@@ -60,6 +60,50 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter {
std::vector<std::string>{word_id_name, pos_id_name, sent_id_name};
emb_names =
std::vector<std::string>{word_emb_name, pos_emb_name, sent_emb_name};

auto mask_id_tensor = engine_->GetITensor("mask_id");
auto mask_dims = mask_id_tensor->getDimensions();
auto slice_start_dims = mask_dims;
auto slice_stride_dims = mask_dims;

for (int i = 0; i < mask_dims.nbDims; i++) {
slice_start_dims.d[i] = 0;
slice_stride_dims.d[i] = 1;
}

auto* shape_tensor = Shape(mask_id_tensor);
std::vector<nvinfer1::ITensor*> size_vec_tensor;
for (int i = 0; i < mask_dims.nbDims; i++) {
size_vec_tensor.push_back(Add1DConstantLayer(1));
}
size_vec_tensor[1] = GetEleTensorOfShape(shape_tensor, 1);
auto size_tensor = Concat(size_vec_tensor);

auto slice_layer =
TRT_ENGINE_ADD_LAYER(engine_,
Slice,
*mask_id_tensor,
slice_start_dims,
slice_start_dims,
slice_stride_dims);  // the size argument (slice_start_dims) is a placeholder;
                     // the real size is set below via setInput(2, ...)
slice_layer->setInput(2, *size_tensor);
slice_layer->setName(
("Embeltwise_slice_layer (Output: slice_max_seqlen " +
op_desc.Output("Out")[0] + ")")
.c_str());
engine_->SetTensorDynamicRange(slice_layer->getOutput(0), 1.0f);

auto* reshape_layer =
TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *slice_layer->getOutput(0));
nvinfer1::Dims shape_dim;
shape_dim.nbDims = 1;
shape_dim.d[0] = -1;
reshape_layer->setReshapeDimensions(shape_dim);
reshape_layer->setName(("Embeltwise_reshape_layer (Output: max_seqlen " +
op_desc.Output("Out")[0] + ")")
.c_str());
engine_->SetTensorDynamicRange(reshape_layer->getOutput(0), 1.0f);
engine_->SetITensor("max_seqlen_tensor", reshape_layer->getOutput(0));
} else {
id_names = op_desc.Input("Ids");
emb_names = op_desc.Input("Embs");
@@ -192,20 +236,8 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter {
plugin_inputs.emplace_back(
engine_->GetITensor(pos_id_name)); // cu_seqlens,
// eval_placeholder_2
auto max_seqlen_tensor = engine_->GetITensor(mask_id_name);
auto* shuffle_layer =
TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *max_seqlen_tensor);
nvinfer1::Dims shape_dim;
shape_dim.nbDims = 1;
shape_dim.d[0] = -1;
shuffle_layer->setReshapeDimensions(shape_dim);
shuffle_layer->setName(
("Embeltwise_Shuffle_reshape (Output: max_seqlen " +
op_desc.Output("Out")[0] + ")")
.c_str());
engine_->SetTensorDynamicRange(shuffle_layer->getOutput(0), 1.0f);
plugin_inputs.emplace_back(
shuffle_layer->getOutput(0)); // max_seqlen, eval_placeholder_3
plugin_inputs.emplace_back(engine_->GetITensor(
"max_seqlen_tensor")); // max_seqlen, eval_placeholder_3

auto creator = GetPluginRegistry()->getPluginCreator(
"CustomEmbLayerNormPluginDynamic", "2");
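The block added above derives the 1-D max-seqlen placeholder from "mask_id" once and registers it as "max_seqlen_tensor", so downstream converters can look it up instead of each building their own Shuffle. A rough sketch of the same construction against the plain TensorRT API; the free function, the network handle, and the static slice size are assumptions made for illustration, whereas the converter supplies the size as a runtime tensor through setInput(2, ...).

#include <NvInfer.h>

// Hypothetical helper: mask_id is the [batch, max_seqlen] mask tensor.
nvinfer1::ITensor* BuildMaxSeqlenTensor(nvinfer1::INetworkDefinition* network,
                                        nvinfer1::ITensor* mask_id) {
  nvinfer1::Dims dims = mask_id->getDimensions();
  nvinfer1::Dims start = dims, size = dims, stride = dims;
  for (int i = 0; i < dims.nbDims; ++i) {
    start.d[i] = 0;   // begin at the origin on every axis
    size.d[i] = 1;    // keep one element per axis ...
    stride.d[i] = 1;
  }
  if (dims.nbDims >= 2) {
    size.d[1] = dims.d[1];  // ... except the sequence axis (static sizes only;
                            // the converter overrides this via setInput(2, ...))
  }
  auto* slice = network->addSlice(*mask_id, start, size, stride);

  // Flatten [1, max_seqlen] into a 1-D tensor of length max_seqlen, the layout
  // the varseqlen plugins expect for their max-seqlen placeholder input.
  auto* shuffle = network->addShuffle(*slice->getOutput(0));
  nvinfer1::Dims flat;
  flat.nbDims = 1;
  flat.d[0] = -1;
  shuffle->setReshapeDimensions(flat);
  return shuffle->getOutput(0);  // registered by the converter as "max_seqlen_tensor"
}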
103 changes: 84 additions & 19 deletions paddle/fluid/inference/tensorrt/convert/fc_op.cc
@@ -1,11 +1,8 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -333,23 +330,91 @@ class FcOpConverter : public OpConverter {
if (!engine_->with_dynamic_shape()) {
x_num_col_dims--;
}
PADDLE_ENFORCE_GT(
x_dim.nbDims,
x_num_col_dims,
platform::errors::InvalidArgument(
"Params and input dims mismatch. Paddle-TRT FC "
"converter expects x_dim.nbDims > x_num_col_dims, but "
"x_dim.nbDims : %d, x_num_col_dims : %d.",
x_dim.nbDims,
x_num_col_dims));
// need reshape input before and after fc
auto* reshape_before_fc_layer =
reshape_before_fc(X, x_dim, x_num_col_dims, output_name);
auto* reshape_itensor = reshape_before_fc_layer->getOutput(0);
if (enable_int8 || support_int8) {
engine_->SetTensorDynamicRange(reshape_itensor, in_scale);
// When TensorRT OSS (varseqlen) is used, x_dim and x_num_col_dims need to
// change, and a Shuffle layer must not be added in Ernie's multihead.
if (x_dim.nbDims == 4 && x_num_col_dims == 1) {
if (enable_int8 || support_int8) {
// add conv1x1 layer
nvinfer1::DimsHW nv_ksize(1, 1);
auto* fc_layer_int8 = TRT_ENGINE_ADD_LAYER(engine_,
Convolution,
*X,
n_output,
nv_ksize,
weight.get(),
bias.get());
if (activation_type == "relu") {
fc_layer_int8->setName(
("ernie_fc_op_int8: Convolution (Output: " + output_name + ")")
.c_str());
PADDLE_ENFORCE_EQ(
op_desc.HasAttr("out_threshold"),
true,
platform::errors::InvalidArgument(
"must have out threshold in fc layers in int8 mode"));
float out_scale = 0;
if (enable_int8) {
out_scale =
PADDLE_GET_CONST(float, op_desc.GetAttr("out_threshold"));
} else {
out_scale = PADDLE_GET_CONST(float, op_desc.GetAttr("Out"));
}
engine_->SetTensorDynamicRange(fc_layer_int8->getOutput(0),
out_scale);
nvinfer1::IActivationLayer* relu_layer_int8 =
TRT_ENGINE_ADD_LAYER(engine_,
Activation,
*(fc_layer_int8->getOutput(0)),
nvinfer1::ActivationType::kRELU);
RreplenishLayerAndOutput(relu_layer_int8,
"relu_after_ernie_fc_int8",
{output_name},
test_mode);
} else {
RreplenishLayerAndOutput(fc_layer_int8,
"ernie_fc_op_int8: Convolution",
{output_name},
test_mode);
}
} else {
// add fc layer
auto* fc_layer_float = TRT_ENGINE_ADD_LAYER(
engine_, FullyConnected, *X, n_output, weight.get(), bias.get());
if (activation_type == "relu") {
fc_layer_float->setName(
("ernie_fc_op_float: (Output: " + output_name + ")").c_str());
nvinfer1::IActivationLayer* relu_layer_float =
TRT_ENGINE_ADD_LAYER(engine_,
Activation,
*(fc_layer_float->getOutput(0)),
nvinfer1::ActivationType::kRELU);
RreplenishLayerAndOutput(relu_layer_float,
"relu_after_ernie_fc_float",
{output_name},
test_mode);
} else {
RreplenishLayerAndOutput(
fc_layer_float, "ernie_fc_op_float", {output_name}, test_mode);
}
}
} else { // need reshape input before and after fc
PADDLE_ENFORCE_GT(
x_dim.nbDims,
x_num_col_dims,
platform::errors::InvalidArgument(
"Params and input dims mismatch. Paddle-TRT FC "
"converter expects x_dim.nbDims > x_num_col_dims, but "
"x_dim.nbDims : %d, x_num_col_dims : %d.",
x_dim.nbDims,
x_num_col_dims));
auto* reshape_before_fc_layer =
reshape_before_fc(X, x_dim, x_num_col_dims, output_name);
auto* reshape_itensor = reshape_before_fc_layer->getOutput(0);
if (enable_int8 || support_int8) {
engine_->SetTensorDynamicRange(reshape_itensor, in_scale);
}
regist_fc(reshape_itensor, n_output, weight, bias);
}
regist_fc(reshape_itensor, n_output, weight, bias);
}
};

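The new branch keeps Ernie's varseqlen path free of extra Shuffle layers: when the FC input is already 4-D with x_num_col_dims == 1, the converter emits a 1x1 convolution (int8) or a FullyConnected layer (float) directly on X; every other shape still goes through the reshape-before/after-FC path. A hedged sketch of that dispatch on the raw TensorRT API; the function, its parameters, and the int8_path switch are illustrative, not the converter's interface.

#include <NvInfer.h>

nvinfer1::ILayer* AddErnieFc(nvinfer1::INetworkDefinition* network,
                             nvinfer1::ITensor* x,
                             int n_output,
                             nvinfer1::Weights weight,
                             nvinfer1::Weights bias,
                             bool int8_path) {
  const nvinfer1::Dims x_dim = x->getDimensions();
  if (x_dim.nbDims == 4) {  // varseqlen Ernie path, x_num_col_dims == 1
    if (int8_path) {
      // A 1x1 convolution over the 4-D input is arithmetically the same FC
      // and keeps TensorRT's int8 convolution kernels usable.
      nvinfer1::DimsHW ksize(1, 1);
      return network->addConvolutionNd(*x, n_output, ksize, weight, bias);
    }
    return network->addFullyConnected(*x, n_output, weight, bias);
  }
  // Other shapes: the original reshape-before/after-FC path still applies.
  return nullptr;
}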
51 changes: 37 additions & 14 deletions paddle/fluid/inference/tensorrt/convert/fused_token_prune_op.cc
@@ -23,7 +23,6 @@ class FusedTokenPruneOpConverter : public OpConverter {
bool test_mode) override {
framework::OpDesc op_desc(op, nullptr);
nvinfer1::ILayer* layer = nullptr;

auto* Attn = engine_->GetITensor(op_desc.Input("Attn").front());
auto* X = engine_->GetITensor(op_desc.Input("X").front());
auto* Mask = engine_->GetITensor(op_desc.Input("Mask").front());
@@ -36,37 +35,61 @@ class FusedTokenPruneOpConverter : public OpConverter {
op_desc.HasAttr("keep_order")
? PADDLE_GET_CONST(bool, op_desc.GetAttr("keep_order"))
: false;

std::vector<nvinfer1::ITensor*> itensors = {Attn, X, Mask, NewMask};

auto output_name = op_desc.Output("SlimmedX")[0];
auto out_inds_name = op_desc.Output("CLSInds")[0];
if (engine_->with_dynamic_shape()) {
#if IS_TRT_VERSION_GE(6000)
bool with_fp16 =
engine_->WithFp16() && !engine_->disable_trt_plugin_fp16();

if (engine_->precision() == AnalysisConfig::Precision::kInt8) {
with_fp16 = true;
}
bool flag_varseqlen = engine_->use_varseqlen();
plugin::FusedTokenPrunePluginDynamic* plugin =
new plugin::FusedTokenPrunePluginDynamic(
with_fp16, keep_first_token, keep_order);
layer = engine_->AddDynamicPlugin(itensors.data(), 4, plugin);
#else
PADDLE_THROW(platform::errors::Fatal(
"You are running the TRT Dynamic Shape mode, need to confirm that "
"your TRT version is no less than 6.0"));
#endif
with_fp16, keep_first_token, keep_order, flag_varseqlen);
if (flag_varseqlen) {
auto* word_id = engine_->GetITensor("word_id");
auto* pos_id = engine_->GetITensor("pos_id");
auto* mask_id = engine_->GetITensor("mask_id");
std::vector<nvinfer1::ITensor*> itensors = {
Attn, X, Mask, NewMask, word_id, pos_id, mask_id};
layer = engine_->AddDynamicPlugin(itensors.data(), 7, plugin);

layer->getOutput(0)->setName(output_name.c_str());
engine_->SetITensor(output_name, layer->getOutput(0));

layer->getOutput(1)->setName(out_inds_name.c_str());
engine_->SetITensor(out_inds_name, layer->getOutput(1));

engine_->DeleteITensor("word_id", word_id);
layer->getOutput(2)->setName("word_id_after_token_prune");
engine_->SetITensor("word_id", layer->getOutput(2));

engine_->DeleteITensor("pos_id", pos_id);
layer->getOutput(3)->setName("pos_id_after_token_prune");
engine_->SetITensor("pos_id", layer->getOutput(3));

engine_->DeleteITensor("mask_id", mask_id);
layer->getOutput(4)->setName("mask_id_after_token_prune");
engine_->SetITensor("mask_id", layer->getOutput(4));
} else {
std::vector<nvinfer1::ITensor*> itensors = {Attn, X, Mask, NewMask};
layer = engine_->AddDynamicPlugin(itensors.data(), 4, plugin);
layer->getOutput(0)->setName(output_name.c_str());
engine_->SetITensor(output_name, layer->getOutput(0));
layer->getOutput(1)->setName(out_inds_name.c_str());
engine_->SetITensor(out_inds_name, layer->getOutput(1));
}
layer->setName(
("fused_token_prune(Output: " + output_name + ")").c_str());
} else {
PADDLE_THROW(platform::errors::Fatal(
"You are running the Ernie(Bert) model in static shape mode, which "
"is not supported for the time being.\n"
"You can use the config.SetTRTDynamicShapeInfo(...) interface to set "
"the shape information to run the dynamic shape mode."));
}
RreplenishLayerAndOutput(
layer, "fused_token_prune", {output_name, out_inds_name}, test_mode);
}
};

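With varseqlen enabled, the plugin above now takes seven inputs and returns pruned word_id, pos_id, and mask_id tensors as outputs 2 to 4; those outputs replace the engine's registered tensors of the same names so every later converter sees the shortened sequence. A small sketch of that rebinding, with a plain map standing in for the engine's SetITensor / DeleteITensor bookkeeping (the function and the map are illustrative only).

#include <NvInfer.h>

#include <map>
#include <string>

void RebindAfterPrune(std::map<std::string, nvinfer1::ITensor*>* itensor_map,
                      nvinfer1::ILayer* prune_layer) {
  const char* names[3] = {"word_id", "pos_id", "mask_id"};
  for (int i = 0; i < 3; ++i) {
    // Outputs 2..4 of the prune plugin are the pruned id/mask tensors.
    nvinfer1::ITensor* pruned = prune_layer->getOutput(2 + i);
    pruned->setName((std::string(names[i]) + "_after_token_prune").c_str());
    (*itensor_map)[names[i]] = pruned;  // the pre-prune tensor is dropped
  }
}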
43 changes: 6 additions & 37 deletions paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc
@@ -94,6 +94,8 @@ class MultiheadMatMulOpConverter : public OpConverter {
nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT,
static_cast<void*>(bias_data),
static_cast<int32_t>(bias_t->numel())};
auto max_seqlen_tensor = engine_->GetITensor("max_seqlen_tensor");
auto pos_id_tensor = engine_->GetITensor("pos_id");
if (engine_->with_interleaved()) {
VLOG(4) << "fused multihead_matmul op: use_varseqlen and "
"with_interleaved";
@@ -154,31 +156,9 @@

std::vector<nvinfer1::ITensor*> plugin_inputs;
plugin_inputs.emplace_back(fc_layer->getOutput(0));
if (engine_->Has("ernie_pos_name")) {
plugin_inputs.emplace_back(engine_->GetITensor(
engine_->Get<std::string>("ernie_pos_name")));
} else {
plugin_inputs.emplace_back(engine_->GetITensor(
engine_->network()
->getInput(2)
->getName())); // cu_seqlens, eval_placeholder_2
}
auto max_seqlen_tensor =
engine_->GetITensor(engine_->network()->getInput(3)->getName());
engine_->SetTensorDynamicRange(max_seqlen_tensor, 1.0f);
auto* shuffle_layer = TRT_ENGINE_ADD_LAYER(
engine_,
Shuffle,
*const_cast<nvinfer1::ITensor*>(max_seqlen_tensor));
nvinfer1::Dims shape_dim;
shape_dim.nbDims = 1;
shape_dim.d[0] = -1;
shuffle_layer->setReshapeDimensions(shape_dim);
engine_->SetTensorDynamicRange(shuffle_layer->getOutput(0), 1.0f);
plugin_inputs.emplace_back(pos_id_tensor);
plugin_inputs.emplace_back(
shuffle_layer->getOutput(0)); // max_seqlen, eval_placeholder_3
shuffle_layer->setName(
("Multihead: Shuffle: (Output: " + output_name + ")").c_str());
max_seqlen_tensor); // max_seqlen, eval_placeholder_3
auto plugin_layer = engine_->network()->addPluginV2(
plugin_inputs.data(), plugin_inputs.size(), *plugin);
layer = plugin_layer;
@@ -299,20 +279,9 @@
std::vector<nvinfer1::ITensor*> plugin_inputs;
plugin_inputs.emplace_back(fc_layer->getOutput(0));
plugin_inputs.emplace_back(engine_->GetITensor("qkv_plugin_mask"));
plugin_inputs.emplace_back(engine_->GetITensor("pos_id"));

auto max_seqlen_tensor = engine_->GetITensor("mask_id");
auto* shuffle_layer = TRT_ENGINE_ADD_LAYER(
engine_,
Shuffle,
*const_cast<nvinfer1::ITensor*>(max_seqlen_tensor));
nvinfer1::Dims shape_dim;
shape_dim.nbDims = 1;
shape_dim.d[0] = -1;
shuffle_layer->setReshapeDimensions(shape_dim);
engine_->SetTensorDynamicRange(shuffle_layer->getOutput(0), 1.0f);
plugin_inputs.emplace_back(pos_id_tensor);
plugin_inputs.emplace_back(
shuffle_layer->getOutput(0)); // max_seqlen, eval_placeholder_3
max_seqlen_tensor); // max_seqlen, eval_placeholder_3

auto plugin_layer = engine_->network()->addPluginV2(
plugin_inputs.data(), plugin_inputs.size(), *plugin);
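With "pos_id" and "max_seqlen_tensor" registered once by the embedding converter, the multihead converter no longer rebuilds a Shuffle over mask_id for every attention block; it simply looks the cached tensors up. A sketch of the resulting plugin-input assembly for the interleaved path; the lookup callback and the function name are assumptions, standing in for engine_->GetITensor.

#include <NvInfer.h>

#include <functional>
#include <string>
#include <vector>

std::vector<nvinfer1::ITensor*> BuildQkvPluginInputs(
    nvinfer1::ITensor* qkv_fc_out,
    const std::function<nvinfer1::ITensor*(const std::string&)>& get_cached) {
  std::vector<nvinfer1::ITensor*> inputs;
  inputs.push_back(qkv_fc_out);                        // fused QKV projection
  inputs.push_back(get_cached("pos_id"));              // cu_seqlens, eval_placeholder_2
  inputs.push_back(get_cached("max_seqlen_tensor"));   // max_seqlen, eval_placeholder_3
  return inputs;
}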