diff --git a/paddle/fluid/pir/transforms/tensorrt/trt_op_marker_pass.cc b/paddle/fluid/pir/transforms/tensorrt/trt_op_marker_pass.cc
index c5aec4ee6883f..96de3d057d87d 100644
--- a/paddle/fluid/pir/transforms/tensorrt/trt_op_marker_pass.cc
+++ b/paddle/fluid/pir/transforms/tensorrt/trt_op_marker_pass.cc
@@ -55,6 +55,7 @@ DEFINE_GENERAL_PATTERN(Reshape, paddle::dialect::ReshapeOp)
 DEFINE_GENERAL_PATTERN(Dropout, paddle::dialect::DropoutOp)
 DEFINE_GENERAL_PATTERN(Bmm, paddle::dialect::BmmOp)
 DEFINE_GENERAL_PATTERN(Concat, paddle::dialect::ConcatOp)
+DEFINE_GENERAL_PATTERN(Nonzero, paddle::dialect::NonzeroOp)
 DEFINE_GENERAL_PATTERN(Fused_gemm_epilogue,
                        paddle::dialect::FusedGemmEpilogueOp)
@@ -751,6 +752,7 @@ class SplitOpPattern : public pir::OpRewritePattern {
       output_lengths.push_back(attr.dyn_cast().data());
     }
     axis += (axis < 0) ? x_shape.size() : 0;
+
     if (x_shape[axis] == -1) {
       VLOG(3) << "The (" << axis << ") dim of input should not be -1";
       return false;
@@ -794,6 +796,7 @@ class TrtOpMarkerPass : public pir::PatternRewritePass {
     ADD_PATTERN(Conv2d)
     ADD_PATTERN(FusedConv2dAddAct)
     ADD_PATTERN(DepthwiseConv2d)
+    ADD_PATTERN(Nonzero)
 #undef ADD_PATTERN
     ps.Add(std::make_unique(context));
diff --git a/paddle/fluid/pir/transforms/trt_sub_graph_extract_pass.cc b/paddle/fluid/pir/transforms/trt_sub_graph_extract_pass.cc
index 51bd5fe05f002..8fce769bd213d 100644
--- a/paddle/fluid/pir/transforms/trt_sub_graph_extract_pass.cc
+++ b/paddle/fluid/pir/transforms/trt_sub_graph_extract_pass.cc
@@ -1,4 +1,4 @@
-// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -20,9 +20,9 @@
 #include 
 #include 
-#include "paddle/common/flags.h"
 #include "paddle/cinn/hlir/dialect/operator/ir/manual_op.h"
 #include "paddle/cinn/hlir/dialect/operator/ir/op_dialect.h"
+#include "paddle/common/flags.h"
 #include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h"
 #include "paddle/fluid/pir/dialect/operator/ir/pd_op.h"
 #include "paddle/pir/include/core/builder.h"
@@ -41,7 +41,8 @@ using GroupOpsVec = std::vector;
 
 bool IsSupportedByTRT(const pir::Operation& op) {
   if (op.HasAttribute(paddle::dialect::kCanRunTrtAttr) &&
-      op.attribute(paddle::dialect::kCanRunTrtAttr).data()) {
+      op.attribute(paddle::dialect::kCanRunTrtAttr)
+          .data()) {
     return true;
   }
   return false;
@@ -49,8 +50,7 @@
 
 class TrtSubGraphExtractPass : public pir::Pass {
  public:
-  TrtSubGraphExtractPass()
-      : pir::Pass("trt_sub_graph_extract_pass", 1) {}
+  TrtSubGraphExtractPass() : pir::Pass("trt_sub_graph_extract_pass", 1) {}
 
   void Run(pir::Operation* op) override {
     auto module_op = op->dyn_cast();
@@ -64,11 +64,13 @@ class TrtSubGraphExtractPass : public pir::Pass {
     ::pir::SubgraphDetector(&block, IsSupportedByTRT)();
     AddStatistics(groups.size());
     for (auto& group_ops : groups) {
-      if(group_ops.size() < FLAGS_trt_min_group_size) {
-        VLOG(4) << "current group_ops.size(): " << group_ops.size() << ", will fallback to paddle original graph";
+      if (group_ops.size() < FLAGS_trt_min_group_size) {
+        VLOG(4) << "current group_ops.size(): " << group_ops.size()
+                << ", will fallback to paddle original graph";
         continue;
       }
-      VLOG(4) << "current group_ops.size(): " << group_ops.size() << ", will lower to TensorRT graph";
+      VLOG(4) << "current group_ops.size(): " << group_ops.size()
+              << ", will lower to TensorRT graph";
       ::pir::ReplaceWithGroupOp(&block, group_ops);
     }
   }
diff --git a/python/paddle/pp_tensorrt/impls/core.py b/python/paddle/pp_tensorrt/impls/core.py
index 5b28748a8030e..a78691f40b57d 100644
--- a/python/paddle/pp_tensorrt/impls/core.py
+++ b/python/paddle/pp_tensorrt/impls/core.py
@@ -215,9 +215,6 @@ def layernorm_converter(network, paddle_op, inputs):
         f"{bias_tensor.name}_broadcast",
         len(input_a.shape) - len(bias_tensor.shape),
     )
-    # _logger.info(
-    #     f"!!! layernorm, {input_a.shape}, {scale_tensor.shape}, {bias_tensor.shape}"
-    # )
 
     layer_norm = network.add_normalization(
         input_a, scale_tensor, bias_tensor, axes
@@ -251,21 +248,25 @@ def conv2d_converter(network, paddle_op, inputs):
 
     return conv_layer
 
+
 @converter_registry.register("pd_op.nonzero", trt_version="8.x")
 def non_zero_converter(network, paddle_op, inputs):
     input_tensor = inputs[0]
     cast_layer = network.add_cast(input_tensor, trt.float32)
     non_zero_layer = network.add_non_zero(cast_layer.get_output(0))
-
+
     return non_zero_layer
 
+
 @converter_registry.register("pd_op.gather_nd", trt_version="8.x")
 def gather_nd_converter(network, paddle_op, inputs):
     input_tensor, indices_tensor = inputs
     shuffle_layer = network.add_shuffle(indices_tensor)
     shuffle_layer.first_transpose = trt.Permutation([1, 0])
     # import pdb;pdb.set_trace()
-    non_zero_layer = network.add_gather_v2(input_tensor, shuffle_layer.get_output(0), trt.GatherMode.ND)
+    non_zero_layer = network.add_gather_v2(
+        input_tensor, shuffle_layer.get_output(0), trt.GatherMode.ND
+    )
     return non_zero_layer
@@ -391,16 +392,6 @@ def batch_norm_converter(network, paddle_op, inputs):
     return batch_norm_layer
 
 
-@converter_registry.register("pd_op.full")
-def full_converter(network, paddle_op, inputs):
-    shape = paddle_op.attrs()["shape"]
-    value = paddle_op.attrs().get("value", 1.0)  # default value is 1.0
-    full_tensor = network.add_constant(
-        shape, np.full(shape, value, dtype=np.float32)
-    )
-    return full_tensor
-
-
 @converter_registry.register("pd_op.flatten", trt_version="8.x")
 def flatten_converter(network, paddle_op, inputs):
     input_val = inputs[0]
@@ -483,3 +474,14 @@ def flatten_converter(network, paddle_op, inputs):
         flatten_layer.set_input(1, final_shape_layer.get_output(0))
 
     return flatten_layer
+
+
+@converter_registry.register("pd_op.concat")
+def concat_converter(network, paddle_op, inputs):
+    input_tensors, axis = inputs
+    concat_layer = network.add_concatenation(inputs=input_tensors)
+    if axis < 0:
+        axis = len(input_tensors[0].shape) + axis
+
+    concat_layer.axis = axis
+    return concat_layer
diff --git a/python/paddle/pp_tensorrt/test_converter.py b/python/paddle/pp_tensorrt/test_converter.py
index 3ffee8db6c040..73929b498adca 100644
--- a/python/paddle/pp_tensorrt/test_converter.py
+++ b/python/paddle/pp_tensorrt/test_converter.py
@@ -16,9 +16,11 @@
 from converter import PaddleToTensorRTConverter
 from util import (
     enforce_op_lower_trt,
+    forbid_op_lower_trt,
     get_bert_program,
     get_dummy_program,
     get_idg_program,
+    get_mlp_program,
     get_r50_program,
     predict_program,
     run_pir_pass,
@@ -212,12 +214,6 @@ def test_paddle_to_tensorrt_conversion_idg():
 
     # Step3: run pir pass(including some fusion pass and trt_op_marker_pass)
     program = run_pir_pass(program, partition_mode=False)
-    enforce_op_lower_trt(program, "pd_op.gather_nd")
-    enforce_op_lower_trt(program, "pd_op.nonzero")
-    # enforce_op_lower_trt(program, "pd_op.pool2d")
-    # enforce_op_lower_trt(program, "pd_op.batch_norm_")
-    # enforce_op_lower_trt(program, "pd_op.flatten")
-    # forbid_op_lower_trt(program, "pd_op.flatten")
 
     # Step4: run trt_sub_graph_extract_pass()
     program_with_pir = run_pir_pass(program, partition_mode=True)
@@ -226,6 +222,7 @@ def test_paddle_to_tensorrt_conversion_idg():
     converter = PaddleToTensorRTConverter(program_with_pir, scope)
     converter.convert_program_to_trt()
 
+    output_var = program_with_pir.list_vars()[-1]
     # Step6: run inference(converted_program)
     output_converted = predict_program(
         program_with_pir,
@@ -249,8 +246,57 @@ def test_paddle_to_tensorrt_conversion_idg():
print("output_converted", output_converted) +def test_paddle_to_tensorrt_conversion_mlp(): + program, scope, param_dict = get_mlp_program() + input_data_min_shape = np.random.randn(1, 512, 1024).astype('float32') + input_data_max_shape = np.random.randn(2, 512, 1024).astype('float32') + + # Step1.1: get original results(for tests only) + output_var = program.list_vars()[-1] + output_expected = predict_program( + program, {"input": input_data_min_shape}, [output_var] + ) + + # Step2: run warmup for collecting shape + warmup_shape_infer( + program, + min_shape_feed={"input": input_data_min_shape}, + max_shape_feed={"input": input_data_max_shape}, + ) + + # Step3: run pir pass(including some fusion pass and trt_op_marker_pass) + program = run_pir_pass(program, partition_mode=False) + # forbid_op_lower_trt(program,"pd_op.concat") + + # Step4: run trt_sub_graph_extract_pass() + program_with_pir = run_pir_pass(program, partition_mode=True) + + # Step5: run TRTConverter(would lower group_op into tensorrt_engine_op) + converter = PaddleToTensorRTConverter(program_with_pir, scope) + converter.convert_program_to_trt() + + output_var = program_with_pir.list_vars()[-1] + # Step6: run inference(converted_program) + output_converted = predict_program( + program_with_pir, {"input": input_data_min_shape}, [output_var] + ) + + # Check that the results are close to each other within a tolerance of 1e-3 + np.testing.assert_allclose( + output_expected[0], + output_converted[0], + rtol=1e-3, + atol=1e-3, + err_msg="Outputs are not within the 1e-3 tolerance", + ) + + print("output_expected", output_expected) + print("output_converted", output_converted) + + if __name__ == "__main__": # test_paddle_to_tensorrt_conversion_dummy() # test_paddle_to_tensorrt_conversion_bert() # test_paddle_to_tensorrt_conversion_r50() - test_paddle_to_tensorrt_conversion_idg() + # test_paddle_to_tensorrt_conversion_idg() + test_paddle_to_tensorrt_conversion_mlp() diff --git a/python/paddle/pp_tensorrt/util.py b/python/paddle/pp_tensorrt/util.py index f527ab67bc463..a378319a3afc6 100644 --- a/python/paddle/pp_tensorrt/util.py +++ b/python/paddle/pp_tensorrt/util.py @@ -15,6 +15,7 @@ import numpy as np import paddle +import paddle.nn.functional as F try: import tensorrt as trt @@ -93,7 +94,6 @@ def run_pir_pass(program, partition_mode=False): return program - def forbid_op_lower_trt(program, op_name): for op in program.global_block().ops: if op.name() == op_name: @@ -234,7 +234,6 @@ def forward(self, input_ids): return encoded_output - def get_bert_program(): paddle.enable_static() @@ -303,40 +302,42 @@ def get_bert_program(): ) print(fetches) + class SimpleGatherNet(nn.Layer): def __init__(self): super(SimpleGatherNet, self).__init__() self.linear = paddle.nn.Linear(149600, 1) pass # self.fake_param = nn.Parameter(torch.tensor([1.])) - + def forward(self, map_vector_features, polyline_mask): map_vector_features = map_vector_features[polyline_mask] # num_element = map_vector_features.shape[0] - + # center_inds_sort = paddle.arange(num_element) - + # center_ind = int(num_points_per_element // 2) - + # center_coords = map_vector_features[:, center_ind, :2] # center_radius = paddle.norm(center_coords, axis=-1, p=2) - + # center_inds_sort = paddle.argsort(center_radius) # center_inds_sort = center_inds_sort[:num_max_element] - + # num_element = center_inds_sort.shape[0] - + # map_vector_features_out = paddle.zeros( # [num_max_element, 11, 17], dtype=paddle.float32) - + # print(center_inds_sort) # print(center_inds_sort.shape) # 
         # map_vector_features_out[:num_element] = map_vector_features[center_inds_sort]
         # map_vector_features_out = paddle.flatten(map_vector_features_out)
         # map_vector_features_out = self.linear(map_vector_features_out)
-
+
         return map_vector_features
-
+
+
 def get_idg_program():
     with paddle.pir_utils.IrGuard():
         main_program = static.default_main_program()
@@ -344,7 +345,9 @@ def get_idg_program():
         with static.program_guard(main_program, startup_program):
             scope = paddle.static.global_scope()
             map_vector_features = static.data(
-                name='map_vector_features', shape=[1, 1400, 11, 17], dtype='float32'
+                name='map_vector_features',
+                shape=[1, 1400, 11, 17],
+                dtype='float32',
             )
             polyline_mask = static.data(
                 name='polyline_mask', shape=[1, 1400], dtype='bool'
@@ -361,13 +364,20 @@ def get_idg_program():
 
     with paddle.pir_utils.IrGuard():
         with paddle.static.program_guard(pir_program, startup_program):
-            map_vector_features_data = np.random.rand(1, 1400, 11, 17).astype('float32')
-            polyline_mask_data = np.random.randint(0, 2, size=(1, 1400)).astype('bool')
+            map_vector_features_data = np.random.rand(1, 1400, 11, 17).astype(
+                'float32'
+            )
+            polyline_mask_data = np.random.randint(0, 2, size=(1, 1400)).astype(
+                'bool'
+            )
             executor = paddle.static.Executor(place)
             executor.run(startup_program)
             fetches = executor.run(
                 pir_program,
-                feed={"map_vector_features": map_vector_features_data, "polyline_mask": polyline_mask_data},
+                feed={
+                    "map_vector_features": map_vector_features_data,
+                    "polyline_mask": polyline_mask_data,
+                },
                 fetch_list=pir_program.list_vars()[-1],
             )
     params = main_program.global_block().all_parameters()
@@ -376,4 +386,78 @@ def get_idg_program():
     for v in params:
         name = v.get_defining_op().attrs()["parameter_name"]
         param_dict.update({name: np.array(scope.var(name).get_tensor())})
-    return pir_program, scope, param_dict
\ No newline at end of file
+    return pir_program, scope, param_dict
+
+
+class MLPLayer(nn.Layer):
+    def __init__(
+        self,
+        hidden_size=1024,
+        intermediate_size=4 * 1024,
+        dropout_ratio=0.1,
+        initializer_range=0.02,
+    ):
+        super().__init__()
+        d_model = hidden_size
+        dim_feedforward = intermediate_size
+        weight_attr = paddle.ParamAttr(
+            initializer=nn.initializer.Normal(mean=0.0, std=initializer_range)
+        )
+        bias_attr = None
+
+        self.linear0 = nn.Linear(
+            d_model, dim_feedforward, weight_attr, bias_attr=bias_attr
+        )
+        self.linear1 = nn.Linear(
+            dim_feedforward, d_model, weight_attr, bias_attr=bias_attr
+        )
+        self.linear2 = nn.Linear(d_model, 1, weight_attr, bias_attr=bias_attr)
+        self.norm = nn.LayerNorm(d_model, epsilon=1e-5)
+
+    def forward(self, input):
+        out = self.norm(input)
+        out1 = self.linear0(out)
+        out2 = F.gelu(out, approximate=True)
+        concat_out = paddle.concat([out1, out2], axis=-1)
+        # out = self.linear1(concat_out)
+        out = self.linear2(out)
+
+        return out
+
+
+def get_mlp_program():
+    paddle.enable_static()
+
+    hidden_size = 1024
+    intermediate_size = 4 * 1024
+    dropout_ratio = 0.1
+    initializer_range = 0.02
+
+    with paddle.pir_utils.IrGuard():
+        infer_program = paddle.static.Program()
+        startup_program = paddle.static.Program()
+        with static.program_guard(infer_program, startup_program):
+            scope = paddle.static.global_scope()
+            input_ids = static.data(
+                name='input',
+                shape=[-1, 512, 1024],
+                dtype='float32',
+            )
+            mlp_model = MLPLayer(
+                hidden_size, intermediate_size, dropout_ratio, initializer_range
+            )
+            mlp_model.eval()
+            output = mlp_model(input_ids)
+            place = paddle.CUDAPlace(0)
+            exe = static.Executor(place)
+            exe.run(startup_program)
+
+    # paddle.static.io.save(infer_program, "./resnet")
+    params = infer_program.global_block().all_parameters()
+    param_dict = {}
+    for v in params:
+        name = v.get_defining_op().attrs()["parameter_name"]
+        param_dict.update({name: np.array(scope.var(name).get_tensor())})
+
+    return infer_program, scope, param_dict
diff --git a/test/ir/pir/fused_pass/test_pir_trt_op_marker_pass.py b/test/ir/pir/fused_pass/test_pir_trt_op_marker_pass.py
index d11b74f7dc702..a63bca2011c6b 100644
--- a/test/ir/pir/fused_pass/test_pir_trt_op_marker_pass.py
+++ b/test/ir/pir/fused_pass/test_pir_trt_op_marker_pass.py
@@ -515,5 +515,35 @@ def test_check_output(self):
         self.check_pass_correct()
 
 
+class TestNonZeroTRTPattern(PassTest):
+    def is_program_valid(self, program=None):
+        return True
+
+    def sample_program(self):
+        with paddle.pir_utils.IrGuard():
+            main_prog = paddle.static.Program()
+            start_prog = paddle.static.Program()
+            with paddle.pir.core.program_guard(main_prog, start_prog):
+                x = paddle.static.data(name='x', shape=[4], dtype='float32')
+                out_z1_tuple = paddle.nonzero(x)
+                out = paddle.assign(out_z1_tuple)
+                self.pass_attr_list = [{'trt_op_marker_pass': {}}]
+                self.feeds = {
+                    "x": np.array([0.0, 1.0, 0.0, 3.0]).astype("float32"),
+                }
+                self.fetch_list = [out]
+                self.valid_op_map = {
+                    "pd_op.fusion_transpose_flatten_concat": 0,
+                }
+                yield [main_prog, start_prog], False
+
+    def setUp(self):
+        if core.is_compiled_with_cuda():
+            self.places.append(paddle.CUDAPlace(0))
+
+    def test_check_output(self):
+        self.check_pass_correct()
+
+
 if __name__ == "__main__":
     unittest.main()