PaddlePaddle · YuanRisheng · Mar 10, 2025 · Feb 27, 2025 · Mar 3, 2025 · Mar 5, 2025
diff --git a/paddle/fluid/framework/new_executor/collect_shape_manager.cc b/paddle/fluid/framework/new_executor/collect_shape_manager.cc
@@ -28,23 +28,41 @@ void CollectShapeManager::CollectShapeInfo(
     framework::ValueExecutionInfo *value_exe_info,
     framework::Scope *scope) {
   std::lock_guard<std::mutex> lock(info_mutex_);
+  VLOG(3) << "collect shape in instruction:" << instr->Name();
   is_shape_range_info_ready_ = false;
   for (auto &input : instr->Inputs()) {
+    VLOG(3) << "input id:" << input.first.impl();
     if (!op_value2instr_id_.count(input.first)) {
       // Because the input value maybe same between different ops.
       // To prevent duplicate shape collection, we only select one op for
       // getting shape of value
       op_value2instr_id_[input.first] = instr->Id();
     }
     if (op_value2instr_id_[input.first] != instr->Id()) {
+      VLOG(3) << "input shape has been collected by another instruction, jump "
+                 "it, and input id:"
+              << input.first.impl();
       continue;
     }
     auto var_name = value_exe_info->GetVarName(input.first);
     auto *var = scope->FindVar(var_name);
-    if (!var || !var->IsType<phi::DenseTensor>()) continue;
+    if (!var || !var->IsType<phi::DenseTensor>()) {
+      VLOG(3) << "input var is null : " << (var == nullptr);
+      VLOG(3) << "input var is dense_tensor : "
+              << (var != nullptr && var->IsType<phi::DenseTensor>());
+      VLOG(3) << "input is null or not dense_tensor, jump it, and input id:"
+              << input.first.impl();
+      continue;
+    }
 
     auto tensor = var->Get<phi::DenseTensor>();
     if (!tensor.initialized() && !instr->NoNeedBuffer().count(input.first)) {
+      VLOG(3) << "input tensor is initialized: " << (tensor.initialized());
+      VLOG(3) << "input tensor is no need buffer:"
+              << instr->NoNeedBuffer().count(input.first);
+      VLOG(3) << "input tensor is not initialized and not no need buffer, jump "
+                 "it, and input id:"
+              << input.first.impl();
       continue;
     }
     paddle::platform::DeviceContextPool &pool =

diff --git a/paddle/fluid/framework/new_executor/instruction/tensorrt_engine_instruction.cc b/paddle/fluid/framework/new_executor/instruction/tensorrt_engine_instruction.cc
@@ -26,6 +26,7 @@ namespace paddle {
 namespace framework {
 
 using TensorRTEngine = paddle::platform::TensorRTEngine;
+static const int kMaxDim = 1000;  // extent used in place of a -1 (unknown) dim
 
 TensorRTEngineInstruction::TensorRTEngineInstruction(
     size_t id,
@@ -621,7 +622,7 @@ void TensorRTEngineInstruction::BindOutputTensor(
       binding_offset;
 #endif
   std::vector<int> ddim;
-
+  phi::DenseTensor *fluid_t = nullptr;
 #if IS_TRT_VERSION_GE(8500)
   auto x_name = trt_engine_->engine()->getIOTensorName(bind_index);
   auto dims = trt_context->getTensorShape(x_name);
@@ -631,17 +632,36 @@ void TensorRTEngineInstruction::BindOutputTensor(
     if (dims.d[nb_dims - 1] != 1 || nb_dims == outputs_rank_[output_index])
       break;
   }
+  // Not dynamic shape: a dim is -1, i.e. unknown until the engine has run.
+  bool has_unknown_dim = false;
   for (int i = 0; i < nb_dims; i++) {
-    ddim.push_back(dims.d[i]);
+    if (dims.d[i] == -1) {
+      has_unknown_dim = true;
+      ddim.push_back(kMaxDim);
+    } else {
+      ddim.push_back(dims.d[i]);
+    }
+  }
+
+  if (has_unknown_dim) {
+    const paddle::framework::Scope &scope = *(value_exec_info_->GetScope());
+    std::string tmp_output = output_name + "_tmp";
+    if (scope.FindVar(tmp_output) == nullptr) {
+      const_cast<framework::Scope *>(&scope)->Var(tmp_output);
+    }
+    fluid_t = scope.FindVar(tmp_output)->GetMutable<phi::DenseTensor>();
+  } else {
+    fluid_t = output_tensor;
   }
+
 #else
   PADDLE_THROW(
       common::errors::Unimplemented("PIR-TRT only support TensorRT "
                                     "version that is >= 8.5,"
                                     "Please check your TensorRT "
                                     "in your env."));
 #endif
-  auto *fluid_t = output_tensor;
+
   fluid_t->Resize(common::make_ddim(ddim));
   PADDLE_ENFORCE_LT(bind_index,
                     num_bindings,
@@ -734,11 +754,67 @@ void TensorRTEngineInstruction::RunTrt() {
   VLOG(4) << "Start running trt engine...";
   // Execute the engine.
   trt_engine_->Execute(runtime_batch, &buffers, stream);
+
   VLOG(4) << "End running trt engine and deal with output";
   for (const auto &index_name_pair : output_names_) {
     size_t i = index_name_pair.first;
     auto type = outputs_dtype_[i];
 
+#if IS_TRT_VERSION_GE(8500)
+    // deal with output that has unknown shape
+    std::string output_name = index_name_pair.second;
+    int bind_index = -1;
+    int binding_offset = 0;
+    binding_offset = trt_engine_->GetBindingsOffset();
+    for (int i = 0; i < trt_engine_->engine()->getNbIOTensors(); ++i) {
+      if (std::string(output_name.c_str()) ==
+          std::string(trt_engine_->engine()->getIOTensorName(i))) {
+        bind_index = i + binding_offset;
+        break;
+      }
+    }
+
+    auto trt_output_name = trt_engine_->engine()->getIOTensorName(bind_index);
+    auto trt_dims = trt_engine_->context()->getTensorShape(trt_output_name);
+    // Find the tmp tensor (allocated with extra space for the unknown dim) and
+    // copy its elements to the actual output tensor (allocated with the
+    // appropriate space).
+    std::string tmp_output = output_name + "_tmp";
+    if (scope.FindVar(tmp_output) != nullptr) {
+      auto *output_tensor_tmp =
+          scope.FindVar(tmp_output)->GetMutable<phi::DenseTensor>();
+      auto *output_tensor = const_cast<phi::DenseTensor *>(
+          &(out_variable_array->at(i)->Get<phi::DenseTensor>()));
+      std::vector<int> ddim;
+      for (int i = 0; i < trt_dims.nbDims; i++) {
+        ddim.push_back(trt_dims.d[i]);
+      }
+      output_tensor->Resize(common::make_ddim(ddim));
+      dev_ctx_->Alloc(output_tensor, type);
+      if (type == phi::DataType::FLOAT32) {
+        auto *mutable_output = output_tensor->data<float>();
+        phi::memory_utils::Copy(phi::GPUPlace(),
+                                mutable_output,
+                                phi::GPUPlace(),
+                                output_tensor_tmp->data<float>(),
+                                sizeof(float) * output_tensor->numel(),
+                                nullptr);
+      } else if (type == phi::DataType::INT64 || type == phi::DataType::INT32) {
+        auto *mutable_output = output_tensor->data<int32_t>();
+        phi::memory_utils::Copy(phi::GPUPlace(),
+                                mutable_output,
+                                phi::GPUPlace(),
+                                output_tensor_tmp->data<int32_t>(),
+                                sizeof(int32_t) * output_tensor->numel(),
+                                nullptr);
+      } else {
+        PADDLE_THROW(common::errors::Unimplemented(
+            "Unsupported data type: %d when deal with output", type));
+      }
+    }
+#endif
+
+    // Type transformation for INT64 and FLOAT64
     if (type == phi::DataType::INT64) {
       auto y = index_name_pair.second;
       auto *fluid_v = out_variable_array->at(i);

diff --git a/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc b/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc
@@ -2412,11 +2412,10 @@ void HandleForTensorRTOp(
   std::vector<pir::Type> op_output_types;
 
   for (size_t i = 0; i < op_item->num_results(); ++i) {
-    phi::Place out_place = phi::TransToPhiPlace(kernel_key.backend());
     PushBackOutputTypes(ctx,
                         op_item,
                         op_item->result(i).type(),
-                        out_place,
+                        place,
                         kernel_key,
                         &op_output_types);
   }

diff --git a/paddle/fluid/pir/transforms/tensorrt/trt_op_marker_pass.cc b/paddle/fluid/pir/transforms/tensorrt/trt_op_marker_pass.cc
@@ -1023,7 +1023,7 @@ class SqueezeOpPattern
         int64_t s = input_var_name_shape[i];
         if (s == -1) {
           VLOG(3) << "The necessary attributes of the squeeze operator axis is "
-                     "missing. ss =====-1";
+                     "missing. ss == -1";
           return false;
         } else if (s == 1) {
           axes.push_back(s);
@@ -1035,6 +1035,26 @@ class SqueezeOpPattern
                    "missing.";
         return false;
       }
+    } else {
+      pir::Value x = op.operand_source(0);
+      auto x_shape = pir::GetShapeFromValue(x);
+      const auto rank = static_cast<int64_t>(x_shape.size());
+      for (int64_t axis : axes) {
+        // Normalize negative axes, then reject anything outside [0, rank) so
+        // the lookup below never indexes past the end of x_shape.
+        if (axis < 0) axis += rank;
+        if (axis < 0 || axis >= rank) {
+          VLOG(3) << "The axis " << axis << " of the squeeze operator is out "
+                  << "of range for input rank " << rank << ".";
+          return false;
+        }
+        if (x_shape[axis] != 1) {
+          VLOG(3) << "Cannot squeeze dimension " << axis << " with size "
+                  << x_shape[axis]
+                  << ". Only dimensions with size 1 can be squeezed.";
+          return false;
+        }
+      }
     }
 
     op->set_attribute(kCanRunTrtAttr, rewriter.bool_attr(true));

diff --git a/python/paddle/tensorrt/converter.py b/python/paddle/tensorrt/converter.py
@@ -385,25 +385,20 @@ def convert_subgraph_to_trt(self, program, group_op):
             if group_op.result(out_index).use_empty():
                 # if result value is not used, it doesn't need get shape, continue
                 continue
-            if not is_shape_tensor(result_value):
-                if len(result_value.shape) == 0:
-                    min_shape = []
-                    opt_shape = []
-                    max_shape = []
-                else:
-                    min_shape = get_value_shape_range_info(
-                        result_value, False, paddle.base.core.ShapeMode.kMIN
-                    )
-                    opt_shape = get_value_shape_range_info(
-                        result_value, False, paddle.base.core.ShapeMode.kOPT
-                    )
-                    max_shape = get_value_shape_range_info(
-                        result_value, False, paddle.base.core.ShapeMode.kMAX
-                    )
-            else:
-                min_shape = []
-                opt_shape = []
-                max_shape = []
+            min_shape = []
+            opt_shape = []
+            max_shape = []
+            if len(result_value.shape) != 0:
+                min_shape = get_value_shape_range_info(
+                    result_value, False, paddle.base.core.ShapeMode.kMIN
+                )
+                opt_shape = get_value_shape_range_info(
+                    result_value, False, paddle.base.core.ShapeMode.kOPT
+                )
+                max_shape = get_value_shape_range_info(
+                    result_value, False, paddle.base.core.ShapeMode.kMAX
+                )
+
             min_value = []
             opt_value = []
             max_value = []

diff --git a/python/paddle/tensorrt/impls/common.py b/python/paddle/tensorrt/impls/common.py
@@ -161,8 +161,8 @@ def bilinear_interp_converter(network, paddle_op, inputs):
     use_scales = True
     if outsize_tensor is not None:
         use_scales = False
-    elif out_h > 0 and out_w > 0 and scale_attr is not None:
-        use_scales = True
+    if outsize_tensor is None and not scale_attr:  # scale may be None or []
+        use_scales = False
 
     if use_scales:
         scale_h = -1.0
@@ -225,7 +225,55 @@ def bilinear_interp_converter(network, paddle_op, inputs):
             set_layer_name(output_size_tensor, paddle_op)
             output_size_tensor = output_size_tensor.get_output(0)
             resize_layer.set_input(1, output_size_tensor)
-
+        else:
+            if data_format == "NCHW":
+                shape_layer = network.add_shape(input_tensor)
+                shape_output = shape_layer.get_output(0)
+                # Get N and C from slice_layer output
+                slice_layer = network.add_slice(
+                    shape_output, start=[0], shape=[2], stride=[1]
+                )
+                # Create H and W
+                hw_constant = network.add_constant(
+                    shape=(2,),
+                    weights=trt.Weights(
+                        np.array([out_h, out_w], dtype=np.int32)
+                    ),
+                ).get_output(0)
+                # Create output shape(NCHW)
+                concat_layer = network.add_concatenation(
+                    [slice_layer.get_output(0), hw_constant]
+                )
+                concat_layer.axis = 0
+                resize_layer.set_input(1, concat_layer.get_output(0))
+            elif data_format == "NHWC":
+                shape_layer = network.add_shape(input_tensor)
+                shape_output = shape_layer.get_output(0)
+                # Get N and C from slice_layer output
+                n_layer = network.add_slice(
+                    shape_output, start=[0], shape=[1], stride=[1]
+                )
+                c_layer = network.add_slice(
+                    shape_output, start=[3], shape=[1], stride=[1]
+                )
+                # Create H and W
+                hw_constant = network.add_constant(
+                    shape=(2,),
+                    weights=trt.Weights(
+                        np.array([out_h, out_w], dtype=np.int32)
+                    ),
+                ).get_output(0)
+                # Create output shape(NHWC)
+                concat_layer = network.add_concatenation(
+                    [n_layer.get_output(0), hw_constant, c_layer.get_output(0)]
+                )
+                concat_layer.axis = 0
+                resize_layer.set_input(1, concat_layer.get_output(0))
+            else:
+                raise NotImplementedError(
+                    "Converter for bilinear_interp does not support "
+                    f"data_format {data_format}."
+                )
     return resize_layer.get_output(0)
 
 

diff --git a/python/paddle/tensorrt/impls/search.py b/python/paddle/tensorrt/impls/search.py
@@ -34,9 +34,16 @@ def non_zero_converter(network, paddle_op, inputs):
     input_tensor = inputs[0]
     cast_layer = network.add_cast(input_tensor, trt.float32)
     set_layer_name(cast_layer, paddle_op)
+
     non_zero_layer = network.add_non_zero(cast_layer.get_output(0))
+    nonzero_output = non_zero_layer.get_output(0)
     set_layer_name(non_zero_layer, paddle_op)
-    return non_zero_layer.get_output(0)
+
+    shuffle_layer = network.add_shuffle(input=nonzero_output)
+    shuffle_layer.first_transpose = (1, 0)
+    transposed_output = shuffle_layer.get_output(0)
+    set_layer_name(shuffle_layer, paddle_op)
+    return transposed_output
 
 
 @converter_registry.register("pd_op.argmax", trt_version="trt_version_ge=8.0")