Skip to content
18 changes: 18 additions & 0 deletions paddle/fluid/inference/tensorrt/op_teller.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1265,6 +1265,24 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8,
}
}

if (op_type == "hard_sigmoid") {
if (!with_dynamic_shape) {
auto* block = desc.Block();
if (block == nullptr) {
VLOG(3) << "The block is null.";
return false;
}
auto x_var_name = desc.Input("X")[0];
auto* x_var_desc = block->FindVar(x_var_name);
const auto x_shape = x_var_desc->GetShape();
if (x_shape.size() <= 2) {
VLOG(3) << "hard_sigmoid op does not support input's dim less than 3 "
"in tensorrt.";
return false;
}
}
}

if ((*teller)(op_type, desc, use_no_calib_int8)) return true;
}

Expand Down
9 changes: 9 additions & 0 deletions paddle/fluid/operators/conv_op.cc
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,15 @@ std::vector<int64_t> ConvOp::ComputeOutputShape(
ctx->Attrs().Get<std::string>("padding_algorithm");
int groups = ctx->Attrs().Get<int>("groups");
std::vector<int> dilations = ctx->Attrs().Get<std::vector<int>>("dilations");
int dilation_size = dilations.size();
for (int i = 0; i < dilation_size; ++i) {
PADDLE_ENFORCE_GT(
dilations[i], 0,
platform::errors::InvalidArgument(
"The dilation of Op(Conv) should be larger than 0, but received "
"dilation is %d.",
dilations[i]));
}
const std::string data_format = ctx->Attrs().Get<std::string>("data_format");

// MKL-DNN Kernels are using NCHW order of dims description
Expand Down
4 changes: 4 additions & 0 deletions paddle/fluid/operators/elementwise/elementwise_max_op.cu
Original file line number Diff line number Diff line change
Expand Up @@ -41,12 +41,16 @@ namespace ops = paddle::operators;

// Register the CUDA forward kernels for elementwise_max.
// float16 is registered alongside float/double/int/int64_t so the op can run
// in mixed-precision (AMP) programs on GPU.
REGISTER_OP_CUDA_KERNEL(
    elementwise_max,
    ops::ElementwiseMaxKernel<paddle::platform::CUDADeviceContext,
                              paddle::platform::float16>,
    ops::ElementwiseMaxKernel<paddle::platform::CUDADeviceContext, float>,
    ops::ElementwiseMaxKernel<paddle::platform::CUDADeviceContext, double>,
    ops::ElementwiseMaxKernel<paddle::platform::CUDADeviceContext, int>,
    ops::ElementwiseMaxKernel<paddle::platform::CUDADeviceContext, int64_t>);
REGISTER_OP_CUDA_KERNEL(
elementwise_max_grad,
ops::ElementwiseMaxGradKernel<paddle::platform::CUDADeviceContext,
paddle::platform::float16>,
ops::ElementwiseMaxGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::ElementwiseMaxGradKernel<paddle::platform::CUDADeviceContext, double>,
ops::ElementwiseMaxGradKernel<paddle::platform::CUDADeviceContext, int>,
Expand Down
4 changes: 2 additions & 2 deletions paddle/fluid/operators/elementwise/elementwise_max_op.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,14 +39,14 @@ class ElementwiseMaxKernel : public framework::OpKernel<T> {
// Gradient of elementwise max w.r.t. x: dout is routed to x only where
// x > y (x "won" the max); elsewhere the gradient is zero.
// NOTE: the bool comparison result is converted with static_cast<T> because
// some element types (e.g. platform::float16) do not support implicit
// bool-to-T multiplication; without the cast this fails to compile for fp16.
template <typename T>
struct MaxGradDx {
  HOSTDEVICE T operator()(T x, T y, T out, T dout) const {
    return dout * static_cast<T>(x > y);
  }
};

// Gradient of elementwise max w.r.t. y: dout is routed to y where x <= y.
// Ties (x == y) deliberately send the full gradient to y, complementing
// MaxGradDx's strict x > y test so the two masks partition every element.
// static_cast<T> makes the bool mask multiply compile for types such as
// platform::float16 that lack an implicit conversion from bool.
template <typename T>
struct MaxGradDy {
  HOSTDEVICE T operator()(T x, T y, T out, T dout) const {
    return dout * static_cast<T>(x <= y);
  }
};

Expand Down
25 changes: 17 additions & 8 deletions paddle/fluid/operators/flatten_op.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,15 @@ limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tcmpt_utils.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/pooling.h"
#include "paddle/fluid/platform/device_context.h"

// Only headers under the paddle/tcmpt/api directory may be included here.
#include "paddle/tcmpt/api/include/dev/core.h"
#include "paddle/tcmpt/api/include/dev/manipulation.h"
namespace paddle {
namespace operators {

Expand All @@ -29,10 +33,10 @@ class FlattenKernel : public framework::OpKernel<T> {
void Compute(const framework::ExecutionContext &context) const override {
auto *in = context.Input<framework::LoDTensor>("X");
auto *out = context.Output<framework::LoDTensor>("Out");

auto &axes = context.Attr<int>("axis");
auto x_dims = in->dims();
auto out_dims = framework::make_ddim(GetOutputShape(axes, x_dims));
auto out_dims = framework::make_ddim(
FlattenKernel<DeviceContext, T>::GetOutputShape(axes, x_dims));

out->mutable_data(context.GetPlace(), in->type());
framework::TensorCopy(
Expand Down Expand Up @@ -122,13 +126,18 @@ class FlattenContiguousRangeKernel : public framework::OpKernel<T> {
void Compute(const framework::ExecutionContext &context) const override {
auto *in = context.Input<framework::LoDTensor>("X");
auto *out = context.Output<framework::LoDTensor>("Out");
auto out_dims = out->dims();

auto &dev_ctx = context.device_context<DeviceContext>();
out->mutable_data(context.GetPlace(), in->type());
framework::TensorCopy(
*in, context.GetPlace(),
context.template device_context<platform::DeviceContext>(), out);
out->Resize(out_dims);
auto pt_in = framework::MakeTensorImpl<pt::DenseTensor>(*in, in->place(),
in->type());
auto pt_out = framework::MakeTensorImpl<pt::DenseTensor>(*out, out->place(),
out->type());

// call new kernel
pt::Flatten<T>(dev_ctx, *pt_in.get(), pt_out.get());

// share pt_out data to out
framework::ShareTensorImpl(pt_out.get(), out);
}
};

Expand Down
Loading