Skip to content
18 changes: 18 additions & 0 deletions paddle/fluid/inference/tensorrt/op_teller.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1265,6 +1265,24 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8,
}
}

if (op_type == "hard_sigmoid") {
if (!with_dynamic_shape) {
auto* block = desc.Block();
if (block == nullptr) {
VLOG(3) << "The block is null.";
return false;
}
auto x_var_name = desc.Input("X")[0];
auto* x_var_desc = block->FindVar(x_var_name);
const auto x_shape = x_var_desc->GetShape();
if (x_shape.size() <= 2) {
VLOG(3) << "hard_sigmoid op does not support input's dim less than 3 "
"in tensorrt.";
return false;
}
}
}

if ((*teller)(op_type, desc, use_no_calib_int8)) return true;
}

Expand Down
9 changes: 9 additions & 0 deletions paddle/fluid/operators/conv_op.cc
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,15 @@ std::vector<int64_t> ConvOp::ComputeOutputShape(
ctx->Attrs().Get<std::string>("padding_algorithm");
int groups = ctx->Attrs().Get<int>("groups");
std::vector<int> dilations = ctx->Attrs().Get<std::vector<int>>("dilations");
int dilation_size = dilations.size();
for (int i = 0; i < dilation_size; ++i) {
PADDLE_ENFORCE_GT(
dilations[i], 0,
platform::errors::InvalidArgument(
"The dilation of Op(Conv) should be larger than 0, but received "
"dilation is %d.",
dilations[i]));
}
const std::string data_format = ctx->Attrs().Get<std::string>("data_format");

// MKL-DNN Kernels are using NCHW order of dims description
Expand Down
4 changes: 4 additions & 0 deletions paddle/fluid/operators/elementwise/elementwise_max_op.cu
Original file line number Diff line number Diff line change
Expand Up @@ -41,12 +41,16 @@ namespace ops = paddle::operators;

// Register the CUDA forward kernels for elementwise_max.
// float16 is registered alongside float/double/int/int64_t so the op can run
// in mixed-precision (AMP) programs on GPU.
REGISTER_OP_CUDA_KERNEL(
    elementwise_max,
    ops::ElementwiseMaxKernel<paddle::platform::CUDADeviceContext,
                              paddle::platform::float16>,
    ops::ElementwiseMaxKernel<paddle::platform::CUDADeviceContext, float>,
    ops::ElementwiseMaxKernel<paddle::platform::CUDADeviceContext, double>,
    ops::ElementwiseMaxKernel<paddle::platform::CUDADeviceContext, int>,
    ops::ElementwiseMaxKernel<paddle::platform::CUDADeviceContext, int64_t>);
REGISTER_OP_CUDA_KERNEL(
elementwise_max_grad,
ops::ElementwiseMaxGradKernel<paddle::platform::CUDADeviceContext,
paddle::platform::float16>,
ops::ElementwiseMaxGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::ElementwiseMaxGradKernel<paddle::platform::CUDADeviceContext, double>,
ops::ElementwiseMaxGradKernel<paddle::platform::CUDADeviceContext, int>,
Expand Down
4 changes: 2 additions & 2 deletions paddle/fluid/operators/elementwise/elementwise_max_op.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,14 +39,14 @@ class ElementwiseMaxKernel : public framework::OpKernel<T> {
// Gradient of elementwise max w.r.t. x: dout is routed to x only where
// x > y (x "won" the max); elsewhere the gradient is zero.
// NOTE: the bool comparison result is converted with static_cast<T> because
// some element types (e.g. platform::float16) do not support implicit
// bool-to-T multiplication; without the cast this fails to compile for fp16.
template <typename T>
struct MaxGradDx {
  HOSTDEVICE T operator()(T x, T y, T out, T dout) const {
    return dout * static_cast<T>(x > y);
  }
};

// Gradient of elementwise max w.r.t. y: dout is routed to y where x <= y.
// Ties (x == y) deliberately send the full gradient to y, complementing
// MaxGradDx's strict x > y test so the two masks partition every element.
// static_cast<T> makes the bool mask multiply compile for types such as
// platform::float16 that lack an implicit conversion from bool.
template <typename T>
struct MaxGradDy {
  HOSTDEVICE T operator()(T x, T y, T out, T dout) const {
    return dout * static_cast<T>(x <= y);
  }
};

Expand Down
25 changes: 17 additions & 8 deletions paddle/fluid/operators/flatten_op.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,15 @@ limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tcmpt_utils.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/pooling.h"
#include "paddle/fluid/platform/device_context.h"

// Only headers under the paddle/tcmpt/api directory may be included here.
#include "paddle/tcmpt/api/include/dev/core.h"
#include "paddle/tcmpt/api/include/dev/manipulation.h"
namespace paddle {
namespace operators {

Expand All @@ -29,10 +33,10 @@ class FlattenKernel : public framework::OpKernel<T> {
void Compute(const framework::ExecutionContext &context) const override {
auto *in = context.Input<framework::LoDTensor>("X");
auto *out = context.Output<framework::LoDTensor>("Out");

auto &axes = context.Attr<int>("axis");
auto x_dims = in->dims();
auto out_dims = framework::make_ddim(GetOutputShape(axes, x_dims));
auto out_dims = framework::make_ddim(
FlattenKernel<DeviceContext, T>::GetOutputShape(axes, x_dims));

out->mutable_data(context.GetPlace(), in->type());
framework::TensorCopy(
Expand Down Expand Up @@ -122,13 +126,18 @@ class FlattenContiguousRangeKernel : public framework::OpKernel<T> {
void Compute(const framework::ExecutionContext &context) const override {
auto *in = context.Input<framework::LoDTensor>("X");
auto *out = context.Output<framework::LoDTensor>("Out");
auto out_dims = out->dims();

auto &dev_ctx = context.device_context<DeviceContext>();
out->mutable_data(context.GetPlace(), in->type());
framework::TensorCopy(
*in, context.GetPlace(),
context.template device_context<platform::DeviceContext>(), out);
out->Resize(out_dims);
auto pt_in = framework::MakeTensorImpl<pt::DenseTensor>(*in, in->place(),
in->type());
auto pt_out = framework::MakeTensorImpl<pt::DenseTensor>(*out, out->place(),
out->type());

// call new kernel
pt::Flatten<T>(dev_ctx, *pt_in.get(), pt_out.get());

// share pt_out data to out
framework::ShareTensorImpl(pt_out.get(), out);
}
};

Expand Down
Loading