[Prim][PIR] fix bugs that not use full_scalar #67170

Merged: 6 commits, Aug 9, 2024
5 changes: 3 additions & 2 deletions paddle/fluid/primitive/composite/composite.h
@@ -200,9 +200,10 @@ std::tuple<Tensor, Tensor> huber_loss_decomp(const Tensor& input,
}
auto val = label - input;
auto abs_val = abs<T>(val);
+ auto factor = full_scalar<T>(0.5, input.dtype());
auto ans = where<T>(abs_val <= delta_full,
- 0.5 * val * val,
- delta_full * (abs_val - 0.5 * delta_full));
+ factor * val * val,
+ delta_full * (abs_val - factor * delta_full));
return std::make_tuple(ans, val);
}
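For reference, the decomposition above implements the standard Huber loss on the residual a = label - input; the only change is that the 0.5 factor is now materialized with full_scalar in input.dtype() instead of appearing as a bare float literal:

```latex
L_\delta(a) =
\begin{cases}
\tfrac{1}{2}a^{2}, & |a| \le \delta \\[4pt]
\delta\big(|a| - \tfrac{1}{2}\delta\big), & |a| > \delta
\end{cases}
\qquad a = \mathrm{label} - \mathrm{input}
```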

169 changes: 111 additions & 58 deletions paddle/fluid/primitive/rule/vjp/details.h
@@ -52,7 +52,11 @@ void cumsum_grad(const Tensor& x,
Tensor* x_grad) {
if (x_grad) {
auto grad = cumsum<T>(out_grad, axis, flatten, exclusive, !reverse);
- grad = reshape<T>(grad, x.shape());
+ if (has_dynamic_shape(x.shape())) {
+ grad = backend::reshape<T>(grad, shape<T>(x));
+ } else {
+ grad = reshape<T>(grad, x.shape());
+ }
set_output<T>(grad, x_grad);
}
}
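The dynamic-shape branch added here recurs throughout this file, so it is worth spelling out once: when any dimension of x is unknown at trace time, the target shape has to be fed to the reshape as a runtime tensor (shape<T>(x)) rather than as a vector of static ints. A minimal sketch of that dispatch, assuming the helpers behave the way they are used in the diff (reshape_like is a hypothetical name for illustration, not part of the PR):

```cpp
// Sketch only: reshape `grad` to the shape of `ref`.
// Uses the tensor-shape overload when `ref` has unknown (dynamic) dims.
template <typename T>
Tensor reshape_like(const Tensor& grad, const Tensor& ref) {
  if (has_dynamic_shape(ref.shape())) {
    // shape<T>(ref) materializes the runtime shape as a tensor, so the
    // reshape is resolved at execution time instead of at trace time.
    return backend::reshape<T>(grad, shape<T>(ref));
  }
  // Static shapes can keep the original compile-time reshape.
  return reshape<T>(grad, ref.shape());
}
```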
@@ -146,8 +150,14 @@ void divide_grad(const Tensor& x,
template <typename T>
void floor_grad(const Tensor& out_grad, Tensor* x_grad) {
if (x_grad) {
- auto zero_tensor =
- full<T>(common::vectorize(out_grad.dims()), 0.0, out_grad.dtype());
+ Tensor zero_tensor;
+ if (has_dynamic_shape(out_grad.shape())) {
+ zero_tensor = backend::full_with_tensor<T>(
+ shape<T>(out_grad), 0.0, out_grad.dtype());
+ } else {
+ zero_tensor =
+ full<T>(common::vectorize(out_grad.dims()), 0.0, out_grad.dtype());
+ }
set_output<T>(zero_tensor, x_grad);
}
}
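Since floor is piecewise constant, its derivative is zero almost everywhere, so the whole gradient is just a zeros tensor shaped like out_grad; the new branch only changes how that zeros tensor is built when the shape is not known statically:

```latex
\frac{d}{dx}\lfloor x \rfloor = 0 \ \text{a.e.} \quad\Rightarrow\quad \nabla_x = \mathbf{0}
```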
@@ -303,9 +313,12 @@ void gelu_grad(const Tensor& x,
if (approximate) {
float kbeta = M_SQRT2 * M_2_SQRTPI * 0.5;
float kkappa = 0.044715;
+ Tensor kbeta_ = full_scalar<T>(kbeta, promoted_x.dtype());
+ Tensor kkappa_ = full_scalar<T>(kkappa, promoted_x.dtype());
+
auto x_sq = promoted_x * promoted_x;
auto x_cube = x_sq * promoted_x;
- auto inner = kbeta * (promoted_x + kkappa * x_cube);
+ auto inner = kbeta_ * (promoted_x + kkappa_ * x_cube);
auto tanh_inner = tanh<T>(inner);

auto left = scale<T>(promoted_x, 0.5);
@@ -314,7 +327,7 @@
auto left_derivative = scale<T>(right, 0.5);

auto tanh_derivative = scale<T>(tanh_inner * tanh_inner, -1., 1.);
- auto inner_derivative = kbeta * (scale<T>(3 * kkappa * x_sq, 1., 1.));
+ auto inner_derivative = kbeta_ * (scale<T>(3 * kkappa_ * x_sq, 1., 1.));
auto right_derivative = left * tanh_derivative * inner_derivative;

set_output<T>(
@@ -324,8 +337,11 @@
} else {
float kalpha = M_SQRT1_2;
float kbeta = M_2_SQRTPI * M_SQRT1_2 * 0.5;
- auto cdf = scale<T>(scale<T>(erf<T>(kalpha * promoted_x), 1., 1.), 0.5);
- auto pdf = kbeta * exp<T>(scale<T>(promoted_x * promoted_x, -0.5));
+ Tensor kalpha_ = full_scalar<T>(kalpha, promoted_x.dtype());
+ Tensor kbeta_ = full_scalar<T>(kbeta, promoted_x.dtype());
+
+ auto cdf = scale<T>(scale<T>(erf<T>(kalpha_ * promoted_x), 1., 1.), 0.5);
+ auto pdf = kbeta_ * exp<T>(scale<T>(promoted_x * promoted_x, -0.5));
set_output<T>(
cast<T>(promoted_out_grad * (cdf + promoted_x * pdf), x.type()),
x_grad);
@@ -336,9 +352,12 @@
if (approximate) {
auto kBeta = M_SQRT2 * M_2_SQRTPI * 0.5;
auto kKappa = 0.044715;
+ Tensor kBeta_ = full_scalar<T>(kBeta, x.dtype());
+ Tensor kKappa_ = full_scalar<T>(kKappa, x.dtype());
+
auto x_sq = x * x;
auto x_cube = x_sq * x;
- auto inner = kBeta * (x + kKappa * x_cube);
+ auto inner = kBeta_ * (x + kKappa_ * x_cube);
auto tanh_inner = tanh<T>(inner);

auto left = scale<T>(x, 0.5);
@@ -347,15 +366,18 @@
auto left_derivative = scale<T>(right, 0.5);

auto tanh_derivative = scale<T>(tanh_inner * tanh_inner, -1., 1.);
- auto inner_derivative = kBeta * (scale<T>(3 * kKappa * x_sq, 1., 1.));
+ auto inner_derivative = kBeta_ * (scale<T>(3 * kKappa_ * x_sq, 1., 1.));
auto right_derivative = left * tanh_derivative * inner_derivative;

set_output<T>(out_grad * (left_derivative + right_derivative), x_grad);
} else {
auto kAlpha = M_SQRT1_2;
auto kBeta = M_2_SQRTPI * M_SQRT1_2 * 0.5;
- auto cdf = scale<T>(scale<T>(erf<T>(kAlpha * x), 1., 1.), 0.5);
- auto pdf = kBeta * exp<T>(scale<T>(x * x, -0.5));
+ Tensor kAlpha_ = full_scalar<T>(kAlpha, x.dtype());
+ Tensor kBeta_ = full_scalar<T>(kBeta, x.dtype());
+
+ auto cdf = scale<T>(scale<T>(erf<T>(kAlpha_ * x), 1., 1.), 0.5);
+ auto pdf = kBeta_ * exp<T>(scale<T>(x * x, -0.5));
set_output<T>(out_grad * (cdf + x * pdf), x_grad);
}
}
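Both gelu_grad branches implement the usual GELU derivative; the edit only replaces the float constants (kbeta, kkappa, kalpha and their capitalized counterparts) with full_scalar tensors of the working dtype so the multiplications stay in that dtype. For reference, the exact (erf) form and the tanh approximation used above are:

```latex
\frac{d}{dx}\,\mathrm{GELU}(x) = \Phi(x) + x\,\phi(x)
  = \tfrac{1}{2}\Big(1 + \operatorname{erf}\big(\tfrac{x}{\sqrt{2}}\big)\Big)
  + \frac{x}{\sqrt{2\pi}}\, e^{-x^{2}/2}

u = \sqrt{\tfrac{2}{\pi}}\,\big(x + 0.044715\,x^{3}\big), \qquad
\frac{d}{dx}\,\mathrm{GELU}(x) \approx
  \tfrac{1}{2}\big(1 + \tanh u\big)
  + \tfrac{x}{2}\,\big(1 - \tanh^{2} u\big)\,\sqrt{\tfrac{2}{\pi}}\,\big(1 + 3\cdot 0.044715\,x^{2}\big)
```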
@@ -409,8 +431,13 @@ void reduce_as_grad(const Tensor& x,
template <typename T>
void reshape_grad(const Tensor& x, const Tensor& grad_out, Tensor* grad_x) {
if (grad_x) {
- const auto& x_dims = x.dims();
- auto grad_x_tmp = reshape<T>(grad_out, common::vectorize(x_dims));
+ Tensor grad_x_tmp;
+ if (has_dynamic_shape(x.shape())) {
+ grad_x_tmp = backend::reshape<T>(grad_out, shape<T>(x));
+ } else {
+ const auto& x_dims = x.dims();
+ grad_x_tmp = reshape<T>(grad_out, common::vectorize(x_dims));
+ }
set_output<T>(grad_x_tmp, grad_x);
}
}
@@ -503,7 +530,7 @@ void cos_grad(const Tensor& x, const Tensor& out_grad, Tensor* x_grad) {
template <typename T>
void tanh_grad(const Tensor& out, const Tensor& grad_out, Tensor* grad_x) {
if (!grad_x) return;
- auto grad_x_tmp = grad_out * (1 - out * out);
+ auto grad_x_tmp = grad_out * (full_scalar<T>(1.0, out.dtype()) - out * out);
set_output<T>(grad_x_tmp, grad_x);
}
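With out = tanh(x), the rule above is the textbook one; the only change is that the constant 1 is now a full_scalar of out.dtype() so the subtraction does not promote the tensor:

```latex
\frac{d}{dx}\tanh(x) = 1 - \tanh^{2}(x)
\quad\Rightarrow\quad
\nabla_x = \nabla_{\mathrm{out}}\,\big(1 - \mathrm{out}^{2}\big)
```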

@@ -961,9 +988,8 @@ void dropout_grad(const Tensor& mask,
template <typename T>
void erf_grad(const Tensor& x, const Tensor& out_grad, Tensor* x_grad) {
if (x_grad) {
- auto m_2_sqrt_pi =
- full<T>(common::vectorize(x.dims()), M_2_SQRTPI, x.dtype());
- auto neg_one = full<T>(common::vectorize(x.dims()), -1.0, x.dtype());
+ auto m_2_sqrt_pi = full_scalar<T>(M_2_SQRTPI, x.dtype());
+ auto neg_one = full_scalar<T>(-1.0, x.dtype());
auto neg_tmp = neg_one * x * x;
auto mul_tmp = m_2_sqrt_pi * exp<T>(neg_tmp);
set_output<T>(out_grad * mul_tmp, x_grad);
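The math is unchanged; the constants 2/sqrt(pi) and -1 simply become 0-D full_scalar tensors instead of full tensors of x's shape:

```latex
\frac{d}{dx}\operatorname{erf}(x) = \frac{2}{\sqrt{\pi}}\,e^{-x^{2}}
\quad\Rightarrow\quad
\nabla_x = \nabla_{\mathrm{out}}\cdot\frac{2}{\sqrt{\pi}}\,e^{-x^{2}}
```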
@@ -1000,7 +1026,8 @@ void log_grad(const Tensor& x, const Tensor& out_grad, Tensor* x_grad) {
template <typename T>
void square_grad(const Tensor& x, const Tensor& out_grad, Tensor* x_grad) {
if (x_grad) {
- Tensor x_grad_tmp = 2 * x * out_grad;
+ auto two = full_scalar<T>(2.0, x.dtype());
+ Tensor x_grad_tmp = two * x * out_grad;
set_output<T>(x_grad_tmp, x_grad);
}
}
@@ -1046,17 +1073,17 @@ void silu_grad(const Tensor& x,
const Tensor& out_grad,
Tensor* x_grad) {
if (x_grad) {
+ auto one = full_scalar<T>(1.0, x.dtype());
auto org_dtype = x.dtype();
bool need_cast = org_dtype == phi::DataType::FLOAT16 ||
org_dtype == phi::DataType::BFLOAT16;
if (need_cast) {
auto x_cast = cast<T>(x, phi::DataType::FLOAT32);
auto out_cast = cast<T>(out, phi::DataType::FLOAT32);
auto out_grad_cast = cast<T>(out_grad, phi::DataType::FLOAT32);
- auto res = out_grad_cast * sigmoid<T>(x_cast) * (1.0 + x_cast - out_cast);
+ auto res = out_grad_cast * sigmoid<T>(x_cast) * (one + x_cast - out_cast);
set_output<T>(cast<T>(res, org_dtype), x_grad);
} else {
- auto one = full_scalar<T>(1.0, x.dtype());
auto res = out_grad * sigmoid<T>(x) * (one + x - out);
set_output<T>(res, x_grad);
}
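With out = x * sigmoid(x), the common factor used by both branches comes from:

```latex
\frac{d}{dx}\big(x\,\sigma(x)\big)
  = \sigma(x)\big(1 + x\,(1-\sigma(x))\big)
  = \sigma(x)\,\big(1 + x - \mathrm{out}\big)
```

The PR hoists one = full_scalar<T>(1.0, x.dtype()) above the branch so the FP16/BF16 path also uses a scalar tensor rather than the bare literal 1.0.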
@@ -1243,29 +1270,39 @@ void maximum_grad(const Tensor& x,
if (x_grad) {
auto x_tmp = cast<T>(greater_than<T>(x, y), out_grad.dtype());
auto dx_res = out_grad * x_tmp;
- if (out_grad.dims() != x.dims()) {
- auto reduce_dim = get_reduce_dims_from_out(out_grad.dims(), x.dims());
- auto dx_reduce_res =
- dx_res.sum(common::vectorize(reduce_dim), x.dtype(), false);
- auto dx_tmp = reshape<T>(dx_reduce_res, common::vectorize(x.dims()));
- set_output<T>(dx_tmp, x_grad);
+ if (has_dynamic_shape(x.shape())) {
+ auto dx_reduce_res = reduce_as<T>(dx_res, x);
+ set_output<T>(dx_reduce_res, x_grad);
} else {
- set_output<T>(dx_res, x_grad);
+ if (out_grad.dims() != x.dims()) {
+ auto reduce_dim = get_reduce_dims_from_out(out_grad.dims(), x.dims());
+ auto dx_reduce_res =
+ dx_res.sum(common::vectorize(reduce_dim), x.dtype(), false);
+ auto dx_tmp = reshape<T>(dx_reduce_res, common::vectorize(x.dims()));
+ set_output<T>(dx_tmp, x_grad);
+ } else {
+ set_output<T>(dx_res, x_grad);
+ }
}
}

if (y_grad) {
auto y_tmp = cast<T>(less_equal<T>(x, y), out_grad.dtype());
auto dy_res = out_grad * y_tmp;
- if (out_grad.dims() != y.dims()) {
- phi::DDim reduce_dim =
- get_reduce_dims_from_out(out_grad.dims(), y.dims());
- auto dy_reduce_res =
- dy_res.sum(common::vectorize(reduce_dim), y.dtype(), false);
- auto dy_tmp = reshape<T>(dy_reduce_res, common::vectorize(y.dims()));
- set_output<T>(dy_tmp, y_grad);
+ if (has_dynamic_shape(y.shape())) {
+ auto dy_reduce_res = reduce_as<T>(dy_res, y);
+ set_output<T>(dy_reduce_res, y_grad);
} else {
- set_output<T>(dy_res, y_grad);
+ if (out_grad.dims() != y.dims()) {
+ phi::DDim reduce_dim =
+ get_reduce_dims_from_out(out_grad.dims(), y.dims());
+ auto dy_reduce_res =
+ dy_res.sum(common::vectorize(reduce_dim), y.dtype(), false);
+ auto dy_tmp = reshape<T>(dy_reduce_res, common::vectorize(y.dims()));
+ set_output<T>(dy_tmp, y_grad);
+ } else {
+ set_output<T>(dy_res, y_grad);
+ }
}
}
}
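maximum_grad routes the incoming gradient to whichever operand was selected and then undoes any broadcasting: under static shapes via the existing sum over the broadcast dimensions plus a reshape, under dynamic shapes via reduce_as. The minimum_grad change further down mirrors this with the comparisons flipped. In formula form:

```latex
\nabla_x = \nabla_{\mathrm{out}} \odot \mathbb{1}[x > y], \qquad
\nabla_y = \nabla_{\mathrm{out}} \odot \mathbb{1}[x \le y]
```

followed by a reduction of each result back to the shape of its operand.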
@@ -1664,13 +1701,19 @@ void tile_grad(const Tensor& x,
template <typename T>
void hardswish_grad(const Tensor& x, const Tensor& out_grad, Tensor* x_grad) {
if (x_grad) {
- auto offset = full<T>(common::vectorize(x.dims()), 3.0, x.dtype());
+ const Tensor offset = full_scalar<T>(3.0, x.dtype());
+ Tensor zero;
+ if (has_dynamic_shape(x.shape())) {
+ zero = backend::full_with_tensor<T>(shape<T>(x), 0.0, x.dtype());
+ } else {
+ zero = full<T>(common::vectorize(x.dims()), 0.0, x.dtype());
+ }
auto condition = less_equal<T>(x, offset);
- auto tmp1 = where<T>(condition, out_grad * ((x / 3.0) + 0.5), out_grad);
- auto res = where<T>(
- less_than<T>(x, full<T>(common::vectorize(x.dims()), -3.0, x.dtype())),
- full<T>(common::vectorize(x.dims()), 0.0, x.dtype()),
- tmp1);
+ auto factor = full_scalar<T>(0.5, x.dtype());
+ auto tmp1 =
+ where<T>(condition, out_grad * ((x / offset) + factor), out_grad);
+ auto res =
+ where<T>(less_than<T>(x, full_scalar<T>(-3.0, x.dtype())), zero, tmp1);
set_output<T>(res, x_grad);
}
}
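hardswish(x) = x * min(max(x + 3, 0), 6) / 6, so the two nested where calls implement the piecewise derivative below; the constants 3, 0.5 and -3 are now full_scalar tensors (offset, factor) and the zero branch respects dynamic shapes via full_with_tensor:

```latex
\frac{d}{dx}\,\mathrm{hardswish}(x) =
\begin{cases}
0, & x < -3 \\[2pt]
\dfrac{x}{3} + \dfrac{1}{2}, & -3 \le x \le 3 \\[4pt]
1, & x > 3
\end{cases}
```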
@@ -1681,8 +1724,8 @@ void leaky_relu_grad(const Tensor& out,
float negative_slope,
Tensor* x_grad) {
if (x_grad) {
- auto condition = greater_than<T>(
- out, full<T>(common::vectorize(out.dims()), 0.0, out.dtype()));
+ auto zero = full_scalar<T>(0.0, out.dtype());
+ auto condition = greater_than<T>(out, zero);
auto res = where<T>(condition, out_grad, out_grad * negative_slope);
set_output<T>(res, x_grad);
}
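leaky_relu passes the gradient through on the positive side and scales it by negative_slope elsewhere; the zero threshold for the comparison is now a 0-D full_scalar instead of a full tensor of out's shape:

```latex
\nabla_x =
\begin{cases}
\nabla_{\mathrm{out}}, & \mathrm{out} > 0 \\
\text{negative\_slope}\cdot\nabla_{\mathrm{out}}, & \text{otherwise}
\end{cases}
```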
@@ -2015,29 +2058,39 @@ void minimum_grad(const Tensor& x,
if (x_grad) {
auto x_tmp = cast<T>(less_than<T>(x, y), out_grad.dtype());
auto dx_res = out_grad * x_tmp;
- if (out_grad.dims() != x.dims()) {
- auto reduce_dim = get_reduce_dims_from_out(out_grad.dims(), x.dims());
- auto dx_reduce_res =
- dx_res.sum(common::vectorize(reduce_dim), x.dtype(), false);
- auto dx_tmp = reshape<T>(dx_reduce_res, common::vectorize(x.dims()));
- set_output<T>(dx_tmp, x_grad);
+ if (has_dynamic_shape(x.shape())) {
+ auto dx_reduce_res = reduce_as<T>(dx_res, x);
+ set_output<T>(dx_reduce_res, x_grad);
} else {
- set_output<T>(dx_res, x_grad);
+ if (out_grad.dims() != x.dims()) {
+ auto reduce_dim = get_reduce_dims_from_out(out_grad.dims(), x.dims());
+ auto dx_reduce_res =
+ dx_res.sum(common::vectorize(reduce_dim), x.dtype(), false);
+ auto dx_tmp = reshape<T>(dx_reduce_res, common::vectorize(x.dims()));
+ set_output<T>(dx_tmp, x_grad);
+ } else {
+ set_output<T>(dx_res, x_grad);
+ }
}
}

if (y_grad) {
auto y_tmp = cast<T>(greater_equal<T>(x, y), out_grad.dtype());
auto dy_res = out_grad * y_tmp;
- if (out_grad.dims() != y.dims()) {
- phi::DDim reduce_dim =
- get_reduce_dims_from_out(out_grad.dims(), y.dims());
- auto dy_reduce_res =
- dy_res.sum(common::vectorize(reduce_dim), y.dtype(), false);
- auto dy_tmp = reshape<T>(dy_reduce_res, common::vectorize(y.dims()));
- set_output<T>(dy_tmp, y_grad);
+ if (has_dynamic_shape(y.shape())) {
+ auto dy_reduce_res = reduce_as<T>(dy_res, y);
+ set_output<T>(dy_reduce_res, y_grad);
} else {
- set_output<T>(dy_res, y_grad);
+ if (out_grad.dims() != y.dims()) {
+ phi::DDim reduce_dim =
+ get_reduce_dims_from_out(out_grad.dims(), y.dims());
+ auto dy_reduce_res =
+ dy_res.sum(common::vectorize(reduce_dim), y.dtype(), false);
+ auto dy_tmp = reshape<T>(dy_reduce_res, common::vectorize(y.dims()));
+ set_output<T>(dy_tmp, y_grad);
+ } else {
+ set_output<T>(dy_res, y_grad);
+ }
}
}
}
11 changes: 10 additions & 1 deletion python/paddle/autograd/backward_utils.py
@@ -49,12 +49,21 @@
"pd_op.split",
"pd_op.multiply",
"pd_op.relu",
"pd_op.sigmoid",
"pd_op.divide",
"pd_op.pow",
"pd_op.elementwise_pow",
"pd_op.softmax",
"pd_op.matmul",
"pd_op.cumsum",
"pd_op.erf",
"pd_op.floor",
"pd_op.reshape",
"pd_op.leaky_relu",
"pd_op.softsign",
"pd_op.maximum",
"pd_op.minimum",
"pd_op.gelu",
"pd_op.hardswish",
"pd_op.reduce_as",
]
