add argsort/scatter for kunlun (PaddlePaddle#38345)

* add argsort/scatter for kunlun * update test_scatter * update xpu.cmake * update xpu.cmake * fix scatter
Layssy · Dec 29, 2021 · 4643baa · 4643baa
1 parent 3672480
commit 4643baa
Show file tree

Hide file tree

Showing 6 changed files with 733 additions and 1 deletion.
diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake
@@ -36,7 +36,7 @@ ENDIF()
 
 if(NOT DEFINED XPU_BASE_URL)
   SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev")
-  SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20211129")
+  SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20211226")
 else()
   SET(XPU_BASE_URL "${XPU_BASE_URL}")
 endif()

diff --git a/paddle/fluid/operators/argsort_op_xpu.cc b/paddle/fluid/operators/argsort_op_xpu.cc
@@ -0,0 +1,207 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_XPU
+
+#include "paddle/fluid/operators/argsort_op.h"
+
+namespace paddle {
+namespace operators {
+
+// Limit on the length of the axis being sorted. ArgsortXPUKernel::Compute
+// rejects inputs whose axis length is not strictly below this value, and uses
+// half of it as the threshold for choosing the cast-based XPUArgsort variants.
+const int XPU_SORT_MAX_SIZE = 16384;
+
+// Checked wrapper around xpu::sort.
+//
+// Every argument is forwarded unchanged to xpu::sort. A status other than
+// XPU_SUCCESS is reported as a platform::errors::External error carrying the
+// numeric code and its XPUAPIErrorMsg text.
+// NOTE(review): callers in this file pass m = row count and n = row length;
+// confirm that reading of (m, n) against the XPU API.
+template <typename T, typename TID>
+static inline void xpu_argsort(xpu::Context* ctx, const T* input_data,
+                               T* output_data, TID* indices_data, int m, int n,
+                               bool descending) {
+  // The local keeps the name `ret` on purpose: the enforce macro is given the
+  // expression itself, so the spelling is part of what the check sees.
+  const int ret =
+      xpu::sort(ctx, input_data, output_data, indices_data, m, n, descending);
+  PADDLE_ENFORCE_EQ(
+      ret, XPU_SUCCESS,
+      platform::errors::External("XPU sort kernel return wrong value[%d %s].",
+                                 ret, XPUAPIErrorMsg[ret]));
+}
+
+// Checked wrapper around xpu::transpose.
+//
+// Forwards the source pointer x, destination pointer y, the shape of x and
+// the permutation to xpu::transpose, and raises a
+// platform::errors::External error when the returned status is not
+// XPU_SUCCESS.
+template <typename T>
+static inline void xpu_transpose(xpu::Context* ctx, const T* x, T* y,
+                                 const std::vector<int>& xshape,
+                                 const std::vector<int>& permute) {
+  const int ret = xpu::transpose(ctx, x, y, xshape, permute);
+  PADDLE_ENFORCE_EQ(
+      ret, XPU_SUCCESS,
+      platform::errors::External(
+          "XPU transpose kernel return wrong value[%d %s]", ret,
+          XPUAPIErrorMsg[ret]));
+}
+
+// Checked wrapper around xpu::cast_v2.
+//
+// Converts `len` elements from x (type TX) into y (type TY) by calling
+// xpu::cast_v2, and raises a platform::errors::External error when the
+// returned status is not XPU_SUCCESS.
+template <typename TX, typename TY>
+static inline void xpu_cast(xpu::Context* ctx, const TX* x, TY* y, int len) {
+  const int ret = xpu::cast_v2(ctx, x, y, len);
+  PADDLE_ENFORCE_EQ(
+      ret, XPU_SUCCESS,
+      platform::errors::External("XPU cast kernel return wrong value[%d %s]",
+                                 ret, XPUAPIErrorMsg[ret]));
+}
+
+// Argsort over the middle extent of a 3-D view, with no type conversion.
+//
+// data_shape holds three extents {d0, d1, d2} and the sort runs over d1.
+// The input is rearranged with `permute` into a {d0, d2, d1} layout, sorted
+// as d0 * d2 rows of d1 elements, and both results are rearranged back with
+// the same `permute` (the caller in this file passes {0, 2, 1}). Sorted
+// values are written to output_data and int64_t positions to indices_data.
+//
+// VALUE_NEED_CAST and INDEX_NEED_CAST only select specializations; this
+// primary template does not read them.
+template <typename T, bool VALUE_NEED_CAST = false,
+          bool INDEX_NEED_CAST = false>
+struct XPUArgsort {
+  void operator()(xpu::Context* ctx, const T* input_data, T* output_data,
+                  int64_t* indices_data, const std::vector<int>& data_shape,
+                  const std::vector<int>& permute, bool descending) {
+    // Scratch buffers below are obtained through this guard.
+    xpu::ctx_guard guard(ctx);
+
+    const int dim0 = data_shape[0];
+    const int sort_len = data_shape[1];
+    const int dim2 = data_shape[2];
+    const int rows = dim0 * dim2;
+    const int total = dim0 * sort_len * dim2;
+    const std::vector<int> swapped_shape{dim0, dim2, sort_len};
+
+    T* values_in = guard.alloc_l3_or_gm<T>(total);
+    T* values_sorted = guard.alloc_l3_or_gm<T>(total);
+    int64_t* positions = guard.alloc_l3_or_gm<int64_t>(total);
+
+    // Move the sorted extent to the innermost position, then sort each row.
+    xpu_transpose(ctx, input_data, values_in, data_shape, permute);
+    xpu_argsort(ctx, values_in, values_sorted, positions, rows, sort_len,
+                descending);
+
+    // Restore the caller's layout for both results.
+    xpu_transpose(ctx, values_sorted, output_data, swapped_shape, permute);
+    xpu_transpose(ctx, positions, indices_data, swapped_shape, permute);
+  }
+};
+
+// Argsort variant that sorts with 32-bit indices and widens them afterwards.
+//
+// Same layout handling as the primary template: data_shape is {d0, d1, d2},
+// the sort runs over d1 after rearranging with `permute`, and results are
+// rearranged back. The difference is that xpu_argsort is given an int index
+// buffer; those indices are converted to int64_t with xpu_cast before being
+// written to indices_data.
+template <typename T>
+struct XPUArgsort<T, false, true> {
+  void operator()(xpu::Context* ctx, const T* input_data, T* output_data,
+                  int64_t* indices_data, const std::vector<int>& data_shape,
+                  const std::vector<int>& permute, bool descending) {
+    // Scratch buffers below are obtained through this guard.
+    xpu::ctx_guard guard(ctx);
+
+    const int dim0 = data_shape[0];
+    const int sort_len = data_shape[1];
+    const int dim2 = data_shape[2];
+    const int rows = dim0 * dim2;
+    const int total = dim0 * sort_len * dim2;
+    const std::vector<int> swapped_shape{dim0, dim2, sort_len};
+
+    T* values_in = guard.alloc_l3_or_gm<T>(total);
+    T* values_sorted = guard.alloc_l3_or_gm<T>(total);
+    int* positions32 = guard.alloc_l3_or_gm<int>(total);
+    int64_t* positions64 = guard.alloc_l3_or_gm<int64_t>(total);
+
+    // Move the sorted extent to the innermost position, then sort each row.
+    xpu_transpose(ctx, input_data, values_in, data_shape, permute);
+    xpu_argsort(ctx, values_in, values_sorted, positions32, rows, sort_len,
+                descending);
+
+    // Values go straight back; indices are widened to int64_t first.
+    xpu_transpose(ctx, values_sorted, output_data, swapped_shape, permute);
+    xpu_cast(ctx, positions32, positions64, total);
+    xpu_transpose(ctx, positions64, indices_data, swapped_shape, permute);
+  }
+};
+
+// Argsort variant for int64_t input that sorts in 32-bit precision.
+//
+// The int64_t input is first converted to int with xpu_cast, then handled
+// like the other variants: rearranged with `permute`, sorted over d1 of
+// data_shape = {d0, d1, d2} with int indices, and rearranged back. Both the
+// sorted values and the indices are converted back to int64_t before they
+// are written to output_data / indices_data.
+// NOTE(review): input values outside the int range would not survive the
+// narrowing cast — confirm callers only reach this path with safe data.
+template <>
+struct XPUArgsort<int64_t, true, true> {
+  void operator()(xpu::Context* ctx, const int64_t* input_data,
+                  int64_t* output_data, int64_t* indices_data,
+                  const std::vector<int>& data_shape,
+                  const std::vector<int>& permute, bool descending) {
+    // Scratch buffers below are obtained through this guard.
+    xpu::ctx_guard guard(ctx);
+
+    const int dim0 = data_shape[0];
+    const int sort_len = data_shape[1];
+    const int dim2 = data_shape[2];
+    const int rows = dim0 * dim2;
+    const int total = dim0 * sort_len * dim2;
+    const std::vector<int> swapped_shape{dim0, dim2, sort_len};
+
+    int* values_in = guard.alloc_l3_or_gm<int>(total);
+    int* values_sorted = guard.alloc_l3_or_gm<int>(total);
+    int* positions32 = guard.alloc_l3_or_gm<int>(total);
+    int* narrowed_input = guard.alloc_l3_or_gm<int>(total);
+    int64_t* widened = guard.alloc_l3_or_gm<int64_t>(total);
+
+    // Narrow the input, move the sorted extent innermost, sort each row.
+    xpu_cast(ctx, input_data, narrowed_input, total);
+    xpu_transpose(ctx, narrowed_input, values_in, data_shape, permute);
+    xpu_argsort(ctx, values_in, values_sorted, positions32, rows, sort_len,
+                descending);
+
+    // `widened` is reused: first for the sorted values, then for the indices.
+    xpu_cast(ctx, values_sorted, widened, total);
+    xpu_transpose(ctx, widened, output_data, swapped_shape, permute);
+    xpu_cast(ctx, positions32, widened, total);
+    xpu_transpose(ctx, widened, indices_data, swapped_shape, permute);
+  }
+};
+
+// XPU kernel for the argsort op.
+//
+// Reads tensor "X" and writes the sorted values to "Out" and the int64_t
+// positions to "Indices". The sort runs along attribute "axis" (a negative
+// axis counts from the last dimension) and is descending when attribute
+// "descending" is true.
+//
+// The input is viewed as {len_before, n, len_after}, where n is the extent
+// of the sorted axis, and handed to one of the XPUArgsort variants:
+//   - n <= XPU_SORT_MAX_SIZE / 2: no casts;
+//   - n >  XPU_SORT_MAX_SIZE / 2: 32-bit indices that are widened afterwards,
+//     and for int64_t input the values are sorted in 32-bit as well.
+// An axis extent that is not below XPU_SORT_MAX_SIZE raises InvalidArgument.
+template <typename T>
+class ArgsortXPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* input = ctx.Input<framework::Tensor>("X");
+    auto* output = ctx.Output<framework::Tensor>("Out");
+    auto* indices = ctx.Output<framework::Tensor>("Indices");
+    int axis = ctx.Attr<int>("axis");
+    const bool descending = ctx.Attr<bool>("descending");
+
+    auto in_dims = input->dims();
+    axis = (axis < 0) ? (in_dims.size() + axis) : axis;
+
+    // Check the 64-bit extent before narrowing it, so that a very large
+    // dimension cannot wrap around into the accepted range.
+    const int64_t axis_len = in_dims[axis];
+    const int64_t sort_limit = XPU_SORT_MAX_SIZE;
+    PADDLE_ENFORCE_LT(
+        axis_len, sort_limit,
+        platform::errors::InvalidArgument(
+            "The axis dimension of Input should be less than %d, but got %d.",
+            XPU_SORT_MAX_SIZE, axis_len));
+    const int n = static_cast<int>(axis_len);
+
+    auto input_data = input->data<T>();
+    auto output_data = output->mutable_data<T>(ctx.GetPlace());
+    auto indices_data = indices->mutable_data<int64_t>(ctx.GetPlace());
+
+    auto& dev_ctx =
+        ctx.template device_context<paddle::platform::XPUDeviceContext>();
+
+    // Collapse everything before / after the sorted axis into one extent each.
+    const int len_before = static_cast<int>(
+        framework::product(framework::slice_ddim(in_dims, 0, axis)));
+    const int len_after = static_cast<int>(framework::product(
+        framework::slice_ddim(in_dims, axis + 1, in_dims.size())));
+
+    const bool index_need_cast = n > (XPU_SORT_MAX_SIZE / 2);
+    const bool int64_need_cast =
+        std::is_same<T, int64_t>::value && index_need_cast;
+
+    const std::vector<int> permute_vec{0, 2, 1};
+    const std::vector<int> data_shape{len_before, n, len_after};
+
+    if (int64_need_cast) {
+      XPUArgsort<T, true, true>()(dev_ctx.x_context(), input_data, output_data,
+                                  indices_data, data_shape, permute_vec,
+                                  descending);
+    } else if (index_need_cast) {
+      XPUArgsort<T, false, true>()(dev_ctx.x_context(), input_data, output_data,
+                                   indices_data, data_shape, permute_vec,
+                                   descending);
+    } else {
+      XPUArgsort<T, false, false>()(dev_ctx.x_context(), input_data,
+                                    output_data, indices_data, data_shape,
+                                    permute_vec, descending);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+// Register the XPU argsort kernel for float, int and int64_t element types.
+REGISTER_OP_XPU_KERNEL(argsort, ops::ArgsortXPUKernel<float>,
+                       ops::ArgsortXPUKernel<int>,
+                       ops::ArgsortXPUKernel<int64_t>);
+
+#endif
diff --git a/paddle/fluid/operators/scatter_op_xpu.cc b/paddle/fluid/operators/scatter_op_xpu.cc
@@ -0,0 +1,114 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_XPU
+#include <memory>
+#include <string>
+
+#include "paddle/fluid/operators/scatter_op.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+// XPU kernel for the scatter op.
+//
+// Copies input "X" into output "Out", then calls xpu::scatter with the
+// "Updates" data, the "Ids" indices and the "overwrite" attribute to update
+// Out. "Ids" must hold INT32 or INT64 values and be 1-D, or 2-D with a second
+// dimension of 1; every dimension of X after the first must match the same
+// dimension of Updates. Violations raise InvalidArgument, and a failing
+// xpu::scatter call raises an External error.
+template <typename T>
+class ScatterOpXPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto *x = ctx.Input<Tensor>("X");
+    auto *index = ctx.Input<Tensor>("Ids");
+    auto *updates = ctx.Input<Tensor>("Updates");
+    auto *out = ctx.Output<Tensor>("Out");
+    bool overwrite = ctx.Attr<bool>("overwrite");
+
+    // In place output: Out = X, Out[ids] = Updates
+    framework::TensorCopy(*x, ctx.GetPlace(), out);
+
+    // Apply ScatterUpdate: Out[index] = Updates[:]
+    const auto &index_type = index->type();
+    bool index_type_match = index_type == framework::proto::VarType::INT32 ||
+                            index_type == framework::proto::VarType::INT64;
+    PADDLE_ENFORCE_EQ(index_type_match, true,
+                      platform::errors::InvalidArgument(
+                          "Index holds the wrong type, it holds [%s], "
+                          "but desires to be [%s] or [%s].",
+                          paddle::framework::DataTypeToString(index_type),
+                          paddle::framework::DataTypeToString(
+                              framework::proto::VarType::INT32),
+                          paddle::framework::DataTypeToString(
+                              framework::proto::VarType::INT64)));
+
+    // Index must be 1-D, or 2-D with a trailing dimension of 1.
+    PADDLE_ENFORCE_EQ(
+        index->dims().size() == 1 ||
+            (index->dims().size() == 2 && index->dims()[1] == 1),
+        true, platform::errors::InvalidArgument(
+                  "The shape of index is invalid, expect the number of "
+                  "dimensions of index to be 1, or 2 with index.dims[1] "
+                  "equal to 1, but got %d dimensions.",
+                  index->dims().size()));
+
+    int index_size = static_cast<int>(index->dims()[0]);
+    auto x_dims = x->dims();
+    auto update_dims = updates->dims();
+    // Every dimension except the first must agree between X and Updates.
+    for (int i = 1; i < x_dims.size(); i++) {
+      PADDLE_ENFORCE_EQ(
+          x_dims[i], update_dims[i],
+          platform::errors::InvalidArgument(
+              "The dimensions of the source tensor and target tensor should"
+              " match, but received source tensor's %d-th dimension is %d, "
+              "target tensor's %d-th dimension is %d.",
+              i, x_dims[i], i, update_dims[i]));
+    }
+
+    // View Out as a dim0 x dim1 matrix: dim0 is the first extent of X and
+    // dim1 is the product of all remaining extents.
+    int dim0 = static_cast<int>(x_dims[0]);
+    int dim1 = static_cast<int>(
+        framework::product(framework::slice_ddim(x_dims, 1, x_dims.size())));
+    T *out_data = out->data<T>();
+    const T *updates_data = updates->data<T>();
+
+    auto &dev_ctx =
+        ctx.template device_context<paddle::platform::XPUDeviceContext>();
+    int r = XPU_SUCCESS;
+
+    // xpu::VectorParam is given both a CPU-side copy of the indices and the
+    // pointer held by the original index tensor.
+    Tensor indices_cpu(index->type());
+    framework::TensorCopy(*index, platform::CPUPlace(), &indices_cpu);
+
+    if (index_type == framework::proto::VarType::INT32) {
+      auto index_data = const_cast<int *>(index->data<int>());
+      xpu::VectorParam<int> indices{indices_cpu.data<int>(), index_size,
+                                    index_data};
+      r = xpu::scatter(dev_ctx.x_context(), updates_data, out_data, indices,
+                       dim0, dim1, overwrite);
+    } else {
+      auto index_data = const_cast<int64_t *>(index->data<int64_t>());
+      xpu::VectorParam<int64_t> indices{indices_cpu.data<int64_t>(), index_size,
+                                        index_data};
+      r = xpu::scatter(dev_ctx.x_context(), updates_data, out_data, indices,
+                       dim0, dim1, overwrite);
+    }
+    PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
+                      platform::errors::External(
+                          "XPU scatter kernel return wrong value[%d %s]", r,
+                          XPUAPIErrorMsg[r]));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+// Register the XPU scatter kernel for float and int64_t element types.
+REGISTER_OP_XPU_KERNEL(scatter, ops::ScatterOpXPUKernel<float>,
+                       ops::ScatterOpXPUKernel<int64_t>);
+#endif
diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h
@@ -32,6 +32,9 @@ XPUOpMap& get_kl2_ops() {
       {"adamw", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
       {"adam", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
       {"arg_max", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"argsort", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()),
+                                pOpKernelType(vartype::INT64, XPUPlace()),
+                                pOpKernelType(vartype::FP32, XPUPlace())})},
       {"assign_value",
        XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
       {"batch_norm_grad",
@@ -263,6 +266,8 @@ XPUOpMap& get_kl2_ops() {
       {"scale", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
                               pOpKernelType(vartype::FP16, XPUPlace()),
                               pOpKernelType(vartype::INT64, XPUPlace())})},
+      {"scatter", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()),
+                                pOpKernelType(vartype::FP32, XPUPlace())})},
       {"shape", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
                               pOpKernelType(vartype::INT64, XPUPlace())})},
       {"slice_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),