smallv0221
diff --git a/‎paddle/fluid/operators/bincount_op.cc
+116 b/‎paddle/fluid/operators/bincount_op.cc
+116
diff --git a/‎paddle/fluid/operators/bincount_op.cu
+160 b/‎paddle/fluid/operators/bincount_op.cu
+160
diff --git a/‎paddle/fluid/operators/bincount_op.h
+109 b/‎paddle/fluid/operators/bincount_op.h
+109
diff --git a/‎paddle/fluid/pybind/op_function_generator.cc
+4 b/‎paddle/fluid/pybind/op_function_generator.cc
+4
@@ -0,0 +1,116 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/bincount_op.h"
+
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+namespace paddle {
+namespace operators {
+
+using framework::OpKernelType;
+using framework::Tensor;
+
+class BincountOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
+                      platform::errors::InvalidArgument(
+                          "Input(X) of BincountOp should not be null."));
+    PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
+                      platform::errors::InvalidArgument(
+                          "Output(Out) of BincountOp should not be null."));
+
+    auto input_dim = ctx->GetInputDim("X");
+    auto minlength = ctx->Attrs().Get<int>("minlength");
+
+    PADDLE_ENFORCE_GE(minlength, 0,
+                      platform::errors::InvalidArgument(
+                          "The minlength should be greater than or equal to 0."
+                          "But received minlength is %d",
+                          minlength));
+
+    PADDLE_ENFORCE_EQ(input_dim.size(), 1,
+                      platform::errors::InvalidArgument(
+                          "The 'shape' of Input(X) must be 1-D tensor."
+                          "But the dimension of Input(X) is [%d]",
+                          input_dim.size()));
+
+    if (ctx->HasInput("Weights")) {
+      auto weights_dim = ctx->GetInputDim("Weights");
+      PADDLE_ENFORCE_EQ(weights_dim.size(), 1,
+                        platform::errors::InvalidArgument(
+                            "The 'shape' of Input(Weights) must be 1-D tensor."
+                            "But the dimension of Input(Weights) is [%d]",
+                            weights_dim.size()));
+
+      PADDLE_ENFORCE_EQ(
+          weights_dim[0], input_dim[0],
+          platform::errors::InvalidArgument(
+              "The 'shape' of Input(Weights) must be equal to the 'shape' of "
+              "Input(X)."
+              "But received: the 'shape' of Input(Weights) is [%s],"
+              "the 'shape' of Input(X) is [%s]",
+              weights_dim, input_dim));
+    }
+
+    ctx->SetOutputDim("Out", framework::make_ddim({-1}));
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const {
+    auto data_type =
+        ctx.HasInput("Weights")
+            ? OperatorWithKernel::IndicateVarDataType(ctx, "Weights")
+            : OperatorWithKernel::IndicateVarDataType(ctx, "X");
+    return framework::OpKernelType(data_type, ctx.device_context());
+  }
+};
+
+class BincountOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "(Tensor) The input tensor of Bincount op,");
+    AddInput("Weights", "(Tensor) The weights tensor of Bincount op,")
+        .AsDispensable();
+    AddOutput("Out", "(Tensor) The output tensor of Bincount op,");
+    AddAttr<int>("minlength", "(int) The minimal numbers of bins")
+        .SetDefault(0)
+        .EqualGreaterThan(0);
+    AddComment(R"DOC(
+          Bincount Operator.
+          Computes frequency of each value in the input tensor.
+          Elements of input tensor should be non-negative ints.
+      )DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(
+    bincount, ops::BincountOp, ops::BincountOpMaker,
+    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
+    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
+REGISTER_OP_CPU_KERNEL(
+    bincount, ops::BincountKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::BincountKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::BincountKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::BincountKernel<paddle::platform::CPUDeviceContext, int64_t>);
@@ -0,0 +1,160 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/operators/bincount_op.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
+#include "paddle/fluid/platform/gpu_launch_config.h"
+#include "paddle/fluid/platform/hostdevice.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using platform::PADDLE_CUDA_NUM_THREADS;
+
+inline int GET_BLOCKS(const int N) {
+  return (N + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS;
+}
+
+template <typename T, typename InputT, typename OutT>
+__global__ void KernelBincount(const InputT* input, const int total_elements,
+                               const bool has_weights, const T* weights,
+                               OutT* output) {
+  if (!has_weights) {
+    for (int i = threadIdx.x; i < total_elements; i += blockDim.x) {
+      paddle::platform::CudaAtomicAdd(&output[input[i]], 1L);
+    }
+  } else {
+    for (int i = threadIdx.x; i < total_elements; i += blockDim.x) {
+      paddle::platform::CudaAtomicAdd(&output[input[i]],
+                                      static_cast<OutT>(weights[i]));
+    }
+  }
+}
+
+template <typename DeviceContext, typename T, typename InputT>
+void BincountCUDAInner(const framework::ExecutionContext& context) {
+  const Tensor* input = context.Input<framework::Tensor>("X");
+  const Tensor* weights = context.Input<framework::Tensor>("Weights");
+  Tensor* output = context.Output<framework::Tensor>("Out");
+  auto& minlength = context.Attr<int>("minlength");
+
+  const InputT* input_data = input->data<InputT>();
+
+  const int input_numel = input->numel();
+
+  if (input_data == nullptr) {
+    framework::DDim out_dim{0};
+    output->Resize(out_dim);
+    output->mutable_data<T>(context.GetPlace());
+    return;
+  }
+  auto input_x = framework::EigenVector<InputT>::Flatten(*input);
+
+  framework::Tensor input_min_t, input_max_t;
+  auto* input_max_data =
+      input_max_t.mutable_data<InputT>({1}, context.GetPlace());
+  auto* input_min_data =
+      input_min_t.mutable_data<InputT>({1}, context.GetPlace());
+
+  auto input_max_scala = framework::EigenScalar<InputT>::From(input_max_t);
+  auto input_min_scala = framework::EigenScalar<InputT>::From(input_min_t);
+
+  auto* place = context.template device_context<DeviceContext>().eigen_device();
+  input_max_scala.device(*place) = input_x.maximum();
+  input_min_scala.device(*place) = input_x.minimum();
+
+  Tensor input_min_cpu, input_max_cpu;
+  TensorCopySync(input_max_t, platform::CPUPlace(), &input_max_cpu);
+  TensorCopySync(input_min_t, platform::CPUPlace(), &input_min_cpu);
+
+  InputT input_min = input_min_cpu.data<InputT>()[0];
+
+  PADDLE_ENFORCE_GE(
+      input_min, static_cast<InputT>(0),
+      platform::errors::InvalidArgument(
+          "The elements in input tensor must be non-negative ints"));
+
+  int64_t output_size =
+      static_cast<int64_t>(input_max_cpu.data<InputT>()[0]) + 1L;
+
+  output_size = std::max(output_size, static_cast<int64_t>(minlength));
+  framework::DDim out_dim{output_size};
+  output->Resize(out_dim);
+
+  bool has_weights = (weights != nullptr);
+
+  const T* weights_data = has_weights ? weights->data<T>() : nullptr;
+
+  auto stream =
+      context.template device_context<platform::CUDADeviceContext>().stream();
+
+  if (!has_weights) {
+    int64_t* output_data = output->mutable_data<int64_t>(context.GetPlace());
+    math::SetConstant<DeviceContext, int64_t>()(
+        context.template device_context<DeviceContext>(), output, 0L);
+
+    KernelBincount<T, InputT, int64_t><<<GET_BLOCKS(input_numel),
+                                         PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
+        input_data, input_numel, has_weights, weights_data, output_data);
+  } else {
+    const auto& weights_type = weights->type();
+
+    if (weights_type == framework::proto::VarType::FP32) {
+      float* output_data = output->mutable_data<float>(context.GetPlace());
+      math::SetConstant<DeviceContext, float>()(
+          context.template device_context<DeviceContext>(), output,
+          static_cast<float>(0));
+
+      KernelBincount<T, InputT, float><<<GET_BLOCKS(input_numel),
+                                         PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
+          input_data, input_numel, has_weights, weights_data, output_data);
+    } else {
+      double* output_data = output->mutable_data<double>(context.GetPlace());
+      math::SetConstant<DeviceContext, double>()(
+          context.template device_context<DeviceContext>(), output,
+          static_cast<double>(0));
+
+      KernelBincount<T, InputT, double><<<GET_BLOCKS(input_numel),
+                                          PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
+          input_data, input_numel, has_weights, weights_data, output_data);
+    }
+  }
+}
+
+template <typename DeviceContext, typename T>
+class BincountCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const Tensor* input = context.Input<framework::Tensor>("X");
+    const auto& input_type = input->type();
+
+    if (input_type == framework::proto::VarType::INT32) {
+      BincountCUDAInner<DeviceContext, T, int>(context);
+    } else if (input_type == framework::proto::VarType::INT64) {
+      BincountCUDAInner<DeviceContext, T, int64_t>(context);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    bincount, ops::BincountCUDAKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::BincountCUDAKernel<paddle::platform::CUDADeviceContext, int64_t>,
+    ops::BincountCUDAKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::BincountCUDAKernel<paddle::platform::CUDADeviceContext, double>);
@@ -0,0 +1,109 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <algorithm>
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename DeviceContext, typename T, typename InputT>
+void BincountInner(const framework::ExecutionContext& context) {
+  const Tensor* input = context.Input<framework::Tensor>("X");
+  const Tensor* weights = context.Input<framework::Tensor>("Weights");
+  Tensor* output = context.Output<framework::Tensor>("Out");
+  auto& minlength = context.Attr<int>("minlength");
+
+  const InputT* input_data = input->data<InputT>();
+
+  auto input_numel = input->numel();
+
+  if (input_data == nullptr) {
+    framework::DDim out_dim{0};
+    output->Resize(out_dim);
+    output->mutable_data<InputT>(context.GetPlace());
+    return;
+  }
+
+  PADDLE_ENFORCE_GE(
+      *std::min_element(input_data, input_data + input_numel),
+      static_cast<InputT>(0),
+      platform::errors::InvalidArgument(
+          "The elements in input tensor must be non-negative ints"));
+
+  int64_t output_size = static_cast<int64_t>(*std::max_element(
+                            input_data, input_data + input_numel)) +
+                        1L;
+  output_size = std::max(output_size, static_cast<int64_t>(minlength));
+
+  framework::DDim out_dim{output_size};
+  output->Resize(out_dim);
+
+  bool has_weights = (weights != nullptr);
+
+  if (has_weights) {
+    const T* weights_data = weights->data<T>();
+    const auto& weights_type = weights->type();
+    if (weights_type == framework::proto::VarType::FP32) {
+      float* output_data = output->mutable_data<float>(context.GetPlace());
+      math::SetConstant<DeviceContext, float>()(
+          context.template device_context<DeviceContext>(), output,
+          static_cast<float>(0));
+      for (int64_t i = 0; i < input_numel; i++) {
+        output_data[input_data[i]] += static_cast<float>(weights_data[i]);
+      }
+    } else {
+      double* output_data = output->mutable_data<double>(context.GetPlace());
+      math::SetConstant<DeviceContext, double>()(
+          context.template device_context<DeviceContext>(), output,
+          static_cast<double>(0));
+      for (int64_t i = 0; i < input_numel; i++) {
+        output_data[input_data[i]] += static_cast<double>(weights_data[i]);
+      }
+    }
+
+  } else {
+    int64_t* output_data = output->mutable_data<int64_t>(context.GetPlace());
+    math::SetConstant<DeviceContext, int64_t>()(
+        context.template device_context<DeviceContext>(), output, 0L);
+    for (int64_t i = 0; i < input_numel; i++) {
+      output_data[input_data[i]] += 1L;
+    }
+  }
+}
+
+template <typename DeviceContext, typename T>
+class BincountKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const Tensor* input = context.Input<framework::Tensor>("X");
+    const auto& input_type = input->type();
+
+    if (input_type == framework::proto::VarType::INT32) {
+      BincountInner<DeviceContext, T, int>(context);
+    } else if (input_type == framework::proto::VarType::INT64) {
+      BincountInner<DeviceContext, T, int64_t>(context);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
@@ -40,6 +40,10 @@
 // need to manually specify them in this map.
 std::map<std::string, std::set<std::string>> op_ins_map = {
     {"layer_norm", {"X", "Scale", "Bias"}},
+    {"bincount", {"X", "Weights"}},
+    {"fused_attention",
+     {"X", "LnScale", "LnBias", "QKVW", "QKVBias", "SrcMask", "OutLinearW",
+      "OutLinearBias", "Ln2Scale", "Ln2Bias"}},
     {"instance_norm", {"X", "Scale", "Bias"}},
     {"gru_unit", {"Input", "HiddenPrev", "Weight", "Bias"}},
     {"label_smooth", {"X", "PriorDist"}},