// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include "longformer_attention.h"
#include "core/framework/tensorprotoutils.h"
#include "core/providers/cuda/cuda_common.h"
#include "core/providers/cuda/shared_inc/fpgeneric.h"
#include "longformer_attention_impl.h"

using namespace onnxruntime::cuda;
using namespace ::onnxruntime::common;
using namespace ONNX_NAMESPACE;
| 13 | + |
| 14 | +namespace onnxruntime { |
| 15 | +namespace contrib { |
| 16 | +namespace cuda { |
| 17 | + |
| 18 | +#define REGISTER_KERNEL_TYPED(T) \ |
| 19 | + ONNX_OPERATOR_TYPED_KERNEL_EX( \ |
| 20 | + LongformerAttention, \ |
| 21 | + kMSDomain, \ |
| 22 | + 1, \ |
| 23 | + T, \ |
| 24 | + kCudaExecutionProvider, \ |
| 25 | + KernelDefBuilder() \ |
| 26 | + .TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \ |
| 27 | + LongformerAttention<T>); |
| 28 | + |
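// Register the CUDA kernel for each supported input element type.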
REGISTER_KERNEL_TYPED(float)
REGISTER_KERNEL_TYPED(MLFloat16)

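// num_heads_ and window_ used below are provided by LongformerAttentionBase (presumably parsed from the node attributes).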
template <typename T>
LongformerAttention<T>::LongformerAttention(const OpKernelInfo& info) : CudaKernel(info), LongformerAttentionBase(info) {}

template <typename T>
Status LongformerAttention<T>::ComputeInternal(OpKernelContext* context) const {
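  // Fetch the inputs: hidden states, packed QKV projection weights/bias, attention mask,
  // global QKV projection weights/bias, and per-token global attention flags.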
  const Tensor* input = context->Input<Tensor>(0);
  const Tensor* weights = context->Input<Tensor>(1);
  const Tensor* bias = context->Input<Tensor>(2);
  const Tensor* mask = context->Input<Tensor>(3);
  const Tensor* global_weights = context->Input<Tensor>(4);
  const Tensor* global_bias = context->Input<Tensor>(5);
  const Tensor* global_attention = context->Input<Tensor>(6);
  ORT_RETURN_IF_ERROR(CheckInputs(input->Shape(), weights->Shape(), bias->Shape(), mask->Shape(),
                                  global_weights->Shape(), global_bias->Shape(), global_attention->Shape()));

  // Input and output shapes:
  //   Input 0  - input  : (batch_size, sequence_length, hidden_size)
  //   Output 0 - output : (batch_size, sequence_length, hidden_size)
  const auto& shape = input->Shape();
  int batch_size = static_cast<int>(shape[0]);
  int sequence_length = static_cast<int>(shape[1]);
  int hidden_size = static_cast<int>(shape[2]);
  int head_size = hidden_size / num_heads_;

  Tensor* output = context->Output(0, shape);

  cublasHandle_t cublas = CublasHandle();
  constexpr size_t element_size = sizeof(T);

  // Use GEMM for the fully connected QKV projection.
  int m = batch_size * sequence_length;
  int n = 3 * hidden_size;
  int k = hidden_size;

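  // Scratch buffer for the packed QKV projection result of shape (batch_size, sequence_length, 3 * hidden_size).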
  size_t qkv_size = batch_size * sequence_length * 3 * hidden_size * element_size;
  auto gemm_buffer = GetScratchBuffer<T>(qkv_size);

  typedef typename ToCudaType<T>::MappedType CudaT;
  CudaT one = ToCudaType<T>::FromFloat(1.0f);
  CudaT zero = ToCudaType<T>::FromFloat(0.0f);

  // Bias shape is (N), broadcast using B(N, M) = 1 * bias(N, 1) x ones(1, M) + 0 * B.
  auto& device_prop = GetDeviceProp();
  CUBLAS_RETURN_IF_ERROR(cublasGemmHelper(
      cublas, CUBLAS_OP_N, CUBLAS_OP_N, n, m, 1, &one,
      reinterpret_cast<const CudaT*>(bias->template Data<T>()), n,
      GetConstOnes<CudaT>(m), 1,
      &zero, reinterpret_cast<CudaT*>(gemm_buffer.get()), n, device_prop));

  // Gemm, note that CUDA assumes col-major, so result(N, M) = 1 * weights x input + 1 x B.
  CUBLAS_RETURN_IF_ERROR(cublasGemmHelper(
      cublas, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, &one,
      reinterpret_cast<const CudaT*>(weights->template Data<T>()), n,
      reinterpret_cast<const CudaT*>(input->template Data<T>()), k,
      &one, reinterpret_cast<CudaT*>(gemm_buffer.get()), n, device_prop));
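  // gemm_buffer now holds bias + input x weights, i.e. the packed Q, K and V projections for every token.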

  // TODO: calculate the exact value from global flags.
  int max_num_global = sequence_length;
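  // Until the flags are scanned, this conservatively assumes every token may be a global attention token.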

  // Fully connected projection for global attention.
  // Note that Q only needs to handle the global query tokens if we split the GEMM into separate global Q/K/V GEMMs.
  // When there is no global token, the global GEMM can be skipped.
  auto global_gemm_buffer = GetScratchBuffer<T>(max_num_global > 0 ? qkv_size : 0);

  if (max_num_global > 0) {
    CUBLAS_RETURN_IF_ERROR(cublasGemmHelper(
        cublas, CUBLAS_OP_N, CUBLAS_OP_N, n, m, 1, &one,
        reinterpret_cast<const CudaT*>(global_bias->template Data<T>()), n,
        GetConstOnes<CudaT>(m), 1,
        &zero, reinterpret_cast<CudaT*>(global_gemm_buffer.get()), n, device_prop));

    CUBLAS_RETURN_IF_ERROR(cublasGemmHelper(
        cublas, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, &one,
        reinterpret_cast<const CudaT*>(global_weights->template Data<T>()), n,
        reinterpret_cast<const CudaT*>(input->template Data<T>()), k,
        &one, reinterpret_cast<CudaT*>(global_gemm_buffer.get()), n, device_prop));
  }

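  // Temporary workspace for the fused attention kernel; its size depends on the batch size,
  // number of heads, head size, sequence length and the (maximum) number of global tokens.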
  size_t workspace_size = GetLongformerAttentionWorkspaceSize(element_size, batch_size, num_heads_, head_size,
                                                              sequence_length, max_num_global);
  auto workspace_buffer = GetScratchBuffer<void>(workspace_size);
  if (!LaunchLongformerAttentionKernel(
          device_prop,
          reinterpret_cast<const CudaT*>(gemm_buffer.get()),
          reinterpret_cast<const CudaT*>(mask->template Data<T>()),
          reinterpret_cast<const CudaT*>(global_gemm_buffer.get()),
          global_attention->template Data<int>(),
          output->template MutableData<T>(),
          batch_size,
          sequence_length,
          num_heads_,
          head_size,
          window_,
          max_num_global,
          workspace_buffer.get(),
          cublas,
          element_size)) {
    // Get last error to reset it to cudaSuccess.
    CUDA_CALL(cudaGetLastError());
    return Status(common::ONNXRUNTIME, common::FAIL);
  }

  return Status::OK();
}

}  // namespace cuda
}  // namespace contrib
}  // namespace onnxruntime