Commit a02d1aa

[Deep alignment] dot (#75717)
* fix
* fix
* fix dcu
Parent: 1990bcc

4 files changed (+139, -3 lines)

paddle/phi/backends/dynload/cublas.h

Lines changed: 8 additions & 1 deletion
@@ -106,7 +106,14 @@ extern void *cublas_dso_handle;
   __macro(cublasCmatinvBatched); \
   __macro(cublasZmatinvBatched); \
   __macro(cublasSgetrsBatched); \
-  __macro(cublasDgetrsBatched);
+  __macro(cublasDgetrsBatched); \
+  __macro(cublasSdot_v2); \
+  __macro(cublasDdot_v2); \
+  __macro(cublasCdotc_v2); \
+  __macro(cublasZdotc_v2); \
+  __macro(cublasCdotu_v2); \
+  __macro(cublasZdotu_v2); \
+  __macro(cublasDotEx);
 
 CUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
 
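For context: each __macro(name) entry above is expanded by DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP into a small functor that resolves the symbol from the cuBLAS shared library on first use, so adding the seven dot routines here is what makes them callable through phi::dynload. A minimal sketch of that wrapper pattern, assuming the usual dlopen/dlsym mechanics (the real macro in paddle/phi/backends/dynload adds once-only initialization and error reporting):

  // Simplified sketch of the dynamic-load wrapper pattern; illustrative
  // only, not the exact Paddle macro.
  #include <dlfcn.h>

  extern void *cublas_dso_handle;  // opened elsewhere via dlopen()

  #define DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name)             \
    struct DynLoad__##__name {                                 \
      template <typename... Args>                              \
      auto operator()(Args... args) {                          \
        using FuncType = decltype(&::__name);                  \
        /* resolve the symbol once, then cache it */           \
        static void *sym = dlsym(cublas_dso_handle, #__name);  \
        return reinterpret_cast<FuncType>(sym)(args...);       \
      }                                                        \
    };                                                         \
    extern DynLoad__##__name __name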

paddle/phi/kernels/funcs/blas/blas.h

Lines changed: 9 additions & 0 deletions
@@ -283,6 +283,10 @@ class Blas {
   template <typename T>
   T DOT(int n, const T* x, const T* y) const;
 
+  template <typename T>
+  void CUDOT(
+      int n, const T* x, int incx, const T* y, int incy, T* result) const;
+
   template <typename T>
   void SCAL(int n, const T a, T* x) const;
 
@@ -543,6 +547,11 @@ class BlasT : private Blas<DeviceContext> {
     return Base()->template DOT<T>(args...);
   }
 
+  template <typename... ARGS>
+  void CUDOT(ARGS... args) const {
+    Base()->template CUDOT<T>(args...);
+  }
+
   template <typename... ARGS>
   void SCAL(ARGS... args) const {
     Base()->template SCAL<T>(args...);
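In BLAS level-1 terms, the incx and incy parameters of the new CUDOT are element strides: the routine computes result = sum over i of x[i*incx] * y[i*incy]. A plain reference sketch of those semantics (illustrative, not part of the commit):

  // Reference semantics of a strided BLAS dot product, i.e. what the
  // cublas*dot_v2 routines compute on the device. Illustrative only.
  template <typename T>
  T DotReference(int n, const T *x, int incx, const T *y, int incy) {
    T acc = T(0);
    for (int i = 0; i < n; ++i) {
      acc += x[i * incx] * y[i * incy];
    }
    return acc;
  }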

paddle/phi/kernels/funcs/blas/blas_impl.cu.h

Lines changed: 96 additions & 0 deletions
@@ -211,6 +211,11 @@ struct CUBlas<float> {
   static void TRSM_BATCH(ARGS... args) {
     PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasStrsmBatched(args...));
   }
+
+  template <typename... ARGS>
+  static void DOT(ARGS... args) {
+    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasSdot_v2(args...));
+  }
 };
 
 template <>
@@ -302,6 +307,11 @@ struct CUBlas<double> {
   static void TRSM_BATCH(ARGS... args) {
     PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasDtrsmBatched(args...));
   }
+
+  template <typename... ARGS>
+  static void DOT(ARGS... args) {
+    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasDdot_v2(args...));
+  }
 };
 
 template <>
@@ -559,6 +569,26 @@ struct CUBlas<phi::float16> {
         "cublasGemmEx_64 is not supported on cuda < 12.3"));
 #endif
   }
+
+  static void DOT(cublasHandle_t handle,
+                  int n,
+                  const phi::float16 *x,
+                  const int incx,
+                  const phi::float16 *y,
+                  const int incy,
+                  phi::float16 *result) {
+    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasDotEx(handle,
+                                                         n,
+                                                         x,
+                                                         CUDA_R_16F,
+                                                         incx,
+                                                         y,
+                                                         CUDA_R_16F,
+                                                         incy,
+                                                         result,
+                                                         CUDA_R_16F,
+                                                         CUDA_R_32F));
+  }
 };
 
 template <>
@@ -908,6 +938,23 @@ struct CUBlas<phi::complex64> {
                                                       info,
                                                       batch_size));
   }
+
+  static void DOT(cublasHandle_t handle,
+                  int n,
+                  const phi::complex64 *x,
+                  const int incx,
+                  const phi::complex64 *y,
+                  const int incy,
+                  phi::complex64 *result) {
+    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasCdotu_v2(
+        handle,
+        n,
+        reinterpret_cast<const cuFloatComplex *>(x),
+        incx,
+        reinterpret_cast<const cuFloatComplex *>(y),
+        incy,
+        reinterpret_cast<cuFloatComplex *>(result)));
+  }
 };
 
 template <>
@@ -1257,6 +1304,23 @@ struct CUBlas<phi::complex128> {
                                                       info,
                                                       batch_size));
   }
+
+  static void DOT(cublasHandle_t handle,
+                  int n,
+                  const phi::complex128 *x,
+                  const int incx,
+                  const phi::complex128 *y,
+                  const int incy,
+                  phi::complex128 *result) {
+    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasZdotu_v2(
+        handle,
+        n,
+        reinterpret_cast<const cuDoubleComplex *>(x),
+        incx,
+        reinterpret_cast<const cuDoubleComplex *>(y),
+        incy,
+        reinterpret_cast<cuDoubleComplex *>(result)));
+  }
 };
 
 inline void CheckGEMMNSize(int64_t N) {
@@ -2289,6 +2353,38 @@ void Blas<phi::GPUContext>::AXPY(int n, T alpha, const T *x, T *y) const {
   });
 }
 
+template <>
+template <typename T>
+void Blas<phi::GPUContext>::CUDOT(
+    int n, const T *x, int incx, const T *y, int incy, T *result) const {
+  dev_ctx_.CublasCall([&](cublasHandle_t handle) {
+    CUBlas<T>::DOT(handle, n, x, incx, y, incy, result);
+  });
+}
+
+template <>
+template <>
+inline void Blas<phi::GPUContext>::CUDOT(int n,
+                                         const phi::bfloat16 *x,
+                                         int incx,
+                                         const phi::bfloat16 *y,
+                                         int incy,
+                                         phi::bfloat16 *result) const {
+  dev_ctx_.CublasCall([&](cublasHandle_t handle) {
+    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasDotEx(handle,
+                                                         n,
+                                                         x,
+                                                         CUDA_R_16BF,
+                                                         incx,
+                                                         y,
+                                                         CUDA_R_16BF,
+                                                         incy,
+                                                         result,
+                                                         CUDA_R_16BF,
+                                                         CUDA_R_32F));
+  });
+}
+
 template <>
 template <typename T>
 void Blas<phi::GPUContext>::SCAL(int n, const T alpha, T *x) const {
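Both 16-bit paths above go through cublasDotEx rather than a typed dot routine; its trailing executionType parameter lets inputs and result stay in 16-bit storage while cuBLAS accumulates in float. A standalone sketch against the raw API, with a hypothetical helper name and device pointers assumed valid:

  // Minimal sketch: half-precision dot with float accumulation via
  // cublasDotEx. HalfDot is a hypothetical helper; d_x, d_y, d_result
  // are device pointers.
  #include <cublas_v2.h>
  #include <cuda_fp16.h>

  cublasStatus_t HalfDot(cublasHandle_t handle, int n,
                         const __half *d_x, const __half *d_y,
                         __half *d_result) {
    return cublasDotEx(handle, n,
                       d_x, CUDA_R_16F, /*incx=*/1,
                       d_y, CUDA_R_16F, /*incy=*/1,
                       d_result, CUDA_R_16F,
                       /*executionType=*/CUDA_R_32F);  // accumulate in float
  }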

paddle/phi/kernels/gpu/dot_kernel.cu

Lines changed: 26 additions & 2 deletions
@@ -13,9 +13,9 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/dot_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/funcs/eigen/common.h"
 
 #include "paddle/phi/kernels/full_kernel.h"
@@ -36,14 +36,39 @@ void DotKernel(const Context& dev_ctx,
   if (out->numel() <= 0) {
     return;
   }
+  auto x_data = x.data<T>();
+  auto y_data = y.data<T>();
   dev_ctx.template Alloc<T>(out);
+  auto out_data = out->data<T>();
   if (out->dims().size() == 0) {
+#ifdef PADDLE_WITH_CUDA
+    if constexpr (std::is_same_v<T, int> || std::is_same_v<T, int64_t>) {
+      auto eigen_out = phi::EigenScalar<T>::From(*out);
+      auto eigen_x = phi::EigenVector<T>::Flatten(x);
+      auto eigen_y = phi::EigenVector<T>::Flatten(y);
+
+      auto& dev = *dev_ctx.eigen_device();
+      eigen_out.device(dev) = (eigen_x * eigen_y).sum();
+    } else {
+      const int n = static_cast<int>(x.numel());
+      int incx = static_cast<int>(x.strides()[0]);
+      int incy = static_cast<int>(x.strides()[0]);
+      if (n == 1) {
+        incx = 1;
+        incy = 1;
+      }
+
+      auto blas = phi::funcs::GetBlas<phi::GPUContext, T>(dev_ctx);
+      blas.CUDOT(n, x_data, incx, y_data, incy, out_data);
+    }
+#else
     auto eigen_out = phi::EigenScalar<T>::From(*out);
     auto eigen_x = phi::EigenVector<T>::Flatten(x);
     auto eigen_y = phi::EigenVector<T>::Flatten(y);
 
     auto& dev = *dev_ctx.eigen_device();
     eigen_out.device(dev) = (eigen_x * eigen_y).sum();
+#endif
   } else {
     auto eigen_out = phi::EigenVector<T>::From(*out);
     auto eigen_x = phi::EigenMatrix<T>::From(x);
@@ -53,7 +78,6 @@ void DotKernel(const Context& dev_ctx,
     eigen_out.device(dev) = (eigen_x * eigen_y).sum(Eigen::DSizes<int, 1>(1));
   }
 }
-
 }  // namespace phi
 
 using complex64 = phi::complex64;
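cuBLAS exposes no integer dot routine, which is what the if constexpr branch above encodes: int and int64_t instantiations keep the Eigen reduction, while floating-point and complex types route through the new CUDOT. The same split written as a compile-time trait (hypothetical name, not in the commit):

  // Hypothetical trait mirroring the DotKernel dispatch: true for types
  // the cuBLAS path handles, false for the integer fallback.
  #include <cstdint>
  #include <type_traits>

  template <typename T>
  constexpr bool kUseCublasDot =
      !(std::is_same_v<T, int> || std::is_same_v<T, int64_t>);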
