
Commit e7f06dd

[PHI] Fix adaptive pool2d kernel for big tensor
1 parent 758b14c commit e7f06dd

File tree

3 files changed: +52 −235 lines
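What this commit does: the adaptive pool2d paths computed flat sizes and element offsets in 32-bit int, which overflows once the relevant products exceed 2^31 on big tensors. The kernels now take and compute int64_t dimensions and offsets, the launch math uses a new 64-bit GetInt64LastPow2 helper, dead kernel arguments are dropped, and a Pool2dFunctor overload (the one without data_format) is removed — presumably unused, since no call sites change in this diff. A minimal sketch of the overflow class being fixed, with hypothetical dimensions not taken from this commit:

#include <cstdint>
#include <iostream>

int main() {
  // Hypothetical big tensor: 256 channels of 4096 x 4096 spatial data.
  int channels = 256, height = 4096, width = 4096;

  // All-int arithmetic: 256 * 4096 * 4096 == 2^32 overflows int
  // (undefined behavior; on common targets it wraps to 0) before
  // the widening assignment can help.
  int64_t bad = channels * height * width;

  // Widening an operand first, as the kernels now do, keeps it exact.
  int64_t good = static_cast<int64_t>(channels) * height * width;

  std::cout << bad << " vs " << good << "\n";  // typically 0 vs 4294967296
  return 0;
}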

paddle/phi/kernels/funcs/pooling.cc

Lines changed: 0 additions & 77 deletions
@@ -29,83 +29,6 @@ namespace phi::funcs {
 template <typename PoolProcess, typename T>
 class Pool2dFunctor<CPUContext, PoolProcess, T> {
  public:
-  void operator()(const CPUContext& context,
-                  const DenseTensor& input,
-                  const std::vector<int>& ksize,
-                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings,
-                  bool exclusive,
-                  bool adaptive,
-                  DenseTensor* output,
-                  PoolProcess pool_process) {
-    const int batch_size = static_cast<int>(input.dims()[0]);
-    const int input_height = static_cast<int>(input.dims()[2]);
-    const int input_width = static_cast<int>(input.dims()[3]);
-    const int output_channels = static_cast<int>(output->dims()[1]);
-    const int output_height = static_cast<int>(output->dims()[2]);
-    const int output_width = static_cast<int>(output->dims()[3]);
-    const int ksize_height = ksize[0];
-    const int ksize_width = ksize[1];
-    const int stride_height = strides[0];
-    const int stride_width = strides[1];
-    const int padding_height = paddings[0];
-    const int padding_width = paddings[1];
-
-    const int input_stride = input_height * input_width;
-    const int output_stride = output_height * output_width;
-
-    const T* input_data = input.data<T>();
-    T* output_data = context.template Alloc<T>(output);
-
-    int hstart = 0, hend = 1;
-    int wstart = 0, wend = 1;
-    for (int i = 0; i < batch_size; i++) {
-      for (int c = 0; c < output_channels; ++c) {
-        for (int ph = 0; ph < output_height; ++ph) {
-          if (adaptive) {
-            hstart = AdaptStartIndex(ph, input_height, output_height);
-            hend = AdaptEndIndex(ph, input_height, output_height);
-          }
-          for (int pw = 0; pw < output_width; ++pw) {
-            int pool_size = 1;
-            if (adaptive) {
-              wstart = AdaptStartIndex(pw, input_width, output_width);
-              wend = AdaptEndIndex(pw, input_width, output_width);
-            } else {
-              hstart = ph * stride_height - padding_height;
-              wstart = pw * stride_width - padding_width;
-              hend = std::min(hstart + ksize_height,
-                              input_height + padding_height);
-              wend =
-                  std::min(wstart + ksize_width, input_width + padding_width);
-              pool_size = (hend - hstart) * (wend - wstart);
-
-              wstart = std::max(wstart, 0);
-              hstart = std::max(hstart, 0);
-              hend = std::min(hend, input_height);
-              wend = std::min(wend, input_width);
-            }
-
-            T ele = pool_process.initial();
-            for (int h = hstart; h < hend; ++h) {
-              for (int w = wstart; w < wend; ++w) {
-                pool_process.compute(input_data[h * input_width + w], &ele);
-              }
-            }
-            if (exclusive || adaptive) {
-              pool_size = (hend - hstart) * (wend - wstart);
-            }
-
-            pool_process.finalize(static_cast<T>(pool_size), &ele);
-            output_data[ph * output_width + pw] = ele;
-          }
-        }
-        input_data += input_stride;
-        output_data += output_stride;
-      }
-    }
-  }
-
   void operator()(const CPUContext& context,
                   const DenseTensor& input,
                   const std::vector<int>& ksize,

paddle/phi/kernels/funcs/pooling.cu

Lines changed: 38 additions & 147 deletions
@@ -180,59 +180,52 @@ __global__ void KernelPool2D(const int nthreads,
 }
 
 template <typename PoolProcess, typename T>
-__global__ void AdaptiveKernelPool2D(const int nthreads,
-                                     const T* input_data,
-                                     const int channels,
-                                     const int input_height,
-                                     const int input_width,
-                                     const int output_height,
-                                     const int output_width,
-                                     const int ksize_height,
-                                     const int ksize_width,
-                                     const int stride_height,
-                                     const int stride_width,
-                                     const int padding_height,
-                                     const int padding_width,
-                                     FastDivModForPooling divmods,
+__global__ void AdaptiveKernelPool2D(const T* input_data,
+                                     const int64_t channels,
+                                     const int64_t input_height,
+                                     const int64_t input_width,
+                                     const int64_t output_height,
+                                     const int64_t output_width,
                                      PoolProcess pool_process,
                                      bool exclusive,
                                      T* output_data,
                                      bool channel_last = false) {
-  const int n_offset = blockIdx.y;
-  const int c_offset = blockIdx.x * blockDim.y + threadIdx.y;
+  const int64_t n_offset = blockIdx.y;
+  const int64_t c_offset = blockIdx.x * blockDim.y + threadIdx.y;
   if (c_offset >= channels) {
     return;
   }
-  int hstart, hend, wstart, wend;
-  int input_offset =
+  int64_t hstart, hend, wstart, wend;
+  int64_t input_offset =
       channel_last
           ? n_offset * input_height * input_width * channels
           : (n_offset * channels + c_offset) * input_height * input_width;
-  int output_offset =
+  int64_t output_offset =
       channel_last
           ? n_offset * output_height * output_width * channels
           : (n_offset * channels + c_offset) * output_height * output_width;
-  for (int hw_offset = threadIdx.x; hw_offset < output_height * output_width;
+  for (int64_t hw_offset = threadIdx.x;
+       hw_offset < output_height * output_width;
        hw_offset += blockDim.x) {
-    int w_offset = hw_offset % output_width;
-    int h_offset = hw_offset / output_width;
+    int64_t w_offset = hw_offset % output_width;
+    int64_t h_offset = hw_offset / output_width;
     hstart = AdaptStartIndex(h_offset, input_height, output_height);
     hend = AdaptEndIndex(h_offset, input_height, output_height);
     wstart = AdaptStartIndex(w_offset, input_width, output_width);
     wend = AdaptEndIndex(w_offset, input_width, output_width);
 
     T ele = pool_process.initial();
-    for (int h = hstart; h < hend; ++h) {
-      for (int w = wstart; w < wend; ++w) {
-        auto input_idx = channel_last
-                             ? (h * input_width + w) * channels + c_offset
-                             : h * input_width + w;
+    for (int64_t h = hstart; h < hend; ++h) {
+      for (int64_t w = wstart; w < wend; ++w) {
+        int64_t input_idx = channel_last
+                                ? (h * input_width + w) * channels + c_offset
+                                : h * input_width + w;
         pool_process.compute(input_data[input_offset + input_idx], &ele);
       }
     }
-    int pool_size = (hend - hstart) * (wend - wstart);
+    int64_t pool_size = (hend - hstart) * (wend - wstart);
     pool_process.finalize(static_cast<T>(pool_size), &ele);
-    int output_idx =
+    int64_t output_idx =
         channel_last
             ? (h_offset * output_width + w_offset) * channels + c_offset
             : h_offset * output_width + w_offset;
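Why the ksize/stride/padding/divmods parameters could be dropped above: the adaptive kernel derives every pooling window purely from the input and output extents through AdaptStartIndex/AdaptEndIndex, so those arguments were dead. The helpers' definitions are not part of this diff; below is a sketch of the conventional adaptive-index formulas, widened to int64_t as the call sites now require — an assumption about their shape, not a copy of Paddle's code:

#include <cstdint>

// Output cell `idx` of `out_size` cells maps to input rows/cols
// [start, end). Note that `idx * in_size` is itself the kind of
// product that motivates 64-bit arithmetic on big tensors.
inline int64_t AdaptStart(int64_t idx, int64_t in_size, int64_t out_size) {
  return idx * in_size / out_size;                         // floor
}
inline int64_t AdaptEnd(int64_t idx, int64_t in_size, int64_t out_size) {
  return ((idx + 1) * in_size + out_size - 1) / out_size;  // ceil
}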
@@ -478,20 +471,12 @@ void Pool2dDirectCUDAFunctor<PoolProcess, T>::operator()(
                   batch_size,
                   1);
     AdaptiveKernelPool2D<PoolProcess, T>
-        <<<grid, threads, 0, stream>>>(nthreads,
-                                       input,
+        <<<grid, threads, 0, stream>>>(input,
                                        input_channels,
                                        input_height,
                                        input_width,
                                        output_height,
                                        output_width,
-                                       ksize_height,
-                                       ksize_width,
-                                       stride_height,
-                                       stride_width,
-                                       padding_height,
-                                       padding_width,
-                                       pool_divmods,
                                        pool_compute,
                                        exclusive,
                                        output);
@@ -535,94 +520,6 @@ void Pool2dDirectCUDAFunctor<PoolProcess, T>::operator()(
 template <typename PoolProcess, typename T>
 class Pool2dFunctor<phi::GPUContext, PoolProcess, T> {
  public:
-  void operator()(const phi::GPUContext& context,
-                  const DenseTensor& input,
-                  const std::vector<int>& ksize,
-                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings,
-                  bool exclusive,
-                  bool adaptive,
-                  DenseTensor* output,
-                  PoolProcess pool_process) {
-    const int batch_size = input.dims()[0];
-    const int input_channels = input.dims()[1];
-    const int input_height = input.dims()[2];
-    const int input_width = input.dims()[3];
-    const int output_channels = output->dims()[1];
-    const int output_height = output->dims()[2];
-    const int output_width = output->dims()[3];
-    const int ksize_height = ksize[0];
-    const int ksize_width = ksize[1];
-    const int stride_height = strides[0];
-    const int stride_width = strides[1];
-    const int padding_height = paddings[0];
-    const int padding_width = paddings[1];
-
-    const T* input_data = input.data<T>();
-    T* output_data = context.template Alloc<T>(output);
-
-    int64_t nthreads = static_cast<int64_t>(batch_size) * output_channels *
-                       output_height * output_width;
-    auto pool_divmods =
-        FastDivModForPooling(input_channels, output_width, output_height);
-    if (adaptive) {
-      int64_t max_threads = 512;
-      int64_t thread_num = std::min(
-          phi::funcs::details::GetLastPow2(output_height * output_width),
-          max_threads);
-      int64_t blocks = std::min(max_threads / thread_num,
-                                static_cast<int64_t>(output_channels));
-      dim3 threads(thread_num, blocks, 1);
-      dim3 grid(std::max((output_channels + blocks - 1) / blocks,
-                         static_cast<int64_t>(1)),
-                batch_size,
-                1);
-      AdaptiveKernelPool2D<PoolProcess, T>
-          <<<grid, threads, 0, context.stream()>>>(nthreads,
-                                                   input_data,
-                                                   input_channels,
-                                                   input_height,
-                                                   input_width,
-                                                   output_height,
-                                                   output_width,
-                                                   ksize_height,
-                                                   ksize_width,
-                                                   stride_height,
-                                                   stride_width,
-                                                   padding_height,
-                                                   padding_width,
-                                                   pool_divmods,
-                                                   pool_process,
-                                                   exclusive,
-                                                   output_data);
-    } else {
-      int thread_num = 1024;
-#ifdef WITH_NV_JETSON
-      backends::gpu::ChangeThreadNum(context, &thread_num);
-#endif
-      int blocks = (nthreads + thread_num - 1) / thread_num;
-      dim3 threads(thread_num, 1);
-      dim3 grid(blocks, 1);
-      KernelPool2D<PoolProcess, T>
-          <<<grid, threads, 0, context.stream()>>>(nthreads,
-                                                   input_data,
-                                                   input_channels,
-                                                   input_height,
-                                                   input_width,
-                                                   output_height,
-                                                   output_width,
-                                                   ksize_height,
-                                                   ksize_width,
-                                                   stride_height,
-                                                   stride_width,
-                                                   padding_height,
-                                                   padding_width,
-                                                   pool_divmods,
-                                                   pool_process,
-                                                   exclusive,
-                                                   output_data);
-    }
-  }
   void operator()(const phi::GPUContext& context,
                   const DenseTensor& input,
                   const std::vector<int>& ksize,
@@ -634,17 +531,20 @@ class Pool2dFunctor<phi::GPUContext, PoolProcess, T> {
                   DenseTensor* output,
                   PoolProcess pool_process) {
     bool channel_last = (data_format == "NHWC");
-    const int batch_size = input.dims()[0];
+    const int64_t batch_size = input.dims()[0];
 
-    const int input_channels = channel_last ? input.dims()[3] : input.dims()[1];
-    const int input_height = channel_last ? input.dims()[1] : input.dims()[2];
-    const int input_width = channel_last ? input.dims()[2] : input.dims()[3];
+    const int64_t input_channels =
+        channel_last ? input.dims()[3] : input.dims()[1];
+    const int64_t input_height =
+        channel_last ? input.dims()[1] : input.dims()[2];
+    const int64_t input_width =
+        channel_last ? input.dims()[2] : input.dims()[3];
 
-    const int output_channels =
+    const int64_t output_channels =
         channel_last ? output->dims()[3] : output->dims()[1];
-    const int output_height =
+    const int64_t output_height =
         channel_last ? output->dims()[1] : output->dims()[2];
-    const int output_width =
+    const int64_t output_width =
         channel_last ? output->dims()[2] : output->dims()[3];
 
     const int ksize_height = ksize[0];
@@ -659,37 +559,28 @@ class Pool2dFunctor<phi::GPUContext, PoolProcess, T> {
     const T* input_data = input.data<T>();
     T* output_data = context.template Alloc<T>(output);
 
-    int64_t nthreads = static_cast<int64_t>(batch_size) * output_channels *
-                       output_height * output_width;
+    int64_t nthreads =
+        batch_size * output_channels * output_height * output_width;
     auto pool_divmods =
         FastDivModForPooling(input_channels, output_width, output_height);
     if (adaptive) {
       int64_t max_threads = 512;
       int64_t thread_num = std::min(
-          phi::funcs::details::GetLastPow2(output_height * output_width),
+          phi::funcs::details::GetInt64LastPow2(output_height * output_width),
           max_threads);
-      int64_t blocks = std::min(max_threads / thread_num,
-                                static_cast<int64_t>(output_channels));
+      int64_t blocks = std::min(max_threads / thread_num, output_channels);
       dim3 threads(thread_num, blocks, 1);
       dim3 grid(std::max((output_channels + blocks - 1) / blocks,
                          static_cast<int64_t>(1)),
                 batch_size,
                 1);
       AdaptiveKernelPool2D<PoolProcess, T>
-          <<<grid, threads, 0, context.stream()>>>(nthreads,
-                                                   input_data,
+          <<<grid, threads, 0, context.stream()>>>(input_data,
                                                    input_channels,
                                                    input_height,
                                                    input_width,
                                                    output_height,
                                                    output_width,
-                                                   ksize_height,
-                                                   ksize_width,
-                                                   stride_height,
-                                                   stride_width,
-                                                   padding_height,
-                                                   padding_width,
-                                                   pool_divmods,
                                                    pool_process,
                                                    exclusive,
                                                    output_data,
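For reference, the launch shape the adaptive branch builds is unchanged by this commit; only its arithmetic is now 64-bit. threadIdx.x strides over output pixels, threadIdx.y over channels, and blockIdx.y over the batch. A standalone sketch of that computation with hypothetical sizes (LastPow2 stands in for the new helper):

#include <algorithm>
#include <cstdint>
#include <iostream>

// Stand-in for phi::funcs::details::GetInt64LastPow2:
// largest power of two <= x.
static int64_t LastPow2(int64_t x) {
  int64_t p = 1;
  while (p * 2 <= x) p *= 2;
  return p;
}

int main() {
  // Hypothetical shape: batch 8, 64 channels, 100 x 100 output.
  int64_t batch_size = 8, output_channels = 64;
  int64_t output_height = 100, output_width = 100;

  int64_t max_threads = 512;
  int64_t thread_num =
      std::min(LastPow2(output_height * output_width), max_threads);
  int64_t blocks = std::min(max_threads / thread_num, output_channels);
  int64_t grid_x = std::max((output_channels + blocks - 1) / blocks,
                            static_cast<int64_t>(1));

  // Mirrors: dim3 threads(thread_num, blocks, 1);
  //          dim3 grid(grid_x, batch_size, 1);
  std::cout << "threads=(" << thread_num << "," << blocks << ",1) "
            << "grid=(" << grid_x << "," << batch_size << ",1)\n";
  return 0;  // prints threads=(512,1,1) grid=(64,8,1)
}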

paddle/phi/kernels/funcs/pooling.h

Lines changed: 14 additions & 11 deletions
@@ -30,6 +30,20 @@ limitations under the License. */
 namespace phi {
 namespace funcs {
 
+namespace details {
+static inline int64_t GetInt64LastPow2(int64_t x) {
+  if (x <= 0) return 0;
+  uint64_t ux = x;
+  ux |= (ux >> 1);
+  ux |= (ux >> 2);
+  ux |= (ux >> 4);
+  ux |= (ux >> 8);
+  ux |= (ux >> 16);
+  ux |= (ux >> 32);
+  return static_cast<int64_t>(ux - (ux >> 1));
+}
+}  // namespace details
+
 /*
  * \brief Extracting simple operations from pooling.
  * Both MaxPool and AvgPool need "initial", "compute" and "finalize"
@@ -211,17 +225,6 @@ class Pool2dDirectCUDAFunctor {
 template <typename Context, typename PoolProcess, typename T>
 class Pool2dFunctor {
  public:
-  void operator()(const Context& context,
-                  const DenseTensor& input,
-                  const std::vector<int>& ksize,
-                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings,
-                  bool exclusive,
-                  bool adaptive,
-                  DenseTensor* output,
-                  PoolProcess pool_compute);
-
-  // overload operator() to support argument data_format
   void operator()(const Context& context,
                   const DenseTensor& input,
                   const std::vector<int>& ksize,
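The helper added above returns the largest power of two not exceeding x (and 0 for non-positive x), now over the full int64_t range rather than int. A standalone sketch checking that contract, with the function body copied from the hunk:

#include <cassert>
#include <cstdint>

static inline int64_t GetInt64LastPow2(int64_t x) {
  if (x <= 0) return 0;
  uint64_t ux = x;
  // Smear the highest set bit into all lower positions...
  ux |= (ux >> 1);
  ux |= (ux >> 2);
  ux |= (ux >> 4);
  ux |= (ux >> 8);
  ux |= (ux >> 16);
  ux |= (ux >> 32);
  // ...then subtract everything below it, leaving only the top bit.
  return static_cast<int64_t>(ux - (ux >> 1));
}

int main() {
  assert(GetInt64LastPow2(0) == 0);
  assert(GetInt64LastPow2(1) == 1);
  assert(GetInt64LastPow2(7) == 4);
  assert(GetInt64LastPow2(8) == 8);
  // Beyond the 32-bit range, where an int-based helper could not go.
  assert(GetInt64LastPow2((int64_t{1} << 40) + 5) == (int64_t{1} << 40));
  return 0;
}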
