
Commit e7f06dd

[PHI] Fix adaptive pool2d kernel for big tensor
1 parent 758b14c commit e7f06dd

File tree

3 files changed: +52 −235 lines
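What this commit does: the adaptive pool2d paths computed flat sizes and element offsets in 32-bit int, which overflows once the relevant products exceed 2^31 on big tensors. The kernels now take and compute int64_t dimensions and offsets, the launch math uses a new 64-bit GetInt64LastPow2 helper, dead kernel arguments are dropped, and a Pool2dFunctor overload (the one without data_format) is removed — presumably unused, since no call sites change in this diff. A minimal sketch of the overflow class being fixed, with hypothetical dimensions not taken from this commit:

#include <cstdint>
#include <iostream>

int main() {
  // Hypothetical big tensor: 256 channels of 4096 x 4096 spatial data.
  int channels = 256, height = 4096, width = 4096;

  // All-int arithmetic: 256 * 4096 * 4096 == 2^32 overflows int
  // (undefined behavior; on common targets it wraps to 0) before
  // the widening assignment can help.
  int64_t bad = channels * height * width;

  // Widening an operand first, as the kernels now do, keeps it exact.
  int64_t good = static_cast<int64_t>(channels) * height * width;

  std::cout << bad << " vs " << good << "\n";  // typically 0 vs 4294967296
  return 0;
}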

paddle/phi/kernels/funcs/pooling.cc

Lines changed: 0 additions & 77 deletions
@@ -29,83 +29,6 @@ namespace phi::funcs {
 template <typename PoolProcess, typename T>
 class Pool2dFunctor<CPUContext, PoolProcess, T> {
  public:
-  void operator()(const CPUContext& context,
-                  const DenseTensor& input,
-                  const std::vector<int>& ksize,
-                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings,
-                  bool exclusive,
-                  bool adaptive,
-                  DenseTensor* output,
-                  PoolProcess pool_process) {
-    const int batch_size = static_cast<int>(input.dims()[0]);
-    const int input_height = static_cast<int>(input.dims()[2]);
-    const int input_width = static_cast<int>(input.dims()[3]);
-    const int output_channels = static_cast<int>(output->dims()[1]);
-    const int output_height = static_cast<int>(output->dims()[2]);
-    const int output_width = static_cast<int>(output->dims()[3]);
-    const int ksize_height = ksize[0];
-    const int ksize_width = ksize[1];
-    const int stride_height = strides[0];
-    const int stride_width = strides[1];
-    const int padding_height = paddings[0];
-    const int padding_width = paddings[1];
-
-    const int input_stride = input_height * input_width;
-    const int output_stride = output_height * output_width;
-
-    const T* input_data = input.data<T>();
-    T* output_data = context.template Alloc<T>(output);
-
-    int hstart = 0, hend = 1;
-    int wstart = 0, wend = 1;
-    for (int i = 0; i < batch_size; i++) {
-      for (int c = 0; c < output_channels; ++c) {
-        for (int ph = 0; ph < output_height; ++ph) {
-          if (adaptive) {
-            hstart = AdaptStartIndex(ph, input_height, output_height);
-            hend = AdaptEndIndex(ph, input_height, output_height);
-          }
-          for (int pw = 0; pw < output_width; ++pw) {
-            int pool_size = 1;
-            if (adaptive) {
-              wstart = AdaptStartIndex(pw, input_width, output_width);
-              wend = AdaptEndIndex(pw, input_width, output_width);
-            } else {
-              hstart = ph * stride_height - padding_height;
-              wstart = pw * stride_width - padding_width;
-              hend = std::min(hstart + ksize_height,
-                              input_height + padding_height);
-              wend =
-                  std::min(wstart + ksize_width, input_width + padding_width);
-              pool_size = (hend - hstart) * (wend - wstart);
-
-              wstart = std::max(wstart, 0);
-              hstart = std::max(hstart, 0);
-              hend = std::min(hend, input_height);
-              wend = std::min(wend, input_width);
-            }
-
-            T ele = pool_process.initial();
-            for (int h = hstart; h < hend; ++h) {
-              for (int w = wstart; w < wend; ++w) {
-                pool_process.compute(input_data[h * input_width + w], &ele);
-              }
-            }
-            if (exclusive || adaptive) {
-              pool_size = (hend - hstart) * (wend - wstart);
-            }
-
-            pool_process.finalize(static_cast<T>(pool_size), &ele);
-            output_data[ph * output_width + pw] = ele;
-          }
-        }
-        input_data += input_stride;
-        output_data += output_stride;
-      }
-    }
-  }
-
   void operator()(const CPUContext& context,
                   const DenseTensor& input,
                   const std::vector<int>& ksize,

paddle/phi/kernels/funcs/pooling.cu

Lines changed: 38 additions & 147 deletions
@@ -180,59 +180,52 @@ __global__ void KernelPool2D(const int nthreads,
 }
 
 template <typename PoolProcess, typename T>
-__global__ void AdaptiveKernelPool2D(const int nthreads,
-                                     const T* input_data,
-                                     const int channels,
-                                     const int input_height,
-                                     const int input_width,
-                                     const int output_height,
-                                     const int output_width,
-                                     const int ksize_height,
-                                     const int ksize_width,
-                                     const int stride_height,
-                                     const int stride_width,
-                                     const int padding_height,
-                                     const int padding_width,
-                                     FastDivModForPooling divmods,
+__global__ void AdaptiveKernelPool2D(const T* input_data,
+                                     const int64_t channels,
+                                     const int64_t input_height,
+                                     const int64_t input_width,
+                                     const int64_t output_height,
+                                     const int64_t output_width,
                                      PoolProcess pool_process,
                                      bool exclusive,
                                      T* output_data,
                                      bool channel_last = false) {
-  const int n_offset = blockIdx.y;
-  const int c_offset = blockIdx.x * blockDim.y + threadIdx.y;
+  const int64_t n_offset = blockIdx.y;
+  const int64_t c_offset = blockIdx.x * blockDim.y + threadIdx.y;
   if (c_offset >= channels) {
     return;
   }
-  int hstart, hend, wstart, wend;
-  int input_offset =
+  int64_t hstart, hend, wstart, wend;
+  int64_t input_offset =
       channel_last
           ? n_offset * input_height * input_width * channels
           : (n_offset * channels + c_offset) * input_height * input_width;
-  int output_offset =
+  int64_t output_offset =
       channel_last
           ? n_offset * output_height * output_width * channels
           : (n_offset * channels + c_offset) * output_height * output_width;
-  for (int hw_offset = threadIdx.x; hw_offset < output_height * output_width;
+  for (int64_t hw_offset = threadIdx.x;
+       hw_offset < output_height * output_width;
        hw_offset += blockDim.x) {
-    int w_offset = hw_offset % output_width;
-    int h_offset = hw_offset / output_width;
+    int64_t w_offset = hw_offset % output_width;
+    int64_t h_offset = hw_offset / output_width;
     hstart = AdaptStartIndex(h_offset, input_height, output_height);
     hend = AdaptEndIndex(h_offset, input_height, output_height);
     wstart = AdaptStartIndex(w_offset, input_width, output_width);
     wend = AdaptEndIndex(w_offset, input_width, output_width);
 
     T ele = pool_process.initial();
-    for (int h = hstart; h < hend; ++h) {
-      for (int w = wstart; w < wend; ++w) {
-        auto input_idx = channel_last
-                             ? (h * input_width + w) * channels + c_offset
-                             : h * input_width + w;
+    for (int64_t h = hstart; h < hend; ++h) {
+      for (int64_t w = wstart; w < wend; ++w) {
+        int64_t input_idx = channel_last
+                                ? (h * input_width + w) * channels + c_offset
+                                : h * input_width + w;
         pool_process.compute(input_data[input_offset + input_idx], &ele);
       }
     }
-    int pool_size = (hend - hstart) * (wend - wstart);
+    int64_t pool_size = (hend - hstart) * (wend - wstart);
     pool_process.finalize(static_cast<T>(pool_size), &ele);
-    int output_idx =
+    int64_t output_idx =
         channel_last
             ? (h_offset * output_width + w_offset) * channels + c_offset
             : h_offset * output_width + w_offset;
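Why the ksize/stride/padding/divmods parameters could be dropped above: the adaptive kernel derives every pooling window purely from the input and output extents through AdaptStartIndex/AdaptEndIndex, so those arguments were dead. The helpers' definitions are not part of this diff; below is a sketch of the conventional adaptive-index formulas, widened to int64_t as the call sites now require — an assumption about their shape, not a copy of Paddle's code:

#include <cstdint>

// Output cell `idx` of `out_size` cells maps to input rows/cols
// [start, end). Note that `idx * in_size` is itself the kind of
// product that motivates 64-bit arithmetic on big tensors.
inline int64_t AdaptStart(int64_t idx, int64_t in_size, int64_t out_size) {
  return idx * in_size / out_size;                         // floor
}
inline int64_t AdaptEnd(int64_t idx, int64_t in_size, int64_t out_size) {
  return ((idx + 1) * in_size + out_size - 1) / out_size;  // ceil
}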
@@ -478,20 +471,12 @@ void Pool2dDirectCUDAFunctor<PoolProcess, T>::operator()(
                   batch_size,
                   1);
     AdaptiveKernelPool2D<PoolProcess, T>
-        <<<grid, threads, 0, stream>>>(nthreads,
-                                       input,
+        <<<grid, threads, 0, stream>>>(input,
                                        input_channels,
                                        input_height,
                                        input_width,
                                        output_height,
                                        output_width,
-                                       ksize_height,
-                                       ksize_width,
-                                       stride_height,
-                                       stride_width,
-                                       padding_height,
-                                       padding_width,
-                                       pool_divmods,
                                        pool_compute,
                                        exclusive,
                                        output);
@@ -535,94 +520,6 @@ void Pool2dDirectCUDAFunctor<PoolProcess, T>::operator()(
 template <typename PoolProcess, typename T>
 class Pool2dFunctor<phi::GPUContext, PoolProcess, T> {
  public:
-  void operator()(const phi::GPUContext& context,
-                  const DenseTensor& input,
-                  const std::vector<int>& ksize,
-                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings,
-                  bool exclusive,
-                  bool adaptive,
-                  DenseTensor* output,
-                  PoolProcess pool_process) {
-    const int batch_size = input.dims()[0];
-    const int input_channels = input.dims()[1];
-    const int input_height = input.dims()[2];
-    const int input_width = input.dims()[3];
-    const int output_channels = output->dims()[1];
-    const int output_height = output->dims()[2];
-    const int output_width = output->dims()[3];
-    const int ksize_height = ksize[0];
-    const int ksize_width = ksize[1];
-    const int stride_height = strides[0];
-    const int stride_width = strides[1];
-    const int padding_height = paddings[0];
-    const int padding_width = paddings[1];
-
-    const T* input_data = input.data<T>();
-    T* output_data = context.template Alloc<T>(output);
-
-    int64_t nthreads = static_cast<int64_t>(batch_size) * output_channels *
-                       output_height * output_width;
-    auto pool_divmods =
-        FastDivModForPooling(input_channels, output_width, output_height);
-    if (adaptive) {
-      int64_t max_threads = 512;
-      int64_t thread_num = std::min(
-          phi::funcs::details::GetLastPow2(output_height * output_width),
-          max_threads);
-      int64_t blocks = std::min(max_threads / thread_num,
-                                static_cast<int64_t>(output_channels));
-      dim3 threads(thread_num, blocks, 1);
-      dim3 grid(std::max((output_channels + blocks - 1) / blocks,
-                         static_cast<int64_t>(1)),
-                batch_size,
-                1);
-      AdaptiveKernelPool2D<PoolProcess, T>
-          <<<grid, threads, 0, context.stream()>>>(nthreads,
-                                                   input_data,
-                                                   input_channels,
-                                                   input_height,
-                                                   input_width,
-                                                   output_height,
-                                                   output_width,
-                                                   ksize_height,
-                                                   ksize_width,
-                                                   stride_height,
-                                                   stride_width,
-                                                   padding_height,
-                                                   padding_width,
-                                                   pool_divmods,
-                                                   pool_process,
-                                                   exclusive,
-                                                   output_data);
-    } else {
-      int thread_num = 1024;
-#ifdef WITH_NV_JETSON
-      backends::gpu::ChangeThreadNum(context, &thread_num);
-#endif
-      int blocks = (nthreads + thread_num - 1) / thread_num;
-      dim3 threads(thread_num, 1);
-      dim3 grid(blocks, 1);
-      KernelPool2D<PoolProcess, T>
-          <<<grid, threads, 0, context.stream()>>>(nthreads,
-                                                   input_data,
-                                                   input_channels,
-                                                   input_height,
-                                                   input_width,
-                                                   output_height,
-                                                   output_width,
-                                                   ksize_height,
-                                                   ksize_width,
-                                                   stride_height,
-                                                   stride_width,
-                                                   padding_height,
-                                                   padding_width,
-                                                   pool_divmods,
-                                                   pool_process,
-                                                   exclusive,
-                                                   output_data);
-    }
-  }
   void operator()(const phi::GPUContext& context,
                   const DenseTensor& input,
                   const std::vector<int>& ksize,
@@ -634,17 +531,20 @@ class Pool2dFunctor<phi::GPUContext, PoolProcess, T> {
                   DenseTensor* output,
                   PoolProcess pool_process) {
     bool channel_last = (data_format == "NHWC");
-    const int batch_size = input.dims()[0];
+    const int64_t batch_size = input.dims()[0];
 
-    const int input_channels = channel_last ? input.dims()[3] : input.dims()[1];
-    const int input_height = channel_last ? input.dims()[1] : input.dims()[2];
-    const int input_width = channel_last ? input.dims()[2] : input.dims()[3];
+    const int64_t input_channels =
+        channel_last ? input.dims()[3] : input.dims()[1];
+    const int64_t input_height =
+        channel_last ? input.dims()[1] : input.dims()[2];
+    const int64_t input_width =
+        channel_last ? input.dims()[2] : input.dims()[3];
 
-    const int output_channels =
+    const int64_t output_channels =
         channel_last ? output->dims()[3] : output->dims()[1];
-    const int output_height =
+    const int64_t output_height =
         channel_last ? output->dims()[1] : output->dims()[2];
-    const int output_width =
+    const int64_t output_width =
         channel_last ? output->dims()[2] : output->dims()[3];
 
     const int ksize_height = ksize[0];
@@ -659,37 +559,28 @@ class Pool2dFunctor<phi::GPUContext, PoolProcess, T> {
     const T* input_data = input.data<T>();
     T* output_data = context.template Alloc<T>(output);
 
-    int64_t nthreads = static_cast<int64_t>(batch_size) * output_channels *
-                       output_height * output_width;
+    int64_t nthreads =
+        batch_size * output_channels * output_height * output_width;
     auto pool_divmods =
         FastDivModForPooling(input_channels, output_width, output_height);
     if (adaptive) {
       int64_t max_threads = 512;
       int64_t thread_num = std::min(
-          phi::funcs::details::GetLastPow2(output_height * output_width),
+          phi::funcs::details::GetInt64LastPow2(output_height * output_width),
           max_threads);
-      int64_t blocks = std::min(max_threads / thread_num,
-                                static_cast<int64_t>(output_channels));
+      int64_t blocks = std::min(max_threads / thread_num, output_channels);
       dim3 threads(thread_num, blocks, 1);
       dim3 grid(std::max((output_channels + blocks - 1) / blocks,
                          static_cast<int64_t>(1)),
                 batch_size,
                 1);
       AdaptiveKernelPool2D<PoolProcess, T>
-          <<<grid, threads, 0, context.stream()>>>(nthreads,
-                                                   input_data,
+          <<<grid, threads, 0, context.stream()>>>(input_data,
                                                    input_channels,
                                                    input_height,
                                                    input_width,
                                                    output_height,
                                                    output_width,
-                                                   ksize_height,
-                                                   ksize_width,
-                                                   stride_height,
-                                                   stride_width,
-                                                   padding_height,
-                                                   padding_width,
-                                                   pool_divmods,
                                                    pool_process,
                                                    exclusive,
                                                    output_data,
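For reference, the launch shape the adaptive branch builds is unchanged by this commit; only its arithmetic is now 64-bit. threadIdx.x strides over output pixels, threadIdx.y over channels, and blockIdx.y over the batch. A standalone sketch of that computation with hypothetical sizes (LastPow2 stands in for the new helper):

#include <algorithm>
#include <cstdint>
#include <iostream>

// Stand-in for phi::funcs::details::GetInt64LastPow2:
// largest power of two <= x.
static int64_t LastPow2(int64_t x) {
  int64_t p = 1;
  while (p * 2 <= x) p *= 2;
  return p;
}

int main() {
  // Hypothetical shape: batch 8, 64 channels, 100 x 100 output.
  int64_t batch_size = 8, output_channels = 64;
  int64_t output_height = 100, output_width = 100;

  int64_t max_threads = 512;
  int64_t thread_num =
      std::min(LastPow2(output_height * output_width), max_threads);
  int64_t blocks = std::min(max_threads / thread_num, output_channels);
  int64_t grid_x = std::max((output_channels + blocks - 1) / blocks,
                            static_cast<int64_t>(1));

  // Mirrors: dim3 threads(thread_num, blocks, 1);
  //          dim3 grid(grid_x, batch_size, 1);
  std::cout << "threads=(" << thread_num << "," << blocks << ",1) "
            << "grid=(" << grid_x << "," << batch_size << ",1)\n";
  return 0;  // prints threads=(512,1,1) grid=(64,8,1)
}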

paddle/phi/kernels/funcs/pooling.h

Lines changed: 14 additions & 11 deletions
@@ -30,6 +30,20 @@ limitations under the License. */
 namespace phi {
 namespace funcs {
 
+namespace details {
+static inline int64_t GetInt64LastPow2(int64_t x) {
+  if (x <= 0) return 0;
+  uint64_t ux = x;
+  ux |= (ux >> 1);
+  ux |= (ux >> 2);
+  ux |= (ux >> 4);
+  ux |= (ux >> 8);
+  ux |= (ux >> 16);
+  ux |= (ux >> 32);
+  return static_cast<int64_t>(ux - (ux >> 1));
+}
+}  // namespace details
+
 /*
  * \brief Extracting simple operations from pooling.
  * Both MaxPool and AvgPool need "initial", "compute" and "finalize"
@@ -211,17 +225,6 @@ class Pool2dDirectCUDAFunctor {
 template <typename Context, typename PoolProcess, typename T>
 class Pool2dFunctor {
  public:
-  void operator()(const Context& context,
-                  const DenseTensor& input,
-                  const std::vector<int>& ksize,
-                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings,
-                  bool exclusive,
-                  bool adaptive,
-                  DenseTensor* output,
-                  PoolProcess pool_compute);
-
-  // overload operator() to support argument data_format
   void operator()(const Context& context,
                   const DenseTensor& input,
                   const std::vector<int>& ksize,
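The helper added above returns the largest power of two not exceeding x (and 0 for non-positive x), now over the full int64_t range rather than int. A standalone sketch checking that contract, with the function body copied from the hunk:

#include <cassert>
#include <cstdint>

static inline int64_t GetInt64LastPow2(int64_t x) {
  if (x <= 0) return 0;
  uint64_t ux = x;
  // Smear the highest set bit into all lower positions...
  ux |= (ux >> 1);
  ux |= (ux >> 2);
  ux |= (ux >> 4);
  ux |= (ux >> 8);
  ux |= (ux >> 16);
  ux |= (ux >> 32);
  // ...then subtract everything below it, leaving only the top bit.
  return static_cast<int64_t>(ux - (ux >> 1));
}

int main() {
  assert(GetInt64LastPow2(0) == 0);
  assert(GetInt64LastPow2(1) == 1);
  assert(GetInt64LastPow2(7) == 4);
  assert(GetInt64LastPow2(8) == 8);
  // Beyond the 32-bit range, where an int-based helper could not go.
  assert(GetInt64LastPow2((int64_t{1} << 40) + 5) == (int64_t{1} << 40));
  return 0;
}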
