@@ -180,59 +180,52 @@ __global__ void KernelPool2D(const int nthreads,
 }
 
 template <typename PoolProcess, typename T>
-__global__ void AdaptiveKernelPool2D(const int nthreads,
-                                     const T* input_data,
-                                     const int channels,
-                                     const int input_height,
-                                     const int input_width,
-                                     const int output_height,
-                                     const int output_width,
-                                     const int ksize_height,
-                                     const int ksize_width,
-                                     const int stride_height,
-                                     const int stride_width,
-                                     const int padding_height,
-                                     const int padding_width,
-                                     FastDivModForPooling divmods,
+__global__ void AdaptiveKernelPool2D(const T* input_data,
+                                     const int64_t channels,
+                                     const int64_t input_height,
+                                     const int64_t input_width,
+                                     const int64_t output_height,
+                                     const int64_t output_width,
                                      PoolProcess pool_process,
                                      bool exclusive,
                                      T* output_data,
                                      bool channel_last = false) {
-  const int n_offset = blockIdx.y;
-  const int c_offset = blockIdx.x * blockDim.y + threadIdx.y;
+  const int64_t n_offset = blockIdx.y;
+  const int64_t c_offset = blockIdx.x * blockDim.y + threadIdx.y;
   if (c_offset >= channels) {
     return;
   }
-  int hstart, hend, wstart, wend;
-  int input_offset =
+  int64_t hstart, hend, wstart, wend;
+  int64_t input_offset =
       channel_last
           ? n_offset * input_height * input_width * channels
           : (n_offset * channels + c_offset) * input_height * input_width;
-  int output_offset =
+  int64_t output_offset =
       channel_last
           ? n_offset * output_height * output_width * channels
           : (n_offset * channels + c_offset) * output_height * output_width;
-  for (int hw_offset = threadIdx.x; hw_offset < output_height * output_width;
+  for (int64_t hw_offset = threadIdx.x;
+       hw_offset < output_height * output_width;
        hw_offset += blockDim.x) {
-    int w_offset = hw_offset % output_width;
-    int h_offset = hw_offset / output_width;
+    int64_t w_offset = hw_offset % output_width;
+    int64_t h_offset = hw_offset / output_width;
     hstart = AdaptStartIndex(h_offset, input_height, output_height);
     hend = AdaptEndIndex(h_offset, input_height, output_height);
     wstart = AdaptStartIndex(w_offset, input_width, output_width);
     wend = AdaptEndIndex(w_offset, input_width, output_width);
 
     T ele = pool_process.initial();
-    for (int h = hstart; h < hend; ++h) {
-      for (int w = wstart; w < wend; ++w) {
-        auto input_idx = channel_last
-                             ? (h * input_width + w) * channels + c_offset
-                             : h * input_width + w;
+    for (int64_t h = hstart; h < hend; ++h) {
+      for (int64_t w = wstart; w < wend; ++w) {
+        int64_t input_idx = channel_last
+                                ? (h * input_width + w) * channels + c_offset
+                                : h * input_width + w;
         pool_process.compute(input_data[input_offset + input_idx], &ele);
       }
     }
-    int pool_size = (hend - hstart) * (wend - wstart);
+    int64_t pool_size = (hend - hstart) * (wend - wstart);
     pool_process.finalize(static_cast<T>(pool_size), &ele);
-    int output_idx =
+    int64_t output_idx =
         channel_last
             ? (h_offset * output_width + w_offset) * channels + c_offset
             : h_offset * output_width + w_offset;
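Side note on the helpers this hunk keeps calling: AdaptStartIndex and AdaptEndIndex are defined elsewhere in this file, so their widening is not visible in the diff. A minimal sketch of the usual adaptive-pooling formulation (floor for the window start, ceil for the end), with int64_t assumed to match the indices above; the bodies are an assumption, not copied from the patch:

```cpp
// Sketch only: assumed formulation of the window-bound helpers used by
// AdaptiveKernelPool2D; not taken from this commit.
__device__ __forceinline__ int64_t AdaptStartIndex(int64_t out_idx,
                                                   int64_t input_size,
                                                   int64_t output_size) {
  // floor(out_idx * input_size / output_size)
  return (out_idx * input_size) / output_size;
}

__device__ __forceinline__ int64_t AdaptEndIndex(int64_t out_idx,
                                                 int64_t input_size,
                                                 int64_t output_size) {
  // ceil((out_idx + 1) * input_size / output_size)
  return ((out_idx + 1) * input_size + output_size - 1) / output_size;
}
```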
@@ -478,20 +471,12 @@ void Pool2dDirectCUDAFunctor<PoolProcess, T>::operator()(
             batch_size,
             1);
   AdaptiveKernelPool2D<PoolProcess, T>
-      <<<grid, threads, 0, stream>>>(nthreads,
-                                     input,
+      <<<grid, threads, 0, stream>>>(input,
                                      input_channels,
                                      input_height,
                                      input_width,
                                      output_height,
                                      output_width,
-                                     ksize_height,
-                                     ksize_width,
-                                     stride_height,
-                                     stride_width,
-                                     padding_height,
-                                     padding_width,
-                                     pool_divmods,
                                      pool_compute,
                                      exclusive,
                                      output);
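The reason for widening everything to int64_t: with 32-bit int offsets, the flattened index n * C * H * W wraps once a tensor reaches 2^31 elements. A self-contained illustration with a hypothetical shape (not taken from the patch):

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  // Hypothetical shape: 8 x 256 x 2048 x 2048 = 2^33 elements.
  int64_t batch = 8, channels = 256, height = 2048, width = 2048;
  int64_t total = batch * channels * height * width;  // 8589934592
  int32_t narrowed = static_cast<int32_t>(total);     // wraps to 0
  std::printf("64-bit: %lld, 32-bit: %d\n",
              static_cast<long long>(total), narrowed);
  return 0;
}
```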
@@ -535,94 +520,6 @@ void Pool2dDirectCUDAFunctor<PoolProcess, T>::operator()(
 template <typename PoolProcess, typename T>
 class Pool2dFunctor<phi::GPUContext, PoolProcess, T> {
  public:
-  void operator()(const phi::GPUContext& context,
-                  const DenseTensor& input,
-                  const std::vector<int>& ksize,
-                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings,
-                  bool exclusive,
-                  bool adaptive,
-                  DenseTensor* output,
-                  PoolProcess pool_process) {
-    const int batch_size = input.dims()[0];
-    const int input_channels = input.dims()[1];
-    const int input_height = input.dims()[2];
-    const int input_width = input.dims()[3];
-    const int output_channels = output->dims()[1];
-    const int output_height = output->dims()[2];
-    const int output_width = output->dims()[3];
-    const int ksize_height = ksize[0];
-    const int ksize_width = ksize[1];
-    const int stride_height = strides[0];
-    const int stride_width = strides[1];
-    const int padding_height = paddings[0];
-    const int padding_width = paddings[1];
-
-    const T* input_data = input.data<T>();
-    T* output_data = context.template Alloc<T>(output);
-
-    int64_t nthreads = static_cast<int64_t>(batch_size) * output_channels *
-                       output_height * output_width;
-    auto pool_divmods =
-        FastDivModForPooling(input_channels, output_width, output_height);
-    if (adaptive) {
-      int64_t max_threads = 512;
-      int64_t thread_num = std::min(
-          phi::funcs::details::GetLastPow2(output_height * output_width),
-          max_threads);
-      int64_t blocks = std::min(max_threads / thread_num,
-                                static_cast<int64_t>(output_channels));
-      dim3 threads(thread_num, blocks, 1);
-      dim3 grid(std::max((output_channels + blocks - 1) / blocks,
-                         static_cast<int64_t>(1)),
-                batch_size,
-                1);
-      AdaptiveKernelPool2D<PoolProcess, T>
-          <<<grid, threads, 0, context.stream()>>>(nthreads,
-                                                   input_data,
-                                                   input_channels,
-                                                   input_height,
-                                                   input_width,
-                                                   output_height,
-                                                   output_width,
-                                                   ksize_height,
-                                                   ksize_width,
-                                                   stride_height,
-                                                   stride_width,
-                                                   padding_height,
-                                                   padding_width,
-                                                   pool_divmods,
-                                                   pool_process,
-                                                   exclusive,
-                                                   output_data);
-    } else {
-      int thread_num = 1024;
-#ifdef WITH_NV_JETSON
-      backends::gpu::ChangeThreadNum(context, &thread_num);
-#endif
-      int blocks = (nthreads + thread_num - 1) / thread_num;
-      dim3 threads(thread_num, 1);
-      dim3 grid(blocks, 1);
-      KernelPool2D<PoolProcess, T>
-          <<<grid, threads, 0, context.stream()>>>(nthreads,
-                                                   input_data,
-                                                   input_channels,
-                                                   input_height,
-                                                   input_width,
-                                                   output_height,
-                                                   output_width,
-                                                   ksize_height,
-                                                   ksize_width,
-                                                   stride_height,
-                                                   stride_width,
-                                                   padding_height,
-                                                   padding_width,
-                                                   pool_divmods,
-                                                   pool_process,
-                                                   exclusive,
-                                                   output_data);
-    }
-  }
   void operator()(const phi::GPUContext& context,
                   const DenseTensor& input,
                   const std::vector<int>& ksize,
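The deleted overload computed the same adaptive launch configuration that the surviving operator() below still uses. Read as a standalone helper, it amounts to roughly this sketch, where LastPow2 is a hypothetical stand-in for phi::funcs::details::GetLastPow2, assumed to round down to a power of two:

```cpp
#include <algorithm>
#include <cstdint>

// Hypothetical stand-in for GetLastPow2: largest power of two <= n (n >= 1).
static int64_t LastPow2(int64_t n) {
  int64_t p = 1;
  while (p * 2 <= n) p *= 2;
  return p;
}

struct LaunchShape {
  int64_t threads_x, threads_y;  // block: output pixels x channels
  int64_t grid_x, grid_y;        // grid: channel blocks x batch
};

// Mirrors the arithmetic above: at most 512 threads per block, x covers
// output pixels, y spreads the leftover threads across channels.
LaunchShape AdaptivePoolLaunch(int64_t output_hw, int64_t channels,
                               int64_t batch) {
  const int64_t max_threads = 512;
  int64_t thread_num = std::min(LastPow2(output_hw), max_threads);
  int64_t blocks = std::min(max_threads / thread_num, channels);
  int64_t grid_x =
      std::max((channels + blocks - 1) / blocks, static_cast<int64_t>(1));
  return {thread_num, blocks, grid_x, batch};
}
```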
@@ -634,17 +531,20 @@ class Pool2dFunctor<phi::GPUContext, PoolProcess, T> {
                   DenseTensor* output,
                   PoolProcess pool_process) {
     bool channel_last = (data_format == "NHWC");
-    const int batch_size = input.dims()[0];
+    const int64_t batch_size = input.dims()[0];
 
-    const int input_channels = channel_last ? input.dims()[3] : input.dims()[1];
-    const int input_height = channel_last ? input.dims()[1] : input.dims()[2];
-    const int input_width = channel_last ? input.dims()[2] : input.dims()[3];
+    const int64_t input_channels =
+        channel_last ? input.dims()[3] : input.dims()[1];
+    const int64_t input_height =
+        channel_last ? input.dims()[1] : input.dims()[2];
+    const int64_t input_width =
+        channel_last ? input.dims()[2] : input.dims()[3];
 
-    const int output_channels =
+    const int64_t output_channels =
         channel_last ? output->dims()[3] : output->dims()[1];
-    const int output_height =
+    const int64_t output_height =
         channel_last ? output->dims()[1] : output->dims()[2];
-    const int output_width =
+    const int64_t output_width =
         channel_last ? output->dims()[2] : output->dims()[3];
 
     const int ksize_height = ksize[0];
@@ -659,37 +559,28 @@ class Pool2dFunctor<phi::GPUContext, PoolProcess, T> {
     const T* input_data = input.data<T>();
     T* output_data = context.template Alloc<T>(output);
 
-    int64_t nthreads = static_cast<int64_t>(batch_size) * output_channels *
-                       output_height * output_width;
+    int64_t nthreads =
+        batch_size * output_channels * output_height * output_width;
     auto pool_divmods =
         FastDivModForPooling(input_channels, output_width, output_height);
     if (adaptive) {
       int64_t max_threads = 512;
       int64_t thread_num = std::min(
-          phi::funcs::details::GetLastPow2(output_height * output_width),
+          phi::funcs::details::GetInt64LastPow2(output_height * output_width),
           max_threads);
-      int64_t blocks = std::min(max_threads / thread_num,
-                                static_cast<int64_t>(output_channels));
+      int64_t blocks = std::min(max_threads / thread_num, output_channels);
       dim3 threads(thread_num, blocks, 1);
       dim3 grid(std::max((output_channels + blocks - 1) / blocks,
                          static_cast<int64_t>(1)),
                 batch_size,
                 1);
       AdaptiveKernelPool2D<PoolProcess, T>
-          <<<grid, threads, 0, context.stream()>>>(nthreads,
-                                                   input_data,
+          <<<grid, threads, 0, context.stream()>>>(input_data,
                                                    input_channels,
                                                    input_height,
                                                    input_width,
                                                    output_height,
                                                    output_width,
-                                                   ksize_height,
-                                                   ksize_width,
-                                                   stride_height,
-                                                   stride_width,
-                                                   padding_height,
-                                                   padding_width,
-                                                   pool_divmods,
                                                    pool_process,
                                                    exclusive,
                                                    output_data,
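GetInt64LastPow2 is new relative to GetLastPow2 and is presumably its 64-bit counterpart, needed now that output_height * output_width is an int64_t product. A bit-folding sketch of that assumed behavior:

```cpp
#include <cstdint>

// Assumed semantics, not copied from the patch: round n (n >= 1) down to
// the largest power of two that does not exceed it.
inline int64_t Int64LastPow2Sketch(int64_t n) {
  n |= (n >> 1);
  n |= (n >> 2);
  n |= (n >> 4);
  n |= (n >> 8);
  n |= (n >> 16);
  n |= (n >> 32);
  return n - (n >> 1);  // keep only the highest set bit
}
```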