@@ -186,59 +186,52 @@ __global__ void KernelPool2D(const int nthreads,
186186}
187187
188188template <typename PoolProcess, typename T>
189- __global__ void AdaptiveKernelPool2D (const int nthreads,
190- const T* input_data,
191- const int channels,
192- const int input_height,
193- const int input_width,
194- const int output_height,
195- const int output_width,
196- const int ksize_height,
197- const int ksize_width,
198- const int stride_height,
199- const int stride_width,
200- const int padding_height,
201- const int padding_width,
202- FastDivModForPooling divmods,
189+ __global__ void AdaptiveKernelPool2D (const T* input_data,
190+ const int64_t channels,
191+ const int64_t input_height,
192+ const int64_t input_width,
193+ const int64_t output_height,
194+ const int64_t output_width,
203195 PoolProcess pool_process,
204196 bool exclusive,
205197 T* output_data,
206198 bool channel_last = false ) {
207- const int n_offset = blockIdx .y ;
208- const int c_offset = blockIdx .x * blockDim .y + threadIdx .y ;
199+ const int64_t n_offset = blockIdx .y ;
200+ const int64_t c_offset = blockIdx .x * blockDim .y + threadIdx .y ;
209201 if (c_offset >= channels) {
210202 return ;
211203 }
212- int hstart, hend, wstart, wend;
213- int input_offset =
204+ int64_t hstart, hend, wstart, wend;
205+ int64_t input_offset =
214206 channel_last
215207 ? n_offset * input_height * input_width * channels
216208 : (n_offset * channels + c_offset) * input_height * input_width;
217- int output_offset =
209+ int64_t output_offset =
218210 channel_last
219211 ? n_offset * output_height * output_width * channels
220212 : (n_offset * channels + c_offset) * output_height * output_width;
221- for (int hw_offset = threadIdx .x ; hw_offset < output_height * output_width;
213+ for (int64_t hw_offset = threadIdx .x ;
214+ hw_offset < output_height * output_width;
222215 hw_offset += blockDim .x ) {
223- int w_offset = hw_offset % output_width;
224- int h_offset = hw_offset / output_width;
216+ int64_t w_offset = hw_offset % output_width;
217+ int64_t h_offset = hw_offset / output_width;
225218 hstart = AdaptStartIndex (h_offset, input_height, output_height);
226219 hend = AdaptEndIndex (h_offset, input_height, output_height);
227220 wstart = AdaptStartIndex (w_offset, input_width, output_width);
228221 wend = AdaptEndIndex (w_offset, input_width, output_width);
229222
230223 T ele = pool_process.initial ();
231- for (int h = hstart; h < hend; ++h) {
232- for (int w = wstart; w < wend; ++w) {
233- auto input_idx = channel_last
234- ? (h * input_width + w) * channels + c_offset
235- : h * input_width + w;
224+ for (int64_t h = hstart; h < hend; ++h) {
225+ for (int64_t w = wstart; w < wend; ++w) {
226+ int64_t input_idx = channel_last
227+ ? (h * input_width + w) * channels + c_offset
228+ : h * input_width + w;
236229 pool_process.compute (input_data[input_offset + input_idx], &ele);
237230 }
238231 }
239- int pool_size = (hend - hstart) * (wend - wstart);
232+ int64_t pool_size = (hend - hstart) * (wend - wstart);
240233 pool_process.finalize (static_cast <T>(pool_size), &ele);
241- int output_idx =
234+ int64_t output_idx =
242235 channel_last
243236 ? (h_offset * output_width + w_offset) * channels + c_offset
244237 : h_offset * output_width + w_offset;
@@ -480,20 +473,12 @@ void Pool2dDirectCUDAFunctor<PoolProcess, T>::operator()(
480473 dim3 grid (
481474 std::max ((output_channels + blocks - 1 ) / blocks, 1 ), batch_size, 1 );
482475 AdaptiveKernelPool2D<PoolProcess, T>
483- <<<grid, threads, 0 , stream>>> (nthreads,
484- input,
476+ <<<grid, threads, 0 , stream>>> (input,
485477 input_channels,
486478 input_height,
487479 input_width,
488480 output_height,
489481 output_width,
490- ksize_height,
491- ksize_width,
492- stride_height,
493- stride_width,
494- padding_height,
495- padding_width,
496- pool_divmods,
497482 pool_compute,
498483 exclusive,
499484 output);
@@ -537,90 +522,6 @@ void Pool2dDirectCUDAFunctor<PoolProcess, T>::operator()(
537522template <typename PoolProcess, typename T>
538523class Pool2dFunctor <phi::GPUContext, PoolProcess, T> {
539524 public:
540- void operator ()(const phi::GPUContext& context,
541- const DenseTensor& input,
542- const std::vector<int >& ksize,
543- const std::vector<int >& strides,
544- const std::vector<int >& paddings,
545- bool exclusive,
546- bool adaptive,
547- DenseTensor* output,
548- PoolProcess pool_process) {
549- const int batch_size = input.dims ()[0 ];
550- const int input_channels = input.dims ()[1 ];
551- const int input_height = input.dims ()[2 ];
552- const int input_width = input.dims ()[3 ];
553- const int output_channels = output->dims ()[1 ];
554- const int output_height = output->dims ()[2 ];
555- const int output_width = output->dims ()[3 ];
556- const int ksize_height = ksize[0 ];
557- const int ksize_width = ksize[1 ];
558- const int stride_height = strides[0 ];
559- const int stride_width = strides[1 ];
560- const int padding_height = paddings[0 ];
561- const int padding_width = paddings[1 ];
562-
563- const T* input_data = input.data <T>();
564- T* output_data = context.template Alloc <T>(output);
565-
566- int nthreads = batch_size * output_channels * output_height * output_width;
567- auto pool_divmods =
568- FastDivModForPooling (input_channels, output_width, output_height);
569- if (adaptive) {
570- int max_threads = 512 ;
571- int thread_num = std::min (
572- phi::funcs::details::GetLastPow2 (output_height * output_width),
573- max_threads);
574- int blocks = std::min (max_threads / thread_num, output_channels);
575- dim3 threads (thread_num, blocks, 1 );
576- dim3 grid (
577- std::max ((output_channels + blocks - 1 ) / blocks, 1 ), batch_size, 1 );
578- AdaptiveKernelPool2D<PoolProcess, T>
579- <<<grid, threads, 0 , context.stream()>>> (nthreads,
580- input_data,
581- input_channels,
582- input_height,
583- input_width,
584- output_height,
585- output_width,
586- ksize_height,
587- ksize_width,
588- stride_height,
589- stride_width,
590- padding_height,
591- padding_width,
592- pool_divmods,
593- pool_process,
594- exclusive,
595- output_data);
596- } else {
597- int thread_num = 1024 ;
598- #ifdef WITH_NV_JETSON
599- backends::gpu::ChangeThreadNum (context, &thread_num);
600- #endif
601- int blocks = (nthreads + thread_num - 1 ) / thread_num;
602- dim3 threads (thread_num, 1 );
603- dim3 grid (blocks, 1 );
604- KernelPool2D<PoolProcess, T>
605- <<<grid, threads, 0 , context.stream()>>> (nthreads,
606- input_data,
607- input_channels,
608- input_height,
609- input_width,
610- output_height,
611- output_width,
612- ksize_height,
613- ksize_width,
614- stride_height,
615- stride_width,
616- padding_height,
617- padding_width,
618- pool_divmods,
619- pool_process,
620- exclusive,
621- output_data);
622- }
623- }
624525 void operator ()(const phi::GPUContext& context,
625526 const DenseTensor& input,
626527 const std::vector<int >& ksize,
@@ -632,17 +533,20 @@ class Pool2dFunctor<phi::GPUContext, PoolProcess, T> {
632533 DenseTensor* output,
633534 PoolProcess pool_process) {
634535 bool channel_last = (data_format == " NHWC" );
635- const int batch_size = input.dims ()[0 ];
536+ const int64_t batch_size = input.dims ()[0 ];
636537
637- const int input_channels = channel_last ? input.dims ()[3 ] : input.dims ()[1 ];
638- const int input_height = channel_last ? input.dims ()[1 ] : input.dims ()[2 ];
639- const int input_width = channel_last ? input.dims ()[2 ] : input.dims ()[3 ];
538+ const int64_t input_channels =
539+ channel_last ? input.dims ()[3 ] : input.dims ()[1 ];
540+ const int64_t input_height =
541+ channel_last ? input.dims ()[1 ] : input.dims ()[2 ];
542+ const int64_t input_width =
543+ channel_last ? input.dims ()[2 ] : input.dims ()[3 ];
640544
641- const int output_channels =
545+ const int64_t output_channels =
642546 channel_last ? output->dims ()[3 ] : output->dims ()[1 ];
643- const int output_height =
547+ const int64_t output_height =
644548 channel_last ? output->dims ()[1 ] : output->dims ()[2 ];
645- const int output_width =
549+ const int64_t output_width =
646550 channel_last ? output->dims ()[2 ] : output->dims ()[3 ];
647551
648552 const int ksize_height = ksize[0 ];
@@ -657,33 +561,26 @@ class Pool2dFunctor<phi::GPUContext, PoolProcess, T> {
657561 const T* input_data = input.data <T>();
658562 T* output_data = context.template Alloc <T>(output);
659563
660- int nthreads = batch_size * output_channels * output_height * output_width;
564+ int64_t nthreads =
565+ batch_size * output_channels * output_height * output_width;
661566 auto pool_divmods =
662567 FastDivModForPooling (input_channels, output_width, output_height);
663568 if (adaptive) {
664- int max_threads = 512 ;
665- int thread_num = std::min (
666- phi::funcs::details::GetLastPow2 (output_height * output_width),
569+ int64_t max_threads = 512 ;
570+ int64_t thread_num = std::min (
571+ phi::funcs::details::GetInt64LastPow2 (output_height * output_width),
667572 max_threads);
668- int blocks = std::min (max_threads / thread_num, output_channels);
573+ int64_t blocks = std::min (max_threads / thread_num, output_channels);
669574 dim3 threads (thread_num, blocks, 1 );
670575 dim3 grid (
671- std::max ((output_channels + blocks - 1 ) / blocks, 1 ), batch_size, 1 );
576+ std::max ((output_channels + blocks - 1 ) / blocks, 1l ), batch_size, 1 );
672577 AdaptiveKernelPool2D<PoolProcess, T>
673- <<<grid, threads, 0 , context.stream()>>> (nthreads,
674- input_data,
578+ <<<grid, threads, 0 , context.stream()>>> (input_data,
675579 input_channels,
676580 input_height,
677581 input_width,
678582 output_height,
679583 output_width,
680- ksize_height,
681- ksize_width,
682- stride_height,
683- stride_width,
684- padding_height,
685- padding_width,
686- pool_divmods,
687584 pool_process,
688585 exclusive,
689586 output_data,
0 commit comments