@@ -133,77 +133,6 @@ __global__ void SelectedRowsAddTensorKernel(const T* selected_rows,
133133}
134134} // namespace
135135
136- template <typename T>
137- struct SelectedRowsAddTensor <platform::CUDADeviceContext, T> {
138- void operator ()(const platform::CUDADeviceContext& context,
139- const phi::SelectedRows& input1,
140- const framework::Tensor& input2,
141- framework::Tensor* output) {
142- auto in1_height = input1.height ();
143- auto in2_dims = input2.dims ();
144- auto out_dims = output->dims ();
145- PADDLE_ENFORCE_EQ (
146- in1_height,
147- in2_dims[0 ],
148- platform::errors::InvalidArgument (
149- " The two inputs height must be equal."
150- " But received first input height = [%d], first input height = [%d]" ,
151- in1_height,
152- in2_dims[0 ]));
153- PADDLE_ENFORCE_EQ (
154- in1_height,
155- out_dims[0 ],
156- platform::errors::InvalidArgument (
157- " The input and output height must be equal."
158- " But received input height = [%d], output height = [%d]" ,
159- in1_height,
160- out_dims[0 ]));
161-
162- auto & in1_value = input1.value ();
163- auto & in1_rows = input1.rows ();
164-
165- int64_t in1_row_numel = in1_value.numel () / in1_rows.size ();
166- PADDLE_ENFORCE_EQ (
167- in1_row_numel,
168- input2.numel () / in1_height,
169- platform::errors::InvalidArgument (
170- " The two inputs width must be equal."
171- " But received first input width = [%d], second input width = [%d]" ,
172- in1_row_numel,
173- input2.numel () / in1_height));
174- PADDLE_ENFORCE_EQ (
175- in1_row_numel,
176- output->numel () / in1_height,
177- platform::errors::InvalidArgument (
178- " The input and output width must be equal."
179- " But received input width = [%d], output width = [%d]" ,
180- in1_row_numel,
181- output->numel () / in1_height));
182-
183- auto * in1_data = in1_value.data <T>();
184- auto * in2_data = input2.data <T>();
185- auto * out_data = output->data <T>();
186-
187- phi::funcs::SetConstant<platform::CUDADeviceContext, T> functor;
188- functor (context, output, static_cast <T>(0 ));
189-
190- const int block_size = 256 ;
191- dim3 threads (block_size, 1 );
192- dim3 grid (in1_rows.size (), 1 );
193- paddle::framework::MixVector<int64_t > mixv_in1_rows (&in1_rows);
194- SelectedRowsAddTensorKernel<T, block_size>
195- <<<grid, threads, 0 , context.stream()>>> (
196- in1_data,
197- mixv_in1_rows.CUDAData (context.GetPlace ()),
198- out_data,
199- in1_row_numel);
200-
201- auto out_eigen = framework::EigenVector<T>::Flatten (*output);
202- auto in2_eigen = framework::EigenVector<T>::Flatten (input2);
203- out_eigen.device (*context.eigen_device ()) = out_eigen + in2_eigen;
204- }
205- };
206-
207136template <typename T>
208137struct SelectedRowsAddTensor <phi::GPUContext, T> {
209138 void operator ()(const phi::GPUContext& context,
@@ -275,12 +204,6 @@ struct SelectedRowsAddTensor<phi::GPUContext, T> {
275204 }
276205};
277206
278- template struct SelectedRowsAddTensor <platform::CUDADeviceContext, float >;
279- template struct SelectedRowsAddTensor <platform::CUDADeviceContext, double >;
280- template struct SelectedRowsAdd <platform::CUDADeviceContext, platform::float16>;
281- template struct SelectedRowsAddTensor <platform::CUDADeviceContext,
282- platform::float16>;
283-
284207template struct SelectedRowsAddTensor <phi::GPUContext, float >;
285208template struct SelectedRowsAddTensor <phi::GPUContext, double >;
286209template struct SelectedRowsAdd <phi::GPUContext, platform::float16>;
@@ -363,50 +286,6 @@ __global__ void SelectedRowsAddToTensorKernel(const T* selected_rows,
363286}
364287} // namespace
365288
366- template <typename T>
367- struct SelectedRowsAddToTensor <platform::CUDADeviceContext, T> {
368- void operator ()(const platform::CUDADeviceContext& context,
369- const phi::SelectedRows& input1,
370- framework::Tensor* input2) {
371- auto in1_height = input1.height ();
372- auto in2_dims = input2->dims ();
373- PADDLE_ENFORCE_EQ (
374- in1_height,
375- in2_dims[0 ],
376- platform::errors::InvalidArgument (" The two inputs height must be equal."
377- " But received first input height = "
378- " [%d], second input height = [%d]" ,
379- in1_height,
380- in2_dims[0 ]));
381-
382- auto & in1_value = input1.value ();
383- auto & in1_rows = input1.rows ();
384-
385- int64_t in1_row_numel = in1_value.numel () / in1_rows.size ();
386- PADDLE_ENFORCE_EQ (
387- in1_row_numel,
388- input2->numel () / in1_height,
389- platform::errors::InvalidArgument (
390- " The two inputs width must be equal."
391- " But received first input width = [%d], second input width = [%d]" ,
392- in1_row_numel,
393- input2->numel () / in1_height));
394-
395- auto * in1_data = in1_value.data <T>();
396- auto * in2_data = input2->data <T>();
397- const int block_size = 256 ;
398- dim3 threads (block_size, 1 );
399- dim3 grid (in1_rows.size (), 1 );
400- paddle::framework::MixVector<int64_t > mixv_in1_rows (&in1_rows);
401- SelectedRowsAddToTensorKernel<T, block_size>
402- <<<grid, threads, 0 , context.stream()>>> (
403- in1_data,
404- mixv_in1_rows.CUDAData (context.GetPlace ()),
405- in2_data,
406- in1_row_numel);
407- }
408- };
409-
410289template <typename T>
411290struct SelectedRowsAddToTensor <phi::GPUContext, T> {
412291 void operator ()(const phi::GPUContext& context,
@@ -451,12 +330,6 @@ struct SelectedRowsAddToTensor<phi::GPUContext, T> {
451330 }
452331};
453332
454- template struct SelectedRowsAddToTensor <platform::CUDADeviceContext, float >;
455- template struct SelectedRowsAddToTensor <platform::CUDADeviceContext, double >;
456- template struct SelectedRowsAddToTensor <platform::CUDADeviceContext, int >;
457- template struct SelectedRowsAddToTensor <platform::CUDADeviceContext, int64_t >;
458- template struct SelectedRowsAddToTensor <platform::CUDADeviceContext,
459- platform::float16>;
460333template struct SelectedRowsAddToTensor <phi::GPUContext, float >;
461334template struct SelectedRowsAddToTensor <phi::GPUContext, double >;
462335template struct SelectedRowsAddToTensor <phi::GPUContext, int >;
@@ -625,34 +498,6 @@ struct MergeAddImpl {
625498 }
626499};
627500
628- template <typename T>
629- struct MergeAdd <platform::CUDADeviceContext, T> {
630- // unary functor, merge by adding duplicated rows in
631- // the input SelectedRows object.
632- phi::SelectedRows operator ()(const platform::CUDADeviceContext& context,
633- const phi::SelectedRows& input,
634- const bool sorted_result) {
635- return MergeAddImpl<platform::CUDADeviceContext, T>()(
636- context, input, sorted_result);
637- }
638-
639- void operator ()(const platform::CUDADeviceContext& context,
640- const phi::SelectedRows& input,
641- phi::SelectedRows* output,
642- const bool sorted_result) {
643- MergeAddImpl<platform::CUDADeviceContext, T>()(
644- context, input, output, sorted_result);
645- }
646-
647- void operator ()(const platform::CUDADeviceContext& context,
648- const std::vector<const phi::SelectedRows*>& inputs,
649- phi::SelectedRows* output,
650- const bool sorted_result) {
651- MergeAddImpl<platform::CUDADeviceContext, T>()(
652- context, inputs, output, sorted_result);
653- }
654- };
655-
656501template <typename T>
657502struct MergeAdd <phi::GPUContext, T> {
658503 // unary functor, merge by adding duplicated rows in
@@ -678,10 +523,8 @@ struct MergeAdd<phi::GPUContext, T> {
678523 }
679524};
680525
// TEMPLATE_SPECIALIZED_FOR_MERGEADD(dtype): expands to explicit `template struct`
// instantiations of MergeAddImpl and MergeAdd for the element type `dtype`.
// This span shows two versions of the macro: the `-` lines are the previous
// definition, which also instantiated both templates for
// platform::CUDADeviceContext; the `+` lines are the replacement, which keeps
// only the phi::GPUContext instantiations. The final line is shared by both.
681- #define TEMPLATE_SPECIALIZED_FOR_MERGEADD (dtype ) \
682- template struct MergeAddImpl <platform::CUDADeviceContext, dtype>; \
683- template struct MergeAddImpl <phi::GPUContext, dtype>; \
684- template struct MergeAdd <platform::CUDADeviceContext, dtype>; \
526+ #define TEMPLATE_SPECIALIZED_FOR_MERGEADD (dtype ) \
527+ template struct MergeAddImpl <phi::GPUContext, dtype>; \
685528 template struct MergeAdd <phi::GPUContext, dtype>;
686529
687530TEMPLATE_SPECIALIZED_FOR_MERGEADD (float )
0 commit comments