
Commit 8849056

unify fluid::CUDADeviceContext and phi::GpuContext (#44723)
* remove cudaDeviceContext
* remove more template
* fix rocm compile
1 parent 0a2db7c commit 8849056

26 files changed: +122 additions, -2801 deletions
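Background for the diff below: after this change, platform::CUDADeviceContext survives only as another name for phi::GPUContext, so every forward declaration and explicit template instantiation that mentions it duplicates a phi::GPUContext declaration that already exists. A minimal, self-contained sketch of that relationship (toy class body and an assumed alias for illustration, not the real headers):

#include <type_traits>

namespace phi {
class GPUContext { /* stream, allocator, cuBLAS/cuDNN handles, ... */ };
}  // namespace phi

namespace paddle {
namespace platform {
// Assumed alias for illustration; the real declaration lives in
// paddle/fluid/platform/device_context.h.
using CUDADeviceContext = phi::GPUContext;
}  // namespace platform
}  // namespace paddle

static_assert(
    std::is_same<paddle::platform::CUDADeviceContext, phi::GPUContext>::value,
    "one context type under two names: instantiating a template for both "
    "names would define the same symbol twice");

int main() { return 0; }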

paddle/fluid/framework/details/eager_deletion_op_handle.h

Lines changed: 0 additions & 6 deletions
@@ -23,12 +23,6 @@
 #include "paddle/fluid/framework/details/op_handle_base.h"
 #include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h"
 
-namespace paddle {
-namespace platform {
-class CUDADeviceContext;
-}  // namespace platform
-}  // namespace paddle
-
 namespace paddle {
 namespace framework {
 class GarbageCollector;
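The deleted forward declaration above is not just dead code. Once CUDADeviceContext is an alias rather than a class of its own, an elaborated class declaration of the name stops compiling; a toy sketch under that alias assumption:

namespace phi {
class GPUContext {};
}  // namespace phi

namespace platform {
using CUDADeviceContext = phi::GPUContext;
// class CUDADeviceContext;  // error: redeclared as a different kind of
//                           // entity than the alias above
}  // namespace platform

int main() { return 0; }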

paddle/fluid/memory/allocation/cuda_device_context_allocator.h

Lines changed: 0 additions & 5 deletions
@@ -25,11 +25,6 @@
 #include "paddle/fluid/platform/place.h"
 
 namespace paddle {
-
-namespace platform {
-class CUDADeviceContext;
-}  // namespace platform
-
 namespace memory {
 namespace allocation {

paddle/fluid/operators/cudnn_lstm_op.cu.cc

Lines changed: 0 additions & 7 deletions
@@ -23,13 +23,6 @@ limitations under the License. */
 #include "paddle/fluid/operators/miopen_lstm_cache.h"
 #endif
 
-namespace paddle {
-namespace platform {
-class CUDADeviceContext;
-
-}  // namespace platform
-}  // namespace paddle
-
 namespace paddle {
 namespace operators {

paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu

Lines changed: 4 additions & 3 deletions
@@ -182,7 +182,7 @@ void FusedSeqpoolCVM(const framework::ExecutionContext
 #endif
 
   size_t N = static_cast<size_t>(batch_size * slot_num * embedding_size);
-  platform::GpuLaunchConfig config = GetGpuLaunchConfig1D(dev_ctx, N);
+  platform::GpuLaunchConfig config = platform::GetGpuLaunchConfig1D(dev_ctx, N);
   // first sum pool
   FusedSeqpoolKernelNormal<<<config.block_per_grid.x,
                              config.thread_per_block.x,
@@ -209,7 +209,8 @@ void FusedSeqpoolCVM(const framework::ExecutionContext
   // not need show click input
   N = static_cast<size_t>(batch_size * slot_num *
                           (embedding_size - cvm_offset));
-  platform::GpuLaunchConfig config = GetGpuLaunchConfig1D(dev_ctx, N);
+  platform::GpuLaunchConfig config =
+      platform::GetGpuLaunchConfig1D(dev_ctx, N);
   FusedCVMKernelNoCVM<<<config.block_per_grid.x,
                         config.thread_per_block.x,
                         0,
@@ -391,7 +392,7 @@ void FusedSeqpoolCVMGrad(const framework::ExecutionContext &ctx,
 #endif
 
   size_t N = static_cast<size_t>(batch_size * slot_num * embedding_size);
-  auto config = GetGpuLaunchConfig1D(dev_ctx, N);
+  auto config = platform::GetGpuLaunchConfig1D(dev_ctx, N);
   if (use_cvm) {
     // join grad
     FusedSeqpoolCVMGradKernelWithCVM<<<config.block_per_grid.x,
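Why these three call sites suddenly need the platform:: qualifier: the unqualified calls had been resolving through argument-dependent lookup, because dev_ctx had a type living in namespace platform. With dev_ctx now a phi::GPUContext, ADL searches namespace phi instead, so the helper in platform must be named explicitly. A compilable toy reproduction (hypothetical names, not Paddle's real API):

#include <cstddef>

namespace phi {
struct GPUContext {};
}  // namespace phi

namespace platform {
// Stand-in for GetGpuLaunchConfig1D: takes the context, returns a block size.
int GetLaunchConfig1D(const phi::GPUContext& /*ctx*/, std::size_t n) {
  return n < 256 ? static_cast<int>(n) : 256;  // toy heuristic
}
}  // namespace platform

namespace operators {
int Caller(const phi::GPUContext& ctx) {
  // GetLaunchConfig1D(ctx, 1024);  // error: ADL searches phi, not platform
  return platform::GetLaunchConfig1D(ctx, 1024);  // must qualify explicitly
}
}  // namespace operators

int main() {
  phi::GPUContext ctx;
  return operators::Caller(ctx) == 256 ? 0 : 1;
}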

paddle/fluid/operators/gru_op.cu.cc

Lines changed: 0 additions & 7 deletions
@@ -14,13 +14,6 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/gru_op.h"
 
-namespace paddle {
-namespace platform {
-class CUDADeviceContext;
-
-}  // namespace platform
-}  // namespace paddle
-
 namespace paddle {
 namespace operators {

paddle/fluid/operators/math/cross_entropy.cu

Lines changed: 0 additions & 5 deletions
@@ -150,11 +150,6 @@ void CrossEntropyFunctor<DeviceContext, T>::operator()(
   }
 }
 
-template class CrossEntropyFunctor<platform::CUDADeviceContext, float>;
-template class CrossEntropyFunctor<platform::CUDADeviceContext, double>;
-template class CrossEntropyFunctor<platform::CUDADeviceContext,
-                                   platform::float16>;
-
 template class CrossEntropyFunctor<phi::GPUContext, float>;
 template class CrossEntropyFunctor<phi::GPUContext, double>;
 template class CrossEntropyFunctor<phi::GPUContext, platform::float16>;
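With the two names unified, the deleted instantiations are not merely redundant: if CUDADeviceContext aliases phi::GPUContext, each deleted line names the same specialization as the line kept below it, and explicitly instantiating a specialization twice is ill-formed. A toy sketch under that assumption:

namespace phi {
struct GPUContext {};
}  // namespace phi

namespace platform {
using CUDADeviceContext = phi::GPUContext;
}  // namespace platform

template <typename Context, typename T>
struct FunctorLike {
  void operator()(const Context&, T) {}
};

template struct FunctorLike<phi::GPUContext, float>;
// template struct FunctorLike<platform::CUDADeviceContext, float>;
//   ^ error: duplicate explicit instantiation of the same specialization

int main() { return 0; }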

paddle/fluid/operators/math/im2col.cu

Lines changed: 0 additions & 24 deletions
@@ -308,24 +308,12 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
   }
 };
 
-template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
-                             platform::CUDADeviceContext,
-                             float>;
-template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
-                             platform::CUDADeviceContext,
-                             double>;
 template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
                              phi::GPUContext,
                              float>;
 template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
                              phi::GPUContext,
                              double>;
-template class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
-                             platform::CUDADeviceContext,
-                             float>;
-template class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
-                             platform::CUDADeviceContext,
-                             double>;
 template class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
                              phi::GPUContext,
                              float>;
@@ -576,25 +564,13 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
   }
 };
 
-template class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
-                             platform::CUDADeviceContext,
-                             float>;
-template class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
-                             platform::CUDADeviceContext,
-                             double>;
 template class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
                              phi::GPUContext,
                              float>;
 template class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
                              phi::GPUContext,
                              double>;
 
-template class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
-                             platform::CUDADeviceContext,
-                             float>;
-template class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
-                             platform::CUDADeviceContext,
-                             double>;
 template class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
                              phi::GPUContext,
                              float>;

paddle/fluid/operators/math/maxouting.cu

Lines changed: 0 additions & 6 deletions
@@ -173,12 +173,6 @@ void MaxOutGradFunctor<DeviceContext, T>::operator()(
       axis);
 }
 
-template class MaxOutGradFunctor<platform::CUDADeviceContext, float>;
-template class MaxOutGradFunctor<platform::CUDADeviceContext, double>;
-
-template class MaxOutFunctor<platform::CUDADeviceContext, float>;
-template class MaxOutFunctor<platform::CUDADeviceContext, double>;
-
 template class MaxOutGradFunctor<phi::GPUContext, float>;
 template class MaxOutGradFunctor<phi::GPUContext, double>;

paddle/fluid/operators/math/sample_prob.h

Lines changed: 0 additions & 6 deletions
@@ -22,12 +22,6 @@ limitations under the License. */
 #include "paddle/fluid/operators/math/sampler.h"
 #include "paddle/phi/core/ddim.h"
 
-namespace paddle {
-namespace platform {
-class CUDADeviceContext;
-}  // namespace platform
-}  // namespace paddle
-
 namespace paddle {
 namespace operators {
 namespace math {

paddle/fluid/operators/math/selected_rows_functor.cu

Lines changed: 2 additions & 159 deletions
@@ -133,77 +133,6 @@ __global__ void SelectedRowsAddTensorKernel(const T* selected_rows,
 }
 }  // namespace
 
-template <typename T>
-struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> {
-  void operator()(const platform::CUDADeviceContext& context,
-                  const phi::SelectedRows& input1,
-                  const framework::Tensor& input2,
-                  framework::Tensor* output) {
-    auto in1_height = input1.height();
-    auto in2_dims = input2.dims();
-    auto out_dims = output->dims();
-    PADDLE_ENFORCE_EQ(
-        in1_height,
-        in2_dims[0],
-        platform::errors::InvalidArgument(
-            "The two inputs height must be equal."
-            "But received first input height = [%d], first input height = [%d]",
-            in1_height,
-            in2_dims[0]));
-    PADDLE_ENFORCE_EQ(
-        in1_height,
-        out_dims[0],
-        platform::errors::InvalidArgument(
-            "The input and output height must be equal."
-            "But received input height = [%d], output height = [%d]",
-            in1_height,
-            out_dims[0]));
-
-    auto& in1_value = input1.value();
-    auto& in1_rows = input1.rows();
-
-    int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
-    PADDLE_ENFORCE_EQ(
-        in1_row_numel,
-        input2.numel() / in1_height,
-        platform::errors::InvalidArgument(
-            "The two inputs width must be equal."
-            "But received first input width = [%d], second input width = [%d]",
-            in1_row_numel,
-            input2.numel() / in1_height));
-    PADDLE_ENFORCE_EQ(
-        in1_row_numel,
-        output->numel() / in1_height,
-        platform::errors::InvalidArgument(
-            "The input and output width must be equal."
-            "But received input width = [%d], output width = [%d]",
-            in1_row_numel,
-            output->numel() / in1_height));
-
-    auto* in1_data = in1_value.data<T>();
-    auto* in2_data = input2.data<T>();
-    auto* out_data = output->data<T>();
-
-    phi::funcs::SetConstant<platform::CUDADeviceContext, T> functor;
-    functor(context, output, static_cast<T>(0));
-
-    const int block_size = 256;
-    dim3 threads(block_size, 1);
-    dim3 grid(in1_rows.size(), 1);
-    paddle::framework::MixVector<int64_t> mixv_in1_rows(&in1_rows);
-    SelectedRowsAddTensorKernel<T, block_size>
-        <<<grid, threads, 0, context.stream()>>>(
-            in1_data,
-            mixv_in1_rows.CUDAData(context.GetPlace()),
-            out_data,
-            in1_row_numel);
-
-    auto out_eigen = framework::EigenVector<T>::Flatten(*output);
-    auto in2_eigen = framework::EigenVector<T>::Flatten(input2);
-    out_eigen.device(*context.eigen_device()) = out_eigen + in2_eigen;
-  }
-};
-
 template <typename T>
 struct SelectedRowsAddTensor<phi::GPUContext, T> {
   void operator()(const phi::GPUContext& context,
@@ -275,12 +204,6 @@ struct SelectedRowsAddTensor<phi::GPUContext, T> {
   }
 };
 
-template struct SelectedRowsAddTensor<platform::CUDADeviceContext, float>;
-template struct SelectedRowsAddTensor<platform::CUDADeviceContext, double>;
-template struct SelectedRowsAdd<platform::CUDADeviceContext, platform::float16>;
-template struct SelectedRowsAddTensor<platform::CUDADeviceContext,
-                                      platform::float16>;
-
 template struct SelectedRowsAddTensor<phi::GPUContext, float>;
 template struct SelectedRowsAddTensor<phi::GPUContext, double>;
 template struct SelectedRowsAdd<phi::GPUContext, platform::float16>;
@@ -363,50 +286,6 @@ __global__ void SelectedRowsAddToTensorKernel(const T* selected_rows,
   }
 }  // namespace
 
-template <typename T>
-struct SelectedRowsAddToTensor<platform::CUDADeviceContext, T> {
-  void operator()(const platform::CUDADeviceContext& context,
-                  const phi::SelectedRows& input1,
-                  framework::Tensor* input2) {
-    auto in1_height = input1.height();
-    auto in2_dims = input2->dims();
-    PADDLE_ENFORCE_EQ(
-        in1_height,
-        in2_dims[0],
-        platform::errors::InvalidArgument("The two inputs height must be equal."
-                                          "But received first input height = "
-                                          "[%d], second input height = [%d]",
-                                          in1_height,
-                                          in2_dims[0]));
-
-    auto& in1_value = input1.value();
-    auto& in1_rows = input1.rows();
-
-    int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
-    PADDLE_ENFORCE_EQ(
-        in1_row_numel,
-        input2->numel() / in1_height,
-        platform::errors::InvalidArgument(
-            "The two inputs width must be equal."
-            "But received first input width = [%d], second input width = [%d]",
-            in1_row_numel,
-            input2->numel() / in1_height));
-
-    auto* in1_data = in1_value.data<T>();
-    auto* in2_data = input2->data<T>();
-    const int block_size = 256;
-    dim3 threads(block_size, 1);
-    dim3 grid(in1_rows.size(), 1);
-    paddle::framework::MixVector<int64_t> mixv_in1_rows(&in1_rows);
-    SelectedRowsAddToTensorKernel<T, block_size>
-        <<<grid, threads, 0, context.stream()>>>(
-            in1_data,
-            mixv_in1_rows.CUDAData(context.GetPlace()),
-            in2_data,
-            in1_row_numel);
-  }
-};
-
 template <typename T>
 struct SelectedRowsAddToTensor<phi::GPUContext, T> {
   void operator()(const phi::GPUContext& context,
@@ -451,12 +330,6 @@ struct SelectedRowsAddToTensor<phi::GPUContext, T> {
   }
 };
 
-template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, float>;
-template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, double>;
-template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, int>;
-template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, int64_t>;
-template struct SelectedRowsAddToTensor<platform::CUDADeviceContext,
-                                        platform::float16>;
 template struct SelectedRowsAddToTensor<phi::GPUContext, float>;
 template struct SelectedRowsAddToTensor<phi::GPUContext, double>;
 template struct SelectedRowsAddToTensor<phi::GPUContext, int>;
@@ -625,34 +498,6 @@ struct MergeAddImpl {
   }
 };
 
-template <typename T>
-struct MergeAdd<platform::CUDADeviceContext, T> {
-  // unary functor, merge by adding duplicated rows in
-  // the input SelectedRows object.
-  phi::SelectedRows operator()(const platform::CUDADeviceContext& context,
-                               const phi::SelectedRows& input,
-                               const bool sorted_result) {
-    return MergeAddImpl<platform::CUDADeviceContext, T>()(
-        context, input, sorted_result);
-  }
-
-  void operator()(const platform::CUDADeviceContext& context,
-                  const phi::SelectedRows& input,
-                  phi::SelectedRows* output,
-                  const bool sorted_result) {
-    MergeAddImpl<platform::CUDADeviceContext, T>()(
-        context, input, output, sorted_result);
-  }
-
-  void operator()(const platform::CUDADeviceContext& context,
-                  const std::vector<const phi::SelectedRows*>& inputs,
-                  phi::SelectedRows* output,
-                  const bool sorted_result) {
-    MergeAddImpl<platform::CUDADeviceContext, T>()(
-        context, inputs, output, sorted_result);
-  }
-};
-
 template <typename T>
 struct MergeAdd<phi::GPUContext, T> {
   // unary functor, merge by adding duplicated rows in
@@ -678,10 +523,8 @@ struct MergeAdd<phi::GPUContext, T> {
   }
 };
 
-#define TEMPLATE_SPECIALIZED_FOR_MERGEADD(dtype)                    \
-  template struct MergeAddImpl<platform::CUDADeviceContext, dtype>; \
-  template struct MergeAddImpl<phi::GPUContext, dtype>;             \
-  template struct MergeAdd<platform::CUDADeviceContext, dtype>;     \
+#define TEMPLATE_SPECIALIZED_FOR_MERGEADD(dtype)        \
+  template struct MergeAddImpl<phi::GPUContext, dtype>; \
   template struct MergeAdd<phi::GPUContext, dtype>;
 
 TEMPLATE_SPECIALIZED_FOR_MERGEADD(float)
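For readers unfamiliar with this instantiation-list idiom: the trimmed macro now emits exactly two explicit instantiations per dtype instead of four, since the CUDADeviceContext variants duplicated the phi::GPUContext ones. A self-contained toy showing the expansion (stand-in types, not Paddle's):

namespace phi {
struct GPUContext {};
}  // namespace phi

template <typename Context, typename T>
struct MergeAddImpl {
  void operator()() {}
};
template <typename Context, typename T>
struct MergeAdd {
  void operator()() {}
};

#define TEMPLATE_SPECIALIZED_FOR_MERGEADD(dtype)        \
  template struct MergeAddImpl<phi::GPUContext, dtype>; \
  template struct MergeAdd<phi::GPUContext, dtype>;

// Expands to:
//   template struct MergeAddImpl<phi::GPUContext, float>;
//   template struct MergeAdd<phi::GPUContext, float>;
TEMPLATE_SPECIALIZED_FOR_MERGEADD(float)

int main() { return 0; }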
