2 changes: 1 addition & 1 deletion paddle/fluid/distributed/collective/HCCLTools.h
@@ -94,7 +94,7 @@ class NPUEventManager {
     PADDLE_ENFORCE_EQ(device_index,
                       device_index_,
                       platform::errors::PreconditionNotMet(
-                          "CUDADeviceContext's device %d does not match"
+                          "phi::GPUContext's device %d does not match"
                           "Event's device %d",
                           device_index,
                           device_index_));
8 changes: 4 additions & 4 deletions paddle/fluid/distributed/collective/NCCLTools.h
@@ -104,15 +104,15 @@ class EventManager {
   bool DeviceId() const { return device_index_; }
   gpuEvent_t GetRawCudaEvent() const { return event_; }

-  void Record(const paddle::platform::CUDADeviceContext& ctx) {
+  void Record(const phi::GPUContext& ctx) {
     auto device_index = ctx.GetPlace().device;
     if (!is_created_) {
       CreateEvent(device_index);
     }
     PADDLE_ENFORCE_EQ(device_index,
                       device_index_,
                       platform::errors::PreconditionNotMet(
-                          "CUDADeviceContext's device %d does not match"
+                          "phi::GPUContext's device %d does not match"
                           "Event's device %d",
                           device_index,
                           device_index_));
@@ -157,13 +157,13 @@ class EventManager {
     }
   }

-  void Block(const paddle::platform::CUDADeviceContext& ctx) const {
+  void Block(const phi::GPUContext& ctx) const {
     if (is_created_) {
       auto device_index = ctx.GetPlace().device;
       PADDLE_ENFORCE_EQ(device_index,
                         device_index_,
                         platform::errors::PreconditionNotMet(
-                            "CUDADeviceContext's device %d does not match"
+                            "phi::GPUContext's device %d does not match"
                             "Event's device %d",
                             device_index,
                             device_index_));
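After this change, callers hand `EventManager::Record` and `EventManager::Block` a `phi::GPUContext` instead of the retired `paddle::platform::CUDADeviceContext`. A minimal usage sketch of the new signatures, mirroring the `SyncDefaultStream` hunk in the next file; the helper name and include paths are assumptions, not part of this PR:

```cpp
// Sketch only: assumes these header paths; mirrors SyncDefaultStream below.
#include "paddle/fluid/distributed/collective/NCCLTools.h"
#include "paddle/fluid/platform/device_context.h"

namespace paddle {
namespace distributed {

// Make a private context's stream wait for work already queued on the
// default stream of `place`.
void SyncWithDefaultStream(const platform::Place& place,
                           EventManager* event,
                           const phi::GPUContext& private_ctx) {
  // The pooled default context is now cast to phi::GPUContext.
  auto* default_ctx = static_cast<phi::GPUContext*>(
      platform::DeviceContextPool::Instance().Get(place));
  event->Record(*default_ctx);  // record on the default stream
  event->Block(private_ctx);    // private stream waits on the event
}

}  // namespace distributed
}  // namespace paddle
```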
12 changes: 6 additions & 6 deletions paddle/fluid/distributed/collective/ProcessGroupNCCL.cc
@@ -31,10 +31,10 @@ namespace distributed {

 void SyncDefaultStream(
     const std::vector<Place>& places,
-    std::vector<EventManager>& ncclEvents,                        // NOLINT
-    std::vector<std::unique_ptr<CUDADeviceContext>>& dev_ctx) {   // NOLINT
+    std::vector<EventManager>& ncclEvents,                      // NOLINT
+    std::vector<std::unique_ptr<phi::GPUContext>>& dev_ctx) {   // NOLINT
   for (size_t i = 0; i < places.size(); ++i) {
-    auto* default_ctx = static_cast<platform::CUDADeviceContext*>(
+    auto* default_ctx = static_cast<phi::GPUContext*>(
         platform::DeviceContextPool::Instance().Get(places[i]));
     ncclEvents[i].Record(*default_ctx);
     ncclEvents[i].Block(*dev_ctx[i]);
@@ -69,7 +69,7 @@ void ProcessGroupNCCL::NCCLTask::SetOutputs(

 void ProcessGroupNCCL::NCCLTask::SynchronizeStreams() {
   for (size_t i = 0; i < places_.size(); ++i) {
-    auto* default_ctx = static_cast<platform::CUDADeviceContext*>(
+    auto* default_ctx = static_cast<phi::GPUContext*>(
         platform::DeviceContextPool::Instance().Get(places_[i]));
     default_ctx->WaitEvent(control_events_[i].GetRawCudaEvent());
   }
@@ -201,15 +201,15 @@ void ProcessGroupNCCL::CreateNCCLManagerCache(
           << ", place: " << places_key
           << ", nccl uniqueid: " << SerializeNCCLUniqueId(nccl_id);

-  std::vector<std::unique_ptr<CUDADeviceContext>> dev_ctx;
+  std::vector<std::unique_ptr<phi::GPUContext>> dev_ctx;
   dev_ctx.resize(places.size());

   PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart());

   for (size_t i = 0; i < places.size(); ++i) {
     platform::CUDADeviceGuard guard(places[i]);
     nccl_comms[i] = NCCLCommManager::Create(GetSize(), GetRank(), nccl_id);
-    dev_ctx[i].reset(new CUDADeviceContext(places[i]));
+    dev_ctx[i].reset(new phi::GPUContext(places[i]));
   }

   PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd());
4 changes: 1 addition & 3 deletions paddle/fluid/distributed/collective/ProcessGroupNCCL.h
@@ -45,7 +45,6 @@ namespace paddle {
 namespace distributed {

 using Place = paddle::platform::Place;
-using CUDADeviceContext = paddle::platform::CUDADeviceContext;

 class ProcessGroupNCCL : public ProcessGroup {
  public:
@@ -174,8 +173,7 @@ class ProcessGroupNCCL : public ProcessGroup {

   std::unordered_map<std::string, std::vector<EventManager>> places_to_events_;

-  std::unordered_map<std::string,
-                     std::vector<std::unique_ptr<CUDADeviceContext>>>
+  std::unordered_map<std::string, std::vector<std::unique_ptr<phi::GPUContext>>>
       places_to_ctx_;

   std::set<int> used_place_ids_;
6 changes: 3 additions & 3 deletions paddle/fluid/distributed/collective/reducer.cc
@@ -241,7 +241,7 @@ static void SplitTensorsWithType(const DeviceContext &context,
 void EagerGroup::ConcatTensors(const platform::Place &place) {
   if (platform::is_gpu_place(place)) {
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
-    auto *default_ctx = static_cast<platform::CUDADeviceContext *>(
+    auto *default_ctx = static_cast<phi::GPUContext *>(
         platform::DeviceContextPool::Instance().Get(place));
     ConcatTensorsWithType(
         *default_ctx, dense_tensors_, &dense_contents_, dtype_);
@@ -264,7 +264,7 @@ void EagerGroup::ConcatTensors(const platform::Place &place) {
 void EagerGroup::SplitTensors(const platform::Place &place) {
   if (platform::is_gpu_place(place)) {
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
-    auto *default_ctx = static_cast<platform::CUDADeviceContext *>(
+    auto *default_ctx = static_cast<phi::GPUContext *>(
         platform::DeviceContextPool::Instance().Get(place));
     SplitTensorsWithType(
         *default_ctx, &dense_contents_, &dense_tensors_, dtype_);
@@ -883,7 +883,7 @@ void EagerReducer::AllReduceSparse(EagerGroup *group,
   auto *dev_ctx = platform::DeviceContextPool::Instance().Get(inner_place_);
   if (platform::is_gpu_place(inner_place_)) {
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
-    dev_ctx = static_cast<platform::CUDADeviceContext *>(
+    dev_ctx = static_cast<phi::GPUContext *>(
         platform::DeviceContextPool::Instance().Get(inner_place_));
 #else
     PADDLE_THROW(platform::errors::PermissionDenied(
3 changes: 1 addition & 2 deletions paddle/fluid/distributed/fleet_executor/dist_model.cc
@@ -78,8 +78,7 @@ bool LoadDataFromDistModelTensor(const DistModelTensor &input_data,
     VLOG(3) << "Loading data for GPU.";
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-    auto *dev_ctx =
-        dynamic_cast<const platform::CUDADeviceContext *>(pool.Get(place));
+    auto *dev_ctx = dynamic_cast<const phi::GPUContext *>(pool.Get(place));
     auto gpu_place = place;
     memory::Copy(gpu_place,
                  static_cast<void *>(input_tensor_ptr),
12 changes: 4 additions & 8 deletions paddle/fluid/distributed/ps/service/brpc_utils.cc
@@ -119,8 +119,7 @@ void SerializeLodTensor(framework::Variable* var,
     char* temp_ptr =
         new char[tensor->numel() *
                  framework::DataTypeSize(tensor->dtype())];  // NOLINT
-    auto stream =
-        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
+    auto stream = reinterpret_cast<const phi::GPUContext&>(ctx).stream();
     memory::Copy(
         platform::CPUPlace(),
         temp_ptr,
@@ -168,8 +167,7 @@ void SerializeSelectedRows(framework::Variable* var,
     char* temp_ptr =
         new char[tensor->numel() *
                  framework::DataTypeSize(tensor->dtype())];  // NOLINT
-    auto stream =
-        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
+    auto stream = reinterpret_cast<const phi::GPUContext&>(ctx).stream();
     memory::Copy(
         platform::CPUPlace(),
         temp_ptr,
@@ -265,8 +263,7 @@ void DeserializeLodTensor(framework::Variable* var,
                  framework::DataTypeSize(tensor->dtype())];    // NOLINT
     io_buffer_itr.copy_and_forward((void*)(&data_len), 8);     // NOLINT
     io_buffer_itr.copy_and_forward((void*)temp_ptr, data_len);  // NOLINT
-    auto stream =
-        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
+    auto stream = reinterpret_cast<const phi::GPUContext&>(ctx).stream();
     memory::Copy(place,
                  tensor_data,
                  platform::CPUPlace(),
@@ -311,8 +308,7 @@ void DeserializeSelectedRows(
     unsigned long data_len;  // NOLINT
     io_buffer_itr.copy_and_forward((void*)(&data_len), 8);  // NOLINT
     io_buffer_itr.copy_and_forward(temp_ptr, data_len);
-    auto stream =
-        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
+    auto stream = reinterpret_cast<const phi::GPUContext&>(ctx).stream();
     memory::Copy(place,
                  tensor_data,
                  platform::CPUPlace(),
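The serialization hunks above all share one pattern: the copy stream now comes from `phi::GPUContext`. A hedged sketch of that pattern as a standalone helper; the function name, the `std::vector` buffer, and the include paths are illustrative assumptions rather than PR code:

```cpp
// Sketch only: device-to-host copy using the stream of the passed-in
// context, as in SerializeLodTensor above. Header paths are assumed.
#include <vector>

#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/device_context.h"

std::vector<char> CopyTensorToHost(const paddle::framework::Tensor& tensor,
                                   const paddle::platform::DeviceContext& ctx) {
  std::vector<char> buf(tensor.numel() *
                        paddle::framework::DataTypeSize(tensor.dtype()));
  // The caller guarantees `ctx` is a GPU context, so the reinterpret_cast
  // is safe here, matching the hunks above.
  auto stream = reinterpret_cast<const phi::GPUContext&>(ctx).stream();
  paddle::memory::Copy(paddle::platform::CPUPlace(),
                       buf.data(),
                       tensor.place(),
                       tensor.data(),
                       buf.size(),
                       stream);
  return buf;
}
```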
3 changes: 1 addition & 2 deletions paddle/fluid/distributed/ps/service/heter_client.cc
@@ -43,8 +43,7 @@ int GetMicroId(const platform::DeviceContext& ctx,
   std::vector<char> temp;
   temp.resize(tensor->numel() * framework::DataTypeSize(tensor->dtype()));
   char* temp_ptr = temp.data();
-  auto stream =
-      reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
+  auto stream = reinterpret_cast<const phi::GPUContext&>(ctx).stream();
   memory::Copy(platform::CPUPlace(),
                temp_ptr,
                tensor->place(),
(file header not captured in this view)
@@ -134,21 +134,20 @@ void ScaleAPI(const paddle::experimental::Tensor& x,

 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   } else if (expected_kernel_place == paddle::platform::CUDAPlace()) {
-    auto* dev_ctx = dynamic_cast<paddle::platform::CUDADeviceContext*>(
-        pool.Get(expected_kernel_place));
+    auto* dev_ctx =
+        dynamic_cast<phi::GPUContext*>(pool.Get(expected_kernel_place));
     if (!dev_ctx) {
       PADDLE_THROW(paddle::platform::errors::Fatal(
           "Cannot convert device_context to CUDADeviceContext."
           "This indicates backend mismatch."
           "Pleas double check your expected place"));
     }
-    ScaleDeviceDispatch<paddle::platform::CUDADeviceContext>(
-        *dense_tensor.get(),
-        *dev_ctx,
-        scale,
-        bias,
-        bias_after_scale,
-        dense_out.get());
+    ScaleDeviceDispatch<phi::GPUContext>(*dense_tensor.get(),
+                                         *dev_ctx,
+                                         scale,
+                                         bias,
+                                         bias_after_scale,
+                                         dense_out.get());
 #endif
   } else {
     PADDLE_THROW(paddle::platform::errors::Fatal(
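The `ScaleDeviceDispatch` call above illustrates the general dispatch shape this PR preserves: a helper templated on the context type, selected at runtime via `dynamic_cast` so a backend mismatch is detected rather than silently miscast. A minimal sketch under assumed names (`ScaleDispatchSketch` and `RunScaleSketch` are not PR code, and the helper body is a stub):

```cpp
// Sketch only: runtime backend selection for a context-templated helper.
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/core/dense_tensor.h"

template <typename DeviceContext>
void ScaleDispatchSketch(const phi::DenseTensor& in,
                         const DeviceContext& dev_ctx,
                         float scale,
                         phi::DenseTensor* out) {
  // Stub: a real implementation would launch the scale computation on
  // `dev_ctx`, as ScaleDeviceDispatch does in the PR.
}

void RunScaleSketch(const paddle::platform::Place& place,
                    const phi::DenseTensor& in,
                    float scale,
                    phi::DenseTensor* out) {
  auto& pool = paddle::platform::DeviceContextPool::Instance();
  if (paddle::platform::is_gpu_place(place)) {
    // dynamic_cast (not static_cast) so a mismatched backend yields nullptr
    // instead of undefined behavior; the PR throws Fatal in that case.
    auto* dev_ctx = dynamic_cast<phi::GPUContext*>(pool.Get(place));
    if (dev_ctx != nullptr) {
      ScaleDispatchSketch<phi::GPUContext>(in, *dev_ctx, scale, out);
    }
  }
}
```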
3 changes: 1 addition & 2 deletions paddle/fluid/eager/nan_inf_utils.cc
@@ -38,8 +38,7 @@ void CheckTensorHasNanOrInf(const std::string& api_name, const Tensor& tensor) {
   auto& place = dense_tensor->place();
   if (paddle::platform::is_gpu_place(place)) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-    paddle::framework::details::tensor_check<
-        paddle::platform::CUDADeviceContext>(
+    paddle::framework::details::tensor_check<phi::GPUContext>(
         api_name, tensor_name, *dense_tensor, place);
 #else
     PADDLE_THROW(paddle::platform::errors::PreconditionNotMet(
(file header not captured in this view)
@@ -66,8 +66,7 @@ TEST(Benchmark, FluidScaleCUDA) {

   paddle::platform::DeviceContextPool& pool =
       paddle::platform::DeviceContextPool::Instance();
-  auto* dev_ctx =
-      dynamic_cast<paddle::platform::CUDADeviceContext*>(pool.Get(place));
+  auto* dev_ctx = dynamic_cast<phi::GPUContext*>(pool.Get(place));
   auto stream = dev_ctx->stream();
   paddle::memory::Copy(place,
                        mutable_x,
@@ -121,8 +120,7 @@ TEST(Benchmark, FluidMatmulCUDA) {

   paddle::platform::DeviceContextPool& pool =
       paddle::platform::DeviceContextPool::Instance();
-  auto* dev_ctx =
-      dynamic_cast<paddle::platform::CUDADeviceContext*>(pool.Get(place));
+  auto* dev_ctx = dynamic_cast<phi::GPUContext*>(pool.Get(place));
   auto stream = dev_ctx->stream();

   auto* x_tensor = X->MutableVar()->GetMutable<framework::LoDTensor>();
@@ -181,8 +179,7 @@ TEST(Benchmark, FluidMLPCUDA) {
   for (const std::string& mode : {"Accuracy", "WarmUp", "Performance"}) {
     paddle::platform::DeviceContextPool& pool =
         paddle::platform::DeviceContextPool::Instance();
-    auto* dev_ctx =
-        dynamic_cast<paddle::platform::CUDADeviceContext*>(pool.Get(place));
+    auto* dev_ctx = dynamic_cast<phi::GPUContext*>(pool.Get(place));
     auto stream = dev_ctx->stream();

     std::vector<float> x_src_data(MLP_M * MLP_N, MLP_X_VAL);
6 changes: 2 additions & 4 deletions paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc
@@ -171,8 +171,7 @@ static void FluidCheckTensorValue(const std::shared_ptr<imperative::VarBase>& X,
   if (place == paddle::platform::CUDAPlace()) {
     paddle::platform::DeviceContextPool& pool =
         paddle::platform::DeviceContextPool::Instance();
-    auto* dev_ctx =
-        dynamic_cast<paddle::platform::CUDADeviceContext*>(pool.Get(place));
+    auto* dev_ctx = dynamic_cast<phi::GPUContext*>(pool.Get(place));
     auto stream = dev_ctx->stream();

     paddle::memory::Copy(paddle::platform::CPUPlace(),
@@ -204,8 +203,7 @@ static void FluidCheckGradTensorValue(
   if (place == paddle::platform::CUDAPlace()) {
     paddle::platform::DeviceContextPool& pool =
         paddle::platform::DeviceContextPool::Instance();
-    auto* dev_ctx =
-        dynamic_cast<paddle::platform::CUDADeviceContext*>(pool.Get(place));
+    auto* dev_ctx = dynamic_cast<phi::GPUContext*>(pool.Get(place));
     auto stream = dev_ctx->stream();

     paddle::memory::Copy(paddle::platform::CPUPlace(),
8 changes: 4 additions & 4 deletions paddle/fluid/eager/tests/test_utils.h
@@ -40,8 +40,8 @@ bool CompareGradTensorWithValue(const paddle::experimental::Tensor& target,
 #ifdef PADDLE_WITH_CUDA
   paddle::platform::DeviceContextPool& pool =
       paddle::platform::DeviceContextPool::Instance();
-  auto* dev_ctx = dynamic_cast<paddle::platform::CUDADeviceContext*>(
-      pool.Get(paddle::platform::CUDAPlace()));
+  auto* dev_ctx =
+      dynamic_cast<phi::GPUContext*>(pool.Get(paddle::platform::CUDAPlace()));
   auto stream = dev_ctx->stream();

   paddle::memory::Copy(paddle::platform::CPUPlace(),
@@ -79,8 +79,8 @@ bool CompareTensorWithValue(const paddle::experimental::Tensor& target,
 #ifdef PADDLE_WITH_CUDA
   paddle::platform::DeviceContextPool& pool =
       paddle::platform::DeviceContextPool::Instance();
-  auto* dev_ctx = dynamic_cast<paddle::platform::CUDADeviceContext*>(
-      pool.Get(paddle::platform::CUDAPlace()));
+  auto* dev_ctx =
+      dynamic_cast<phi::GPUContext*>(pool.Get(paddle::platform::CUDAPlace()));
   auto stream = dev_ctx->stream();

   paddle::memory::Copy(paddle::platform::CPUPlace(),
5 changes: 2 additions & 3 deletions paddle/fluid/framework/data_device_transform_test.cu
@@ -92,9 +92,8 @@ REGISTER_OP_WITHOUT_GRADIENT(
     paddle::framework::OpKernelTestProtoAndCheckerMaker);
 REGISTER_OP_CPU_KERNEL(test_op,
                        paddle::framework::TestKernel<phi::CPUContext, float>);
-REGISTER_OP_CUDA_KERNEL(
-    test_op,
-    paddle::framework::TestKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(test_op,
+                        paddle::framework::TestKernel<phi::GPUContext, float>);

 static void BuildVar(const std::string& param_name,
                      std::initializer_list<const char*> arguments,
4 changes: 2 additions & 2 deletions paddle/fluid/framework/data_feed.cc
@@ -2797,7 +2797,7 @@ void SlotRecordInMemoryDataFeed::BuildSlotBatchGPU(const int ins_num) {
 MiniBatchGpuPack::MiniBatchGpuPack(const paddle::platform::Place& place,
                                    const std::vector<UsedSlotInfo>& infos) {
   place_ = place;
-  stream_ = dynamic_cast<platform::CUDADeviceContext*>(
+  stream_ = dynamic_cast<phi::GPUContext*>(
                 platform::DeviceContextPool::Instance().Get(place))
                 ->stream();

@@ -2831,7 +2831,7 @@ MiniBatchGpuPack::~MiniBatchGpuPack() {}

 void MiniBatchGpuPack::reset(const paddle::platform::Place& place) {
   place_ = place;
-  stream_ = dynamic_cast<platform::CUDADeviceContext*>(
+  stream_ = dynamic_cast<phi::GPUContext*>(
                 platform::DeviceContextPool::Instance().Get(place))
                 ->stream();
   ins_num_ = 0;
4 changes: 2 additions & 2 deletions paddle/fluid/framework/data_feed.cu
@@ -80,7 +80,7 @@ void SlotRecordInMemoryDataFeed::FillSlotValueOffset(
     const int float_slot_size,
     const UsedSlotGpuType *used_slots) {
   auto stream =
-      dynamic_cast<platform::CUDADeviceContext *>(
+      dynamic_cast<phi::GPUContext *>(
           paddle::platform::DeviceContextPool::Instance().Get(this->place_))
           ->stream();
   FillSlotValueOffsetKernel<<<GET_BLOCKS(used_slot_num),
@@ -159,7 +159,7 @@ void SlotRecordInMemoryDataFeed::CopyForTensor(
     const int float_slot_size,
     const UsedSlotGpuType *used_slots) {
   auto stream =
-      dynamic_cast<platform::CUDADeviceContext *>(
+      dynamic_cast<phi::GPUContext *>(
          paddle::platform::DeviceContextPool::Instance().Get(this->place_))
          ->stream();

4 changes: 2 additions & 2 deletions paddle/fluid/framework/data_type_transform.cc
@@ -103,8 +103,8 @@ struct CastDataType {
           CastDataTypeFunctor<InType, OutType>());
 #if defined(__NVCC__) || defined(__HIPCC__)
     } else if (platform::is_gpu_place(in_.place())) {
-      platform::Transform<platform::CUDADeviceContext> trans;
-      auto* context = static_cast<const platform::CUDADeviceContext*>(ctx_);
+      platform::Transform<phi::GPUContext> trans;
+      auto* context = static_cast<const phi::GPUContext*>(ctx_);
       trans(*context,
             in_begin,
             in_end,
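For context, `platform::Transform<phi::GPUContext>` applies an element-wise functor on the context's stream, which is what `CastDataType` uses above. A self-contained sketch of the same idiom with a locally defined cast functor; function names and include paths are assumptions:

```cpp
// Sketch only: element-wise cast on GPU via platform::Transform, as in
// CastDataType above. The functor is defined locally for self-containment.
#include "paddle/fluid/platform/transform.h"
#include "paddle/phi/core/hostdevice.h"

template <typename InT, typename OutT>
struct CastSketchFunctor {
  HOSTDEVICE OutT operator()(InT v) const { return static_cast<OutT>(v); }
};

template <typename InT, typename OutT>
void GpuCastSketch(const phi::GPUContext& ctx,
                   const InT* in_begin,
                   const InT* in_end,
                   OutT* out_begin) {
  paddle::platform::Transform<phi::GPUContext> trans;
  // Runs asynchronously on ctx.stream(); synchronize before reading results.
  trans(ctx, in_begin, in_end, out_begin, CastSketchFunctor<InT, OutT>());
}
```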
2 changes: 1 addition & 1 deletion paddle/fluid/framework/data_type_transform_test.cu
@@ -19,7 +19,7 @@ limitations under the License. */
 TEST(DataTypeTransform, GPUTransform) {
   auto cpu_place = paddle::platform::CPUPlace();
   auto gpu_place = paddle::platform::CUDAPlace(0);
-  paddle::platform::CUDADeviceContext context(gpu_place);
+  phi::GPUContext context(gpu_place);
   context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                            .GetAllocator(gpu_place, context.stream())
                            .get());
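Note the context lines above: a test that constructs `phi::GPUContext` directly wires it to its allocator explicitly via `SetAllocator`. A hedged sketch of that setup; only the `SetAllocator` call visible in this hunk is reproduced, include paths are assumed, and real tests may need further wiring beyond what this view shows:

```cpp
// Sketch only: standing up a phi::GPUContext by hand in a test, following
// the context lines above.
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/phi/backends/gpu/gpu_context.h"

void BuildTestGpuContext() {
  auto gpu_place = paddle::platform::CUDAPlace(0);
  phi::GPUContext context(gpu_place);
  // A hand-built context is wired to the device allocator for its stream.
  context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                           .GetAllocator(gpu_place, context.stream())
                           .get());
}
```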
2 changes: 1 addition & 1 deletion paddle/fluid/framework/details/broadcast_op_handle_test.h
@@ -105,7 +105,7 @@ struct TestBroadcastOpHandle {
     for (int i = 0; i < count; ++i) {
       auto p = p::CUDAPlace(i);
       place_list_.push_back(p);
-      ctxs_.emplace_back(new p::CUDADeviceContext(p));
+      ctxs_.emplace_back(new phi::GPUContext(p));
     }
     nccl_ctxs_.reset(new platform::NCCLContextMap(place_list_));
 #else
2 changes: 1 addition & 1 deletion paddle/fluid/framework/details/eager_deletion_op_handle.cc
@@ -46,7 +46,7 @@ EagerDeletionOpHandle::EagerDeletionOpHandle(
     gc_(gc) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   if (platform::is_gpu_place(place)) {
-    dev_ctx_ = reinterpret_cast<platform::CUDADeviceContext *>(
+    dev_ctx_ = reinterpret_cast<phi::GPUContext *>(
         platform::DeviceContextPool::Instance().Get(place));
     if (dynamic_cast<StreamGarbageCollector *>(gc_)) {
       platform::CUDADeviceGuard guard(place.device);
2 changes: 1 addition & 1 deletion paddle/fluid/framework/details/eager_deletion_op_handle.h
@@ -81,7 +81,7 @@ class EagerDeletionOpHandle : public OpHandleBase {
   GarbageCollector *gc_;  // not own
   std::vector<Variable *> vars_;
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  platform::CUDADeviceContext *dev_ctx_{nullptr};
+  phi::GPUContext *dev_ctx_{nullptr};
   gpuEvent_t event_{nullptr};
 #endif
 };