2 changes: 1 addition & 1 deletion paddle/fluid/distributed/collective/HCCLTools.h
@@ -94,7 +94,7 @@ class NPUEventManager {
     PADDLE_ENFORCE_EQ(device_index,
                       device_index_,
                       platform::errors::PreconditionNotMet(
-                          "CUDADeviceContext's device %d does not match"
+                          "phi::GPUContext's device %d does not match"
                           "Event's device %d",
                           device_index,
                           device_index_));
8 changes: 4 additions & 4 deletions paddle/fluid/distributed/collective/NCCLTools.h
@@ -104,15 +104,15 @@ class EventManager {
   bool DeviceId() const { return device_index_; }
   gpuEvent_t GetRawCudaEvent() const { return event_; }

-  void Record(const paddle::platform::CUDADeviceContext& ctx) {
+  void Record(const phi::GPUContext& ctx) {
     auto device_index = ctx.GetPlace().device;
     if (!is_created_) {
       CreateEvent(device_index);
     }
     PADDLE_ENFORCE_EQ(device_index,
                       device_index_,
                       platform::errors::PreconditionNotMet(
-                          "CUDADeviceContext's device %d does not match"
+                          "phi::GPUContext's device %d does not match"
                           "Event's device %d",
                           device_index,
                           device_index_));
@@ -157,13 +157,13 @@ class EventManager {
     }
   }

-  void Block(const paddle::platform::CUDADeviceContext& ctx) const {
+  void Block(const phi::GPUContext& ctx) const {
     if (is_created_) {
       auto device_index = ctx.GetPlace().device;
       PADDLE_ENFORCE_EQ(device_index,
                         device_index_,
                         platform::errors::PreconditionNotMet(
-                            "CUDADeviceContext's device %d does not match"
+                            "phi::GPUContext's device %d does not match"
                             "Event's device %d",
                             device_index,
                             device_index_));
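After this change, callers hand `EventManager::Record` and `EventManager::Block` a `phi::GPUContext` instead of the retired `paddle::platform::CUDADeviceContext`. A minimal usage sketch of the new signatures, mirroring the `SyncDefaultStream` hunk in the next file; the helper name and include paths are assumptions, not part of this PR:

```cpp
// Sketch only: assumes these header paths; mirrors SyncDefaultStream below.
#include "paddle/fluid/distributed/collective/NCCLTools.h"
#include "paddle/fluid/platform/device_context.h"

namespace paddle {
namespace distributed {

// Make a private context's stream wait for work already queued on the
// default stream of `place`.
void SyncWithDefaultStream(const platform::Place& place,
                           EventManager* event,
                           const phi::GPUContext& private_ctx) {
  // The pooled default context is now cast to phi::GPUContext.
  auto* default_ctx = static_cast<phi::GPUContext*>(
      platform::DeviceContextPool::Instance().Get(place));
  event->Record(*default_ctx);  // record on the default stream
  event->Block(private_ctx);    // private stream waits on the event
}

}  // namespace distributed
}  // namespace paddle
```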
12 changes: 6 additions & 6 deletions paddle/fluid/distributed/collective/ProcessGroupNCCL.cc
@@ -31,10 +31,10 @@ namespace distributed {

 void SyncDefaultStream(
     const std::vector<Place>& places,
-    std::vector<EventManager>& ncclEvents,                        // NOLINT
-    std::vector<std::unique_ptr<CUDADeviceContext>>& dev_ctx) {   // NOLINT
+    std::vector<EventManager>& ncclEvents,                      // NOLINT
+    std::vector<std::unique_ptr<phi::GPUContext>>& dev_ctx) {   // NOLINT
   for (size_t i = 0; i < places.size(); ++i) {
-    auto* default_ctx = static_cast<platform::CUDADeviceContext*>(
+    auto* default_ctx = static_cast<phi::GPUContext*>(
         platform::DeviceContextPool::Instance().Get(places[i]));
     ncclEvents[i].Record(*default_ctx);
     ncclEvents[i].Block(*dev_ctx[i]);
@@ -69,7 +69,7 @@ void ProcessGroupNCCL::NCCLTask::SetOutputs(

 void ProcessGroupNCCL::NCCLTask::SynchronizeStreams() {
   for (size_t i = 0; i < places_.size(); ++i) {
-    auto* default_ctx = static_cast<platform::CUDADeviceContext*>(
+    auto* default_ctx = static_cast<phi::GPUContext*>(
         platform::DeviceContextPool::Instance().Get(places_[i]));
     default_ctx->WaitEvent(control_events_[i].GetRawCudaEvent());
   }
@@ -201,15 +201,15 @@ void ProcessGroupNCCL::CreateNCCLManagerCache(
           << ", place: " << places_key
           << ", nccl uniqueid: " << SerializeNCCLUniqueId(nccl_id);

-  std::vector<std::unique_ptr<CUDADeviceContext>> dev_ctx;
+  std::vector<std::unique_ptr<phi::GPUContext>> dev_ctx;
   dev_ctx.resize(places.size());

   PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart());

   for (size_t i = 0; i < places.size(); ++i) {
     platform::CUDADeviceGuard guard(places[i]);
     nccl_comms[i] = NCCLCommManager::Create(GetSize(), GetRank(), nccl_id);
-    dev_ctx[i].reset(new CUDADeviceContext(places[i]));
+    dev_ctx[i].reset(new phi::GPUContext(places[i]));
   }

   PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd());
4 changes: 1 addition & 3 deletions paddle/fluid/distributed/collective/ProcessGroupNCCL.h
@@ -45,7 +45,6 @@ namespace paddle {
 namespace distributed {

 using Place = paddle::platform::Place;
-using CUDADeviceContext = paddle::platform::CUDADeviceContext;

 class ProcessGroupNCCL : public ProcessGroup {
  public:
@@ -174,8 +173,7 @@ class ProcessGroupNCCL : public ProcessGroup {

   std::unordered_map<std::string, std::vector<EventManager>> places_to_events_;

-  std::unordered_map<std::string,
-                     std::vector<std::unique_ptr<CUDADeviceContext>>>
+  std::unordered_map<std::string, std::vector<std::unique_ptr<phi::GPUContext>>>
       places_to_ctx_;

   std::set<int> used_place_ids_;
6 changes: 3 additions & 3 deletions paddle/fluid/distributed/collective/reducer.cc
@@ -241,7 +241,7 @@ static void SplitTensorsWithType(const DeviceContext &context,
 void EagerGroup::ConcatTensors(const platform::Place &place) {
   if (platform::is_gpu_place(place)) {
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
-    auto *default_ctx = static_cast<platform::CUDADeviceContext *>(
+    auto *default_ctx = static_cast<phi::GPUContext *>(
         platform::DeviceContextPool::Instance().Get(place));
     ConcatTensorsWithType(
         *default_ctx, dense_tensors_, &dense_contents_, dtype_);
@@ -264,7 +264,7 @@ void EagerGroup::ConcatTensors(const platform::Place &place) {
 void EagerGroup::SplitTensors(const platform::Place &place) {
   if (platform::is_gpu_place(place)) {
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
-    auto *default_ctx = static_cast<platform::CUDADeviceContext *>(
+    auto *default_ctx = static_cast<phi::GPUContext *>(
         platform::DeviceContextPool::Instance().Get(place));
     SplitTensorsWithType(
         *default_ctx, &dense_contents_, &dense_tensors_, dtype_);
@@ -883,7 +883,7 @@ void EagerReducer::AllReduceSparse(EagerGroup *group,
   auto *dev_ctx = platform::DeviceContextPool::Instance().Get(inner_place_);
   if (platform::is_gpu_place(inner_place_)) {
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
-    dev_ctx = static_cast<platform::CUDADeviceContext *>(
+    dev_ctx = static_cast<phi::GPUContext *>(
         platform::DeviceContextPool::Instance().Get(inner_place_));
 #else
     PADDLE_THROW(platform::errors::PermissionDenied(
3 changes: 1 addition & 2 deletions paddle/fluid/distributed/fleet_executor/dist_model.cc
@@ -78,8 +78,7 @@ bool LoadDataFromDistModelTensor(const DistModelTensor &input_data,
     VLOG(3) << "Loading data for GPU.";
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-    auto *dev_ctx =
-        dynamic_cast<const platform::CUDADeviceContext *>(pool.Get(place));
+    auto *dev_ctx = dynamic_cast<const phi::GPUContext *>(pool.Get(place));
     auto gpu_place = place;
     memory::Copy(gpu_place,
                  static_cast<void *>(input_tensor_ptr),
12 changes: 4 additions & 8 deletions paddle/fluid/distributed/ps/service/brpc_utils.cc
@@ -119,8 +119,7 @@ void SerializeLodTensor(framework::Variable* var,
     char* temp_ptr =
         new char[tensor->numel() *
                  framework::DataTypeSize(tensor->dtype())];  // NOLINT
-    auto stream =
-        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
+    auto stream = reinterpret_cast<const phi::GPUContext&>(ctx).stream();
     memory::Copy(
         platform::CPUPlace(),
         temp_ptr,
@@ -168,8 +167,7 @@ void SerializeSelectedRows(framework::Variable* var,
     char* temp_ptr =
         new char[tensor->numel() *
                  framework::DataTypeSize(tensor->dtype())];  // NOLINT
-    auto stream =
-        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
+    auto stream = reinterpret_cast<const phi::GPUContext&>(ctx).stream();
     memory::Copy(
         platform::CPUPlace(),
         temp_ptr,
@@ -265,8 +263,7 @@ void DeserializeLodTensor(framework::Variable* var,
                  framework::DataTypeSize(tensor->dtype())];    // NOLINT
     io_buffer_itr.copy_and_forward((void*)(&data_len), 8);     // NOLINT
     io_buffer_itr.copy_and_forward((void*)temp_ptr, data_len);  // NOLINT
-    auto stream =
-        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
+    auto stream = reinterpret_cast<const phi::GPUContext&>(ctx).stream();
     memory::Copy(place,
                  tensor_data,
                  platform::CPUPlace(),
@@ -311,8 +308,7 @@ void DeserializeSelectedRows(
     unsigned long data_len;  // NOLINT
     io_buffer_itr.copy_and_forward((void*)(&data_len), 8);  // NOLINT
     io_buffer_itr.copy_and_forward(temp_ptr, data_len);
-    auto stream =
-        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
+    auto stream = reinterpret_cast<const phi::GPUContext&>(ctx).stream();
     memory::Copy(place,
                  tensor_data,
                  platform::CPUPlace(),
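The serialization hunks above all share one pattern: the copy stream now comes from `phi::GPUContext`. A hedged sketch of that pattern as a standalone helper; the function name, the `std::vector` buffer, and the include paths are illustrative assumptions rather than PR code:

```cpp
// Sketch only: device-to-host copy using the stream of the passed-in
// context, as in SerializeLodTensor above. Header paths are assumed.
#include <vector>

#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/device_context.h"

std::vector<char> CopyTensorToHost(const paddle::framework::Tensor& tensor,
                                   const paddle::platform::DeviceContext& ctx) {
  std::vector<char> buf(tensor.numel() *
                        paddle::framework::DataTypeSize(tensor.dtype()));
  // The caller guarantees `ctx` is a GPU context, so the reinterpret_cast
  // is safe here, matching the hunks above.
  auto stream = reinterpret_cast<const phi::GPUContext&>(ctx).stream();
  paddle::memory::Copy(paddle::platform::CPUPlace(),
                       buf.data(),
                       tensor.place(),
                       tensor.data(),
                       buf.size(),
                       stream);
  return buf;
}
```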
3 changes: 1 addition & 2 deletions paddle/fluid/distributed/ps/service/heter_client.cc
@@ -43,8 +43,7 @@ int GetMicroId(const platform::DeviceContext& ctx,
   std::vector<char> temp;
   temp.resize(tensor->numel() * framework::DataTypeSize(tensor->dtype()));
   char* temp_ptr = temp.data();
-  auto stream =
-      reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
+  auto stream = reinterpret_cast<const phi::GPUContext&>(ctx).stream();
   memory::Copy(platform::CPUPlace(),
                temp_ptr,
                tensor->place(),
(file header not captured in this view)
@@ -134,21 +134,20 @@ void ScaleAPI(const paddle::experimental::Tensor& x,

 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   } else if (expected_kernel_place == paddle::platform::CUDAPlace()) {
-    auto* dev_ctx = dynamic_cast<paddle::platform::CUDADeviceContext*>(
-        pool.Get(expected_kernel_place));
+    auto* dev_ctx =
+        dynamic_cast<phi::GPUContext*>(pool.Get(expected_kernel_place));
     if (!dev_ctx) {
       PADDLE_THROW(paddle::platform::errors::Fatal(
           "Cannot convert device_context to CUDADeviceContext."
           "This indicates backend mismatch."
           "Pleas double check your expected place"));
     }
-    ScaleDeviceDispatch<paddle::platform::CUDADeviceContext>(
-        *dense_tensor.get(),
-        *dev_ctx,
-        scale,
-        bias,
-        bias_after_scale,
-        dense_out.get());
+    ScaleDeviceDispatch<phi::GPUContext>(*dense_tensor.get(),
+                                         *dev_ctx,
+                                         scale,
+                                         bias,
+                                         bias_after_scale,
+                                         dense_out.get());
 #endif
   } else {
     PADDLE_THROW(paddle::platform::errors::Fatal(
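The `ScaleDeviceDispatch` call above illustrates the general dispatch shape this PR preserves: a helper templated on the context type, selected at runtime via `dynamic_cast` so a backend mismatch is detected rather than silently miscast. A minimal sketch under assumed names (`ScaleDispatchSketch` and `RunScaleSketch` are not PR code, and the helper body is a stub):

```cpp
// Sketch only: runtime backend selection for a context-templated helper.
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/core/dense_tensor.h"

template <typename DeviceContext>
void ScaleDispatchSketch(const phi::DenseTensor& in,
                         const DeviceContext& dev_ctx,
                         float scale,
                         phi::DenseTensor* out) {
  // Stub: a real implementation would launch the scale computation on
  // `dev_ctx`, as ScaleDeviceDispatch does in the PR.
}

void RunScaleSketch(const paddle::platform::Place& place,
                    const phi::DenseTensor& in,
                    float scale,
                    phi::DenseTensor* out) {
  auto& pool = paddle::platform::DeviceContextPool::Instance();
  if (paddle::platform::is_gpu_place(place)) {
    // dynamic_cast (not static_cast) so a mismatched backend yields nullptr
    // instead of undefined behavior; the PR throws Fatal in that case.
    auto* dev_ctx = dynamic_cast<phi::GPUContext*>(pool.Get(place));
    if (dev_ctx != nullptr) {
      ScaleDispatchSketch<phi::GPUContext>(in, *dev_ctx, scale, out);
    }
  }
}
```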
3 changes: 1 addition & 2 deletions paddle/fluid/eager/nan_inf_utils.cc
@@ -38,8 +38,7 @@ void CheckTensorHasNanOrInf(const std::string& api_name, const Tensor& tensor) {
   auto& place = dense_tensor->place();
   if (paddle::platform::is_gpu_place(place)) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-    paddle::framework::details::tensor_check<
-        paddle::platform::CUDADeviceContext>(
+    paddle::framework::details::tensor_check<phi::GPUContext>(
         api_name, tensor_name, *dense_tensor, place);
 #else
     PADDLE_THROW(paddle::platform::errors::PreconditionNotMet(
(file header not captured in this view)
@@ -66,8 +66,7 @@ TEST(Benchmark, FluidScaleCUDA) {

   paddle::platform::DeviceContextPool& pool =
       paddle::platform::DeviceContextPool::Instance();
-  auto* dev_ctx =
-      dynamic_cast<paddle::platform::CUDADeviceContext*>(pool.Get(place));
+  auto* dev_ctx = dynamic_cast<phi::GPUContext*>(pool.Get(place));
   auto stream = dev_ctx->stream();
   paddle::memory::Copy(place,
                        mutable_x,
@@ -121,8 +120,7 @@ TEST(Benchmark, FluidMatmulCUDA) {

   paddle::platform::DeviceContextPool& pool =
       paddle::platform::DeviceContextPool::Instance();
-  auto* dev_ctx =
-      dynamic_cast<paddle::platform::CUDADeviceContext*>(pool.Get(place));
+  auto* dev_ctx = dynamic_cast<phi::GPUContext*>(pool.Get(place));
   auto stream = dev_ctx->stream();

   auto* x_tensor = X->MutableVar()->GetMutable<framework::LoDTensor>();
@@ -181,8 +179,7 @@ TEST(Benchmark, FluidMLPCUDA) {
   for (const std::string& mode : {"Accuracy", "WarmUp", "Performance"}) {
     paddle::platform::DeviceContextPool& pool =
         paddle::platform::DeviceContextPool::Instance();
-    auto* dev_ctx =
-        dynamic_cast<paddle::platform::CUDADeviceContext*>(pool.Get(place));
+    auto* dev_ctx = dynamic_cast<phi::GPUContext*>(pool.Get(place));
     auto stream = dev_ctx->stream();

     std::vector<float> x_src_data(MLP_M * MLP_N, MLP_X_VAL);
6 changes: 2 additions & 4 deletions paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc
@@ -171,8 +171,7 @@ static void FluidCheckTensorValue(const std::shared_ptr<imperative::VarBase>& X,
   if (place == paddle::platform::CUDAPlace()) {
     paddle::platform::DeviceContextPool& pool =
         paddle::platform::DeviceContextPool::Instance();
-    auto* dev_ctx =
-        dynamic_cast<paddle::platform::CUDADeviceContext*>(pool.Get(place));
+    auto* dev_ctx = dynamic_cast<phi::GPUContext*>(pool.Get(place));
     auto stream = dev_ctx->stream();

     paddle::memory::Copy(paddle::platform::CPUPlace(),
@@ -204,8 +203,7 @@ static void FluidCheckGradTensorValue(
   if (place == paddle::platform::CUDAPlace()) {
     paddle::platform::DeviceContextPool& pool =
         paddle::platform::DeviceContextPool::Instance();
-    auto* dev_ctx =
-        dynamic_cast<paddle::platform::CUDADeviceContext*>(pool.Get(place));
+    auto* dev_ctx = dynamic_cast<phi::GPUContext*>(pool.Get(place));
     auto stream = dev_ctx->stream();

     paddle::memory::Copy(paddle::platform::CPUPlace(),
8 changes: 4 additions & 4 deletions paddle/fluid/eager/tests/test_utils.h
@@ -40,8 +40,8 @@ bool CompareGradTensorWithValue(const paddle::experimental::Tensor& target,
 #ifdef PADDLE_WITH_CUDA
   paddle::platform::DeviceContextPool& pool =
       paddle::platform::DeviceContextPool::Instance();
-  auto* dev_ctx = dynamic_cast<paddle::platform::CUDADeviceContext*>(
-      pool.Get(paddle::platform::CUDAPlace()));
+  auto* dev_ctx =
+      dynamic_cast<phi::GPUContext*>(pool.Get(paddle::platform::CUDAPlace()));
   auto stream = dev_ctx->stream();

   paddle::memory::Copy(paddle::platform::CPUPlace(),
@@ -79,8 +79,8 @@ bool CompareTensorWithValue(const paddle::experimental::Tensor& target,
 #ifdef PADDLE_WITH_CUDA
   paddle::platform::DeviceContextPool& pool =
       paddle::platform::DeviceContextPool::Instance();
-  auto* dev_ctx = dynamic_cast<paddle::platform::CUDADeviceContext*>(
-      pool.Get(paddle::platform::CUDAPlace()));
+  auto* dev_ctx =
+      dynamic_cast<phi::GPUContext*>(pool.Get(paddle::platform::CUDAPlace()));
   auto stream = dev_ctx->stream();

   paddle::memory::Copy(paddle::platform::CPUPlace(),
5 changes: 2 additions & 3 deletions paddle/fluid/framework/data_device_transform_test.cu
@@ -92,9 +92,8 @@ REGISTER_OP_WITHOUT_GRADIENT(
     paddle::framework::OpKernelTestProtoAndCheckerMaker);
 REGISTER_OP_CPU_KERNEL(test_op,
                        paddle::framework::TestKernel<phi::CPUContext, float>);
-REGISTER_OP_CUDA_KERNEL(
-    test_op,
-    paddle::framework::TestKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(test_op,
+                        paddle::framework::TestKernel<phi::GPUContext, float>);

 static void BuildVar(const std::string& param_name,
                      std::initializer_list<const char*> arguments,
4 changes: 2 additions & 2 deletions paddle/fluid/framework/data_feed.cc
@@ -2797,7 +2797,7 @@ void SlotRecordInMemoryDataFeed::BuildSlotBatchGPU(const int ins_num) {
 MiniBatchGpuPack::MiniBatchGpuPack(const paddle::platform::Place& place,
                                    const std::vector<UsedSlotInfo>& infos) {
   place_ = place;
-  stream_ = dynamic_cast<platform::CUDADeviceContext*>(
+  stream_ = dynamic_cast<phi::GPUContext*>(
                 platform::DeviceContextPool::Instance().Get(place))
                 ->stream();

@@ -2831,7 +2831,7 @@ MiniBatchGpuPack::~MiniBatchGpuPack() {}

 void MiniBatchGpuPack::reset(const paddle::platform::Place& place) {
   place_ = place;
-  stream_ = dynamic_cast<platform::CUDADeviceContext*>(
+  stream_ = dynamic_cast<phi::GPUContext*>(
                 platform::DeviceContextPool::Instance().Get(place))
                 ->stream();
   ins_num_ = 0;
4 changes: 2 additions & 2 deletions paddle/fluid/framework/data_feed.cu
@@ -80,7 +80,7 @@ void SlotRecordInMemoryDataFeed::FillSlotValueOffset(
     const int float_slot_size,
     const UsedSlotGpuType *used_slots) {
   auto stream =
-      dynamic_cast<platform::CUDADeviceContext *>(
+      dynamic_cast<phi::GPUContext *>(
           paddle::platform::DeviceContextPool::Instance().Get(this->place_))
           ->stream();
   FillSlotValueOffsetKernel<<<GET_BLOCKS(used_slot_num),
@@ -159,7 +159,7 @@ void SlotRecordInMemoryDataFeed::CopyForTensor(
     const int float_slot_size,
     const UsedSlotGpuType *used_slots) {
   auto stream =
-      dynamic_cast<platform::CUDADeviceContext *>(
+      dynamic_cast<phi::GPUContext *>(
          paddle::platform::DeviceContextPool::Instance().Get(this->place_))
          ->stream();

4 changes: 2 additions & 2 deletions paddle/fluid/framework/data_type_transform.cc
@@ -103,8 +103,8 @@ struct CastDataType {
           CastDataTypeFunctor<InType, OutType>());
 #if defined(__NVCC__) || defined(__HIPCC__)
     } else if (platform::is_gpu_place(in_.place())) {
-      platform::Transform<platform::CUDADeviceContext> trans;
-      auto* context = static_cast<const platform::CUDADeviceContext*>(ctx_);
+      platform::Transform<phi::GPUContext> trans;
+      auto* context = static_cast<const phi::GPUContext*>(ctx_);
       trans(*context,
             in_begin,
             in_end,
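For context, `platform::Transform<phi::GPUContext>` applies an element-wise functor on the context's stream, which is what `CastDataType` uses above. A self-contained sketch of the same idiom with a locally defined cast functor; function names and include paths are assumptions:

```cpp
// Sketch only: element-wise cast on GPU via platform::Transform, as in
// CastDataType above. The functor is defined locally for self-containment.
#include "paddle/fluid/platform/transform.h"
#include "paddle/phi/core/hostdevice.h"

template <typename InT, typename OutT>
struct CastSketchFunctor {
  HOSTDEVICE OutT operator()(InT v) const { return static_cast<OutT>(v); }
};

template <typename InT, typename OutT>
void GpuCastSketch(const phi::GPUContext& ctx,
                   const InT* in_begin,
                   const InT* in_end,
                   OutT* out_begin) {
  paddle::platform::Transform<phi::GPUContext> trans;
  // Runs asynchronously on ctx.stream(); synchronize before reading results.
  trans(ctx, in_begin, in_end, out_begin, CastSketchFunctor<InT, OutT>());
}
```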
2 changes: 1 addition & 1 deletion paddle/fluid/framework/data_type_transform_test.cu
@@ -19,7 +19,7 @@ limitations under the License. */
 TEST(DataTypeTransform, GPUTransform) {
   auto cpu_place = paddle::platform::CPUPlace();
   auto gpu_place = paddle::platform::CUDAPlace(0);
-  paddle::platform::CUDADeviceContext context(gpu_place);
+  phi::GPUContext context(gpu_place);
   context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                            .GetAllocator(gpu_place, context.stream())
                            .get());
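Note the context lines above: a test that constructs `phi::GPUContext` directly wires it to its allocator explicitly via `SetAllocator`. A hedged sketch of that setup; only the `SetAllocator` call visible in this hunk is reproduced, include paths are assumed, and real tests may need further wiring beyond what this view shows:

```cpp
// Sketch only: standing up a phi::GPUContext by hand in a test, following
// the context lines above.
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/phi/backends/gpu/gpu_context.h"

void BuildTestGpuContext() {
  auto gpu_place = paddle::platform::CUDAPlace(0);
  phi::GPUContext context(gpu_place);
  // A hand-built context is wired to the device allocator for its stream.
  context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                           .GetAllocator(gpu_place, context.stream())
                           .get());
}
```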
2 changes: 1 addition & 1 deletion paddle/fluid/framework/details/broadcast_op_handle_test.h
@@ -105,7 +105,7 @@ struct TestBroadcastOpHandle {
     for (int i = 0; i < count; ++i) {
       auto p = p::CUDAPlace(i);
       place_list_.push_back(p);
-      ctxs_.emplace_back(new p::CUDADeviceContext(p));
+      ctxs_.emplace_back(new phi::GPUContext(p));
     }
     nccl_ctxs_.reset(new platform::NCCLContextMap(place_list_));
 #else
2 changes: 1 addition & 1 deletion paddle/fluid/framework/details/eager_deletion_op_handle.cc
@@ -46,7 +46,7 @@ EagerDeletionOpHandle::EagerDeletionOpHandle(
     gc_(gc) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   if (platform::is_gpu_place(place)) {
-    dev_ctx_ = reinterpret_cast<platform::CUDADeviceContext *>(
+    dev_ctx_ = reinterpret_cast<phi::GPUContext *>(
         platform::DeviceContextPool::Instance().Get(place));
     if (dynamic_cast<StreamGarbageCollector *>(gc_)) {
       platform::CUDADeviceGuard guard(place.device);
2 changes: 1 addition & 1 deletion paddle/fluid/framework/details/eager_deletion_op_handle.h
@@ -81,7 +81,7 @@ class EagerDeletionOpHandle : public OpHandleBase {
   GarbageCollector *gc_;  // not own
   std::vector<Variable *> vars_;
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  platform::CUDADeviceContext *dev_ctx_{nullptr};
+  phi::GPUContext *dev_ctx_{nullptr};
   gpuEvent_t event_{nullptr};
 #endif
 };