diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc
index d540f907a6445..b62ba99df7d74 100644
--- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc
+++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h"
+#include <thread>
 
 #include "paddle/fluid/platform/profiler/event_tracing.h"
@@ -194,8 +195,9 @@ phi::Allocation* StreamSafeCUDAAllocator::AllocateImpl(size_t size) {
       static_unique_ptr_cast<Allocation>(std::move(underlying_allocation)),
       default_stream_,
       this);
-  VLOG(8) << "Allocate " << allocation->size() << " bytes at address "
-          << allocation->ptr() << " , stream: " << default_stream_;
+  VLOG(8) << "Thread " << std::this_thread::get_id() << " Allocate "
+          << allocation->size() << " bytes at address " << allocation->ptr()
+          << " , stream: " << default_stream_;
   return allocation;
 }
diff --git a/paddle/fluid/operators/fc_op.h b/paddle/fluid/operators/fc_op.h
index 52b2a98a9daaa..24380b29ee125 100644
--- a/paddle/fluid/operators/fc_op.h
+++ b/paddle/fluid/operators/fc_op.h
@@ -69,6 +69,8 @@ class FCOpKernel : public framework::OpKernel<T> {
     auto w_dims = w->dims();
     bool padding_weights = ctx.Attr<bool>("padding_weights");
 
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+
     std::vector<int64_t> output_dims;
     FCOutputSize(
         input->dims(), w_dims, output_dims, in_num_col_dims, padding_weights);
@@ -82,9 +84,9 @@ class FCOpKernel : public framework::OpKernel<T> {
 
     const T* input_data = input->data<T>();
     const T* w_data = w->data<T>();
-    T* output_data = output->mutable_data<T>(ctx.GetPlace());
+    auto* output_data =
+        dev_ctx.template Alloc<T>(output, output->numel() * sizeof(T));
 
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
     phi::funcs::FCFunctor<DeviceContext, T> fc;
     fc(dev_ctx,
        M,
diff --git a/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu b/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu
index ac09d337e9bb4..75e131b2deb34 100644
--- a/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu
+++ b/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu
@@ -15,6 +15,7 @@
 #include
 
 #include
+#include
 #include
 
 #include "paddle/fluid/framework/convert_utils.h"
@@ -49,12 +50,16 @@ class EmbeddingEltWiseLayerNormKernel : public framework::OpKernel<T> {
 #else
     cudaGetDevice(&device_id);
 #endif
+
+    auto &dev_ctx = context.template device_context<phi::GPUContext>();
+
     in_ids_.Resize(in_dim);
     in_embs_.Resize(in_dim);
-    int64_t *in_ids_d =
-        in_ids_.mutable_data<int64_t>(platform::CUDAPlace(device_id));
-    int64_t *in_embs_d =
-        in_embs_.mutable_data<int64_t>(platform::CUDAPlace(device_id));
+
+    int64_t *in_ids_d = dev_ctx.template Alloc<int64_t>(
+        &in_ids_, in_ids_.numel() * sizeof(int64_t));
+    int64_t *in_embs_d = dev_ctx.template Alloc<int64_t>(
+        &in_embs_, in_embs_.numel() * sizeof(int64_t));
 
     std::vector<uint64_t> in1s, in2s;
     for (int i = 0; i < input_num; ++i) {
@@ -99,7 +104,8 @@ class EmbeddingEltWiseLayerNormKernel : public framework::OpKernel<T> {
     auto *bias_d = bias->data<T>();
     auto *scale_d = scale->data<T>();
-    auto *output_d = out->mutable_data<T>(context.GetPlace());
+    auto *output_d = dev_ctx.template Alloc<T>(out, out->numel() * sizeof(T));
+
     float eps = context.Attr<float>("epsilon");
 
     if (std::is_same<T, paddle::platform::float16>::value) {
diff --git a/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu b/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu
index 3e117c45359b1..758fb8a23f8f9 100644
--- a/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu
+++ b/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu
@@ -395,9 +395,10 @@ class FusedFCElementwiseLayerNormOpKernel : public framework::OpKernel<T> {
     const T* x_data = x->data<T>();
     const T* w_data = w->data<T>();
-    T* out_data = out->mutable_data<T>(ctx.GetPlace());
 
     auto& dev_ctx = ctx.template device_context<phi::GPUContext>();
+    auto* out_data = dev_ctx.template Alloc<T>(out, out->numel() * sizeof(T));
+
     auto blas = phi::funcs::GetBlas<phi::GPUContext, T>(dev_ctx);
     blas.GEMM(false,
               false,
@@ -425,9 +426,12 @@ class FusedFCElementwiseLayerNormOpKernel : public framework::OpKernel<T> {
     auto* mean = ctx.Output<framework::Tensor>("Mean");
     auto* variance = ctx.Output<framework::Tensor>("Variance");
-    T* mean_data = mean ? mean->mutable_data<T>(ctx.GetPlace()) : nullptr;
-    T* variance_data =
-        variance ? variance->mutable_data<T>(ctx.GetPlace()) : nullptr;
+    T* mean_data =
+        mean ? dev_ctx.template Alloc<T>(mean, mean->numel() * sizeof(T))
+             : nullptr;
+    T* variance_data = variance ? dev_ctx.template Alloc<T>(
+                                      variance, variance->numel() * sizeof(T))
+                                : nullptr;
 
     bool with_relu =
         (ctx.Attr<std::string>("activation_type") == "relu") ? true : false;
diff --git a/paddle/fluid/operators/fused/multihead_matmul_op.cu b/paddle/fluid/operators/fused/multihead_matmul_op.cu
index 16ab0d916d914..c2e2754830bbd 100644
--- a/paddle/fluid/operators/fused/multihead_matmul_op.cu
+++ b/paddle/fluid/operators/fused/multihead_matmul_op.cu
@@ -287,7 +287,8 @@ class MultiHeadMatMulV2Kernel : public framework::OpKernel<T> {
     // if bias_qk is[batch, 1, 1, seq_len], the bias_qk_d need to be broadcasted
     if (bias_qk && bias_qk->numel() == (batch * seq_len)) {
       temp_bias_tensor.Resize({batch * head_number * seq_len * seq_len});
-      auto *temp_qk_bias = temp_bias_tensor.mutable_data<T>(context.GetPlace());
+      auto *temp_qk_bias = device_ctx.template Alloc<T>(
+          &temp_bias_tensor, temp_bias_tensor.numel() * sizeof(T));
       int grid = batch * head_number * seq_len;
       int block = round_up(seq_len);
       broadcast<<<grid, block, 0, stream>>>(
@@ -297,7 +298,8 @@ class MultiHeadMatMulV2Kernel : public framework::OpKernel<T> {
     if (!bias_qk) {
       int size = batch * head_number * seq_len * seq_len;
       temp_bias_tensor.Resize({size});
-      auto *temp_qk_bias = temp_bias_tensor.mutable_data<T>(context.GetPlace());
+      auto *temp_qk_bias = device_ctx.template Alloc<T>(
+          &temp_bias_tensor, temp_bias_tensor.numel() * sizeof(T));
 #ifdef PADDLE_WITH_HIP
       hipMemset(temp_qk_bias, 0, sizeof(float) * size);
 #else
@@ -310,7 +312,8 @@ class MultiHeadMatMulV2Kernel : public framework::OpKernel<T> {
     auto *out = context.Output<framework::Tensor>("Out");
     out->Resize({batch, seq_len, all_head_size});
-    auto *output_d = out->mutable_data<T>(context.GetPlace());
+    auto *output_d =
+        device_ctx.template Alloc<T>(out, out->numel() * sizeof(T));
 
     // (B*S, hidden)
     const Tensor input_matrix =
@@ -324,7 +327,8 @@ class MultiHeadMatMulV2Kernel : public framework::OpKernel<T> {
         phi::make_ddim({batch, seq_len, 3, head_number, head_size});
     temp_out_tensor.Resize(
         {batch * seq_len, phi::product(temp_out_dims) / (batch * seq_len)});
-    auto *temp_out_data = temp_out_tensor.mutable_data<T>(context.GetPlace());
+    auto *temp_out_data = device_ctx.template Alloc<T>(
+        &temp_out_tensor, temp_out_tensor.numel() * sizeof(T));
 
     // (B * S, hidden) * (hidden, 3 * N * H) -> (B * S * 3 * N * H)
     auto blas = phi::funcs::GetBlas<phi::GPUContext, T>(device_ctx);
@@ -336,8 +340,9 @@ class MultiHeadMatMulV2Kernel : public framework::OpKernel<T> {
     // B * head_number * S * S * 1 + B * S * 3 * N * H
     int scratch_size = batch * head_number * seq_len * seq_len * 1;
     multihead_temp_tensor.Resize({scratch_size + temp_out_tensor.numel()});
-    auto *multihead_temp_data =
-        multihead_temp_tensor.mutable_data<T>(context.GetPlace());
+    auto *multihead_temp_data = device_ctx.template Alloc<T>(
+        &multihead_temp_tensor, multihead_temp_tensor.numel() * sizeof(T));
+
     auto *qkptr = multihead_temp_data;
     auto *tptr = multihead_temp_data + scratch_size;
diff --git a/paddle/fluid/operators/matmul_op.cc b/paddle/fluid/operators/matmul_op.cc
index 80018ddb1c9c2..a31c218307b9c 100644
--- a/paddle/fluid/operators/matmul_op.cc
+++ b/paddle/fluid/operators/matmul_op.cc
@@ -65,7 +65,9 @@ class MatMulKernel : public framework::OpKernel<T> {
     auto &y = GET_DATA_SAFELY(
         context.Input<framework::Tensor>("Y"), "Input", "Y", "MatMul");
     auto *out = context.Output<framework::Tensor>("Out");
-    out->mutable_data<T>(context.GetPlace());
+
+    auto &dev_ctx = context.template device_context<DeviceContext>();
+    dev_ctx.template Alloc<T>(out, out->numel() * sizeof(T));
 
     auto blas = phi::funcs::GetBlas<DeviceContext, T>(context);
     auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(
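
Every hunk above applies the same call-site change: instead of asking the tensor to allocate its own storage via mutable_data<T>(place), the kernel asks the device context to allocate it via dev_ctx.template Alloc<T>(tensor, size_in_bytes). The snippet below is a minimal, self-contained C++ sketch of that shape only; ToyTensor and ToyDeviceContext are invented stand-ins for illustration, not Paddle's framework::Tensor or phi device contexts, and the real Alloc signature should be read from the diff itself.

// Stand-alone sketch of the allocation-API migration shown above.
// ToyTensor / ToyDeviceContext are hypothetical placeholders.
#include <cstddef>
#include <iostream>
#include <vector>

struct ToyTensor {
  std::vector<char> buffer;   // backing storage
  std::size_t numel_ = 0;     // number of elements
  std::size_t numel() const { return numel_; }

  // Old style: the tensor allocates for itself (like mutable_data<T>(place)).
  template <typename T>
  T* mutable_data() {
    buffer.resize(numel_ * sizeof(T));
    return reinterpret_cast<T*>(buffer.data());
  }
};

struct ToyDeviceContext {
  // New style: the device context performs the allocation, mirroring
  // dev_ctx.template Alloc<T>(&tensor, size_in_bytes) in the diff.
  template <typename T>
  T* Alloc(ToyTensor* t, std::size_t size_in_bytes) {
    t->buffer.resize(size_in_bytes);
    return reinterpret_cast<T*>(t->buffer.data());
  }
};

int main() {
  ToyTensor out;
  out.numel_ = 8;

  // Before: T* out_data = out->mutable_data<T>(ctx.GetPlace());
  float* old_style = out.mutable_data<float>();

  // After:  auto* out_data = dev_ctx.template Alloc<T>(out, out->numel() * sizeof(T));
  ToyDeviceContext dev_ctx;
  float* new_style = dev_ctx.Alloc<float>(&out, out.numel() * sizeof(float));

  std::cout << (old_style != nullptr) << " " << (new_style != nullptr) << "\n";
  return 0;
}

The practical difference is only who owns the allocation decision: routing it through the device context lets the framework pick the right allocator (stream-safe, device-specific) for the tensor, which is why the diff also passes the explicit byte count computed from numel().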