fix multi stream error. (#45196)
jiweibo authored Aug 17, 2022
1 parent f59c666 commit a79d4a7
Showing 6 changed files with 41 additions and 20 deletions.
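The change that repeats across the operator kernels below is the allocation call: output and workspace buffers are no longer allocated with `mutable_data<T>(place)` but are requested through the kernel's device context. A minimal before/after sketch of the two idioms, condensed from the hunks below (variable names are illustrative; the pattern, not any single file, is what is shown):

```cpp
// Old idiom (removed in this commit): allocate by place only.
T* out_data = out->mutable_data<T>(ctx.GetPlace());

// New idiom (added in this commit): request the memory from the kernel's
// device context, passing the byte size explicitly. Presumably this ties
// the allocation to that context's CUDA stream, which is what matters when
// several streams run concurrently (hence "fix multi stream error").
auto& dev_ctx = ctx.template device_context<DeviceContext>();
T* out_data = dev_ctx.template Alloc<T>(out, out->numel() * sizeof(T));
```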
6 changes: 4 additions & 2 deletions paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h"
+#include <thread>
 
 #include "paddle/fluid/platform/profiler/event_tracing.h"
 
@@ -194,8 +195,9 @@ phi::Allocation* StreamSafeCUDAAllocator::AllocateImpl(size_t size) {
       static_unique_ptr_cast<Allocation>(std::move(underlying_allocation)),
       default_stream_,
       this);
-  VLOG(8) << "Allocate " << allocation->size() << " bytes at address "
-          << allocation->ptr() << " , stream: " << default_stream_;
+  VLOG(8) << "Thread " << std::this_thread::get_id() << " Allocate "
+          << allocation->size() << " bytes at address " << allocation->ptr()
+          << " , stream: " << default_stream_;
   return allocation;
 }
 
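The logging change above adds the allocating thread's id next to the stream, which helps correlate allocations when several threads each drive their own stream. For readers unfamiliar with the underlying hazard, here is a self-contained toy using the plain CUDA runtime API (CUDA 11.2+ for `cudaMallocAsync`/`cudaFreeAsync`; this is illustrative only, not Paddle code and not part of this commit): memory obtained in stream-ordered fashion on one stream must not be touched from another stream without synchronization, which is the kind of ownership Paddle's StreamSafeCUDAAllocator tracks.

```cpp
// Toy illustration (not Paddle code): an allocation ordered on stream s0
// must be made visible to stream s1 before s1 may touch the buffer.
#include <cuda_runtime.h>
#include <cstdio>

int main() {
  cudaStream_t s0, s1;
  cudaStreamCreate(&s0);
  cudaStreamCreate(&s1);

  float* buf = nullptr;
  // The allocation is ordered only with respect to s0.
  cudaMallocAsync(reinterpret_cast<void**>(&buf), 1024 * sizeof(float), s0);

  // Make s1 wait until the allocation (and prior work on s0) has completed.
  cudaEvent_t ready;
  cudaEventCreate(&ready);
  cudaEventRecord(ready, s0);
  cudaStreamWaitEvent(s1, ready, 0);

  // Now it is safe for s1 to use the buffer.
  cudaMemsetAsync(buf, 0, 1024 * sizeof(float), s1);

  // Free in stream order as well, then drain the stream.
  cudaFreeAsync(buf, s1);
  cudaStreamSynchronize(s1);

  cudaEventDestroy(ready);
  cudaStreamDestroy(s0);
  cudaStreamDestroy(s1);
  printf("done\n");
  return 0;
}
```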
6 changes: 4 additions & 2 deletions paddle/fluid/operators/fc_op.h
@@ -69,6 +69,8 @@ class FCOpKernel : public framework::OpKernel<T> {
     auto w_dims = w->dims();
     bool padding_weights = ctx.Attr<bool>("padding_weights");
 
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+
     std::vector<int64_t> output_dims;
     FCOutputSize(
         input->dims(), w_dims, output_dims, in_num_col_dims, padding_weights);
@@ -82,9 +84,9 @@ class FCOpKernel : public framework::OpKernel<T> {
 
     const T* input_data = input->data<T>();
     const T* w_data = w->data<T>();
-    T* output_data = output->mutable_data<T>(ctx.GetPlace());
+    auto* output_data =
+        dev_ctx.template Alloc<T>(output, output->numel() * sizeof(T));
 
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
     phi::funcs::FCFunctor<DeviceContext, T> fc;
     fc(dev_ctx,
        M,
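Note that in fc_op.h the device-context reference now has to exist before the output buffer is created, which is why the `auto& dev_ctx = ...` declaration moves up from just before the FCFunctor call to right after the attribute reads.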
@@ -15,6 +15,7 @@
 #include <paddle/fluid/platform/device_context.h>
 
 #include <algorithm>
+#include <cstdint>
 #include <type_traits>
 
 #include "paddle/fluid/framework/convert_utils.h"
@@ -49,12 +50,16 @@ class EmbeddingEltWiseLayerNormKernel : public framework::OpKernel<T> {
 #else
     cudaGetDevice(&device_id);
 #endif
+
+    auto &dev_ctx = context.template device_context<phi::GPUContext>();
+
     in_ids_.Resize(in_dim);
     in_embs_.Resize(in_dim);
-    int64_t *in_ids_d =
-        in_ids_.mutable_data<int64_t>(platform::CUDAPlace(device_id));
-    int64_t *in_embs_d =
-        in_embs_.mutable_data<int64_t>(platform::CUDAPlace(device_id));
+
+    int64_t *in_ids_d = dev_ctx.template Alloc<int64_t>(
+        &in_ids_, in_ids_.numel() * sizeof(int64_t));
+    int64_t *in_embs_d = dev_ctx.template Alloc<int64_t>(
+        &in_embs_, in_embs_.numel() * sizeof(int64_t));
 
     std::vector<int64_t> in1s, in2s;
     for (int i = 0; i < input_num; ++i) {
@@ -99,7 +104,8 @@ class EmbeddingEltWiseLayerNormKernel : public framework::OpKernel<T> {
 
     auto *bias_d = bias->data<T>();
     auto *scale_d = scale->data<T>();
-    auto *output_d = out->mutable_data<T>(context.GetPlace());
+    auto *output_d = dev_ctx.template Alloc<T>(out, out->numel() * sizeof(T));
+
     float eps = context.Attr<float>("epsilon");
 
     if (std::is_same<T, paddle::platform::float16>::value) {
@@ -395,9 +395,10 @@ class FusedFCElementwiseLayerNormOpKernel : public framework::OpKernel<T> {
 
     const T* x_data = x->data<T>();
     const T* w_data = w->data<T>();
-    T* out_data = out->mutable_data<T>(ctx.GetPlace());
 
     auto& dev_ctx = ctx.template device_context<phi::GPUContext>();
+    auto* out_data = dev_ctx.template Alloc<T>(out, out->numel() * sizeof(T));
+
     auto blas = phi::funcs::GetBlas<phi::GPUContext, T>(dev_ctx);
     blas.GEMM(false,
               false,
@@ -425,9 +426,12 @@ class FusedFCElementwiseLayerNormOpKernel : public framework::OpKernel<T> {
     auto* mean = ctx.Output<framework::Tensor>("Mean");
     auto* variance = ctx.Output<framework::Tensor>("Variance");
 
-    T* mean_data = mean ? mean->mutable_data<T>(ctx.GetPlace()) : nullptr;
-    T* variance_data =
-        variance ? variance->mutable_data<T>(ctx.GetPlace()) : nullptr;
+    T* mean_data =
+        mean ? dev_ctx.template Alloc<T>(mean, mean->numel() * sizeof(T))
+             : nullptr;
+    T* variance_data = variance ? dev_ctx.template Alloc<T>(
+                                      variance, variance->numel() * sizeof(T))
+                                : nullptr;
 
     bool with_relu =
         (ctx.Attr<std::string>("activation_type") == "relu") ? true : false;
17 changes: 11 additions & 6 deletions paddle/fluid/operators/fused/multihead_matmul_op.cu
@@ -287,7 +287,8 @@ class MultiHeadMatMulV2Kernel : public framework::OpKernel<T> {
     // if bias_qk is[batch, 1, 1, seq_len], the bias_qk_d need to be broadcasted
     if (bias_qk && bias_qk->numel() == (batch * seq_len)) {
       temp_bias_tensor.Resize({batch * head_number * seq_len * seq_len});
-      auto *temp_qk_bias = temp_bias_tensor.mutable_data<T>(context.GetPlace());
+      auto *temp_qk_bias = device_ctx.template Alloc<T>(
+          &temp_bias_tensor, temp_bias_tensor.numel() * sizeof(T));
       int grid = batch * head_number * seq_len;
       int block = round_up(seq_len);
       broadcast<<<grid, block, 0, stream>>>(
@@ -297,7 +298,8 @@ class MultiHeadMatMulV2Kernel : public framework::OpKernel<T> {
     if (!bias_qk) {
       int size = batch * head_number * seq_len * seq_len;
       temp_bias_tensor.Resize({size});
-      auto *temp_qk_bias = temp_bias_tensor.mutable_data<T>(context.GetPlace());
+      auto *temp_qk_bias = device_ctx.template Alloc<T>(
+          &temp_bias_tensor, temp_bias_tensor.numel() * sizeof(T));
 #ifdef PADDLE_WITH_HIP
       hipMemset(temp_qk_bias, 0, sizeof(float) * size);
 #else
@@ -310,7 +312,8 @@ class MultiHeadMatMulV2Kernel : public framework::OpKernel<T> {
 
     auto *out = context.Output<framework::Tensor>("Out");
     out->Resize({batch, seq_len, all_head_size});
-    auto *output_d = out->mutable_data<T>(context.GetPlace());
+    auto *output_d =
+        device_ctx.template Alloc<T>(out, out->numel() * sizeof(T));
 
     // (B*S, hidden)
     const Tensor input_matrix =
@@ -324,7 +327,8 @@ class MultiHeadMatMulV2Kernel : public framework::OpKernel<T> {
         phi::make_ddim({batch, seq_len, 3, head_number, head_size});
     temp_out_tensor.Resize(
         {batch * seq_len, phi::product(temp_out_dims) / (batch * seq_len)});
-    auto *temp_out_data = temp_out_tensor.mutable_data<T>(context.GetPlace());
+    auto *temp_out_data = device_ctx.template Alloc<T>(
+        &temp_out_tensor, temp_out_tensor.numel() * sizeof(T));
 
     // (B * S, hidden) * (hidden, 3 * N * H) -> (B * S * 3 * N * H)
     auto blas = phi::funcs::GetBlas<phi::GPUContext, T>(device_ctx);
@@ -336,8 +340,9 @@ class MultiHeadMatMulV2Kernel : public framework::OpKernel<T> {
     // B * head_number * S * S * 1 + B * S * 3 * N * H
     int scratch_size = batch * head_number * seq_len * seq_len * 1;
     multihead_temp_tensor.Resize({scratch_size + temp_out_tensor.numel()});
-    auto *multihead_temp_data =
-        multihead_temp_tensor.mutable_data<T>(context.GetPlace());
+    auto *multihead_temp_data = device_ctx.template Alloc<T>(
+        &multihead_temp_tensor, multihead_temp_tensor.numel() * sizeof(T));
+
     auto *qkptr = multihead_temp_data;
     auto *tptr = multihead_temp_data + scratch_size;
 
4 changes: 3 additions & 1 deletion paddle/fluid/operators/matmul_op.cc
@@ -65,7 +65,9 @@ class MatMulKernel : public framework::OpKernel<T> {
     auto &y = GET_DATA_SAFELY(
         context.Input<framework::Tensor>("Y"), "Input", "Y", "MatMul");
     auto *out = context.Output<framework::Tensor>("Out");
-    out->mutable_data<T>(context.GetPlace());
+
+    auto &dev_ctx = context.template device_context<DeviceContext>();
+    dev_ctx.template Alloc<T>(out, out->numel() * sizeof(T));
 
     auto blas = phi::funcs::GetBlas<DeviceContext, T>(context);
     auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(
