fix multi stream error. #45196

Merged · 3 commits · Aug 17, 2022
6 changes: 4 additions & 2 deletions paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc
@@ -13,6 +13,7 @@
// limitations under the License.

#include "paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h"
#include <thread>

#include "paddle/fluid/platform/profiler/event_tracing.h"

@@ -194,8 +195,9 @@ phi::Allocation* StreamSafeCUDAAllocator::AllocateImpl(size_t size) {
static_unique_ptr_cast<Allocation>(std::move(underlying_allocation)),
default_stream_,
this);
VLOG(8) << "Allocate " << allocation->size() << " bytes at address "
<< allocation->ptr() << " , stream: " << default_stream_;
VLOG(8) << "Thread " << std::this_thread::get_id() << " Allocate "
<< allocation->size() << " bytes at address " << allocation->ptr()
<< " , stream: " << default_stream_;
return allocation;
}

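An aside on the VLOG change above: the only functional difference is that the message now carries std::this_thread::get_id(), so interleaved allocations from different threads (and hence different streams) can be told apart in the log. A standalone C++ sketch of that idea — LogAllocation and stream_id are made-up names for illustration, not Paddle code:

#include <cstddef>
#include <iostream>
#include <sstream>
#include <thread>

// Builds the same kind of message as the new VLOG(8) line: size, address,
// stream, all tagged with the id of the calling thread.
void LogAllocation(std::size_t size, const void* ptr, int stream_id) {
  std::ostringstream oss;
  oss << "Thread " << std::this_thread::get_id() << " Allocate " << size
      << " bytes at address " << ptr << " , stream: " << stream_id;
  std::cout << oss.str() << '\n';
}

int main() {
  int dummy = 0;
  std::thread worker([&] { LogAllocation(256, &dummy, 1); });
  LogAllocation(128, &dummy, 0);  // main thread logs with its own id
  worker.join();
  return 0;
}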
6 changes: 4 additions & 2 deletions paddle/fluid/operators/fc_op.h
@@ -69,6 +69,8 @@ class FCOpKernel : public framework::OpKernel<T> {
auto w_dims = w->dims();
bool padding_weights = ctx.Attr<bool>("padding_weights");

auto& dev_ctx = ctx.template device_context<DeviceContext>();

std::vector<int64_t> output_dims;
FCOutputSize(
input->dims(), w_dims, output_dims, in_num_col_dims, padding_weights);
@@ -82,9 +84,9 @@ class FCOpKernel : public framework::OpKernel<T> {

const T* input_data = input->data<T>();
const T* w_data = w->data<T>();
T* output_data = output->mutable_data<T>(ctx.GetPlace());
auto* output_data =
dev_ctx.template Alloc<T>(output, output->numel() * sizeof(T));

auto& dev_ctx = ctx.template device_context<DeviceContext>();
phi::funcs::FCFunctor<DeviceContext, T> fc;
fc(dev_ctx,
M,
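The fc_op.h hunk shows the pattern repeated throughout this PR: T* output_data = output->mutable_data<T>(ctx.GetPlace()); becomes dev_ctx.template Alloc<T>(output, output->numel() * sizeof(T)); — the device context performs the allocation (and can therefore pick the correct stream), and the requested size is passed explicitly in bytes. A minimal standalone mock of that call shape — DeviceCtxLike and TensorLike are invented stand-ins, not Paddle's real classes:

#include <cstddef>
#include <cstdint>
#include <vector>

struct TensorLike {
  std::vector<std::byte> storage;  // stand-in for device memory
  std::int64_t numel_ = 0;
  std::int64_t numel() const { return numel_; }
};

struct DeviceCtxLike {
  // Same call shape as in the diff: the context receives the tensor and a
  // size in bytes, and decides where (and on which stream) the buffer lives.
  template <typename T>
  T* Alloc(TensorLike* t, std::size_t requested_bytes) {
    t->storage.resize(requested_bytes);
    return reinterpret_cast<T*>(t->storage.data());
  }
};

int main() {
  DeviceCtxLike dev_ctx;
  TensorLike output;
  output.numel_ = 16;
  // Mirrors: auto* output_data = dev_ctx.template Alloc<T>(output, output->numel() * sizeof(T));
  float* output_data = dev_ctx.Alloc<float>(&output, output.numel() * sizeof(float));
  for (std::int64_t i = 0; i < output.numel(); ++i) output_data[i] = 0.f;
  return 0;
}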
@@ -15,6 +15,7 @@
#include <paddle/fluid/platform/device_context.h>

#include <algorithm>
#include <cstdint>
#include <type_traits>

#include "paddle/fluid/framework/convert_utils.h"
@@ -49,12 +50,16 @@ class EmbeddingEltWiseLayerNormKernel : public framework::OpKernel<T> {
#else
cudaGetDevice(&device_id);
#endif

auto &dev_ctx = context.template device_context<phi::GPUContext>();

in_ids_.Resize(in_dim);
in_embs_.Resize(in_dim);
int64_t *in_ids_d =
in_ids_.mutable_data<int64_t>(platform::CUDAPlace(device_id));
int64_t *in_embs_d =
in_embs_.mutable_data<int64_t>(platform::CUDAPlace(device_id));

int64_t *in_ids_d = dev_ctx.template Alloc<int64_t>(
&in_ids_, in_ids_.numel() * sizeof(int64_t));
int64_t *in_embs_d = dev_ctx.template Alloc<int64_t>(
&in_embs_, in_embs_.numel() * sizeof(int64_t));

std::vector<int64_t> in1s, in2s;
for (int i = 0; i < input_num; ++i) {
@@ -99,7 +104,8 @@ class EmbeddingEltWiseLayerNormKernel : public framework::OpKernel<T> {

auto *bias_d = bias->data<T>();
auto *scale_d = scale->data<T>();
auto *output_d = out->mutable_data<T>(context.GetPlace());
auto *output_d = dev_ctx.template Alloc<T>(out, out->numel() * sizeof(T));

float eps = context.Attr<float>("epsilon");

if (std::is_same<T, paddle::platform::float16>::value) {
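One detail worth noting in the hunks above: in_ids_ and in_embs_ are Resize()d before Alloc is called, because the byte count handed to Alloc is derived from numel(), which Resize establishes. A small standalone sketch of that arithmetic, with a made-up shape rather than the operator's real dims:

#include <cstdint>
#include <functional>
#include <numeric>
#include <vector>

// numel is the product of the dims set by Resize; the byte count passed to
// Alloc is numel * sizeof(T), so Resize has to happen first.
std::int64_t NumBytes(const std::vector<std::int64_t>& dims, std::size_t elem_size) {
  std::int64_t numel = std::accumulate(dims.begin(), dims.end(),
                                       std::int64_t{1}, std::multiplies<>());
  return numel * static_cast<std::int64_t>(elem_size);
}

int main() {
  // Mirrors in_ids_.Resize(in_dim) followed by Alloc<int64_t>(&in_ids_, ...):
  std::vector<std::int64_t> in_dim = {2, 128};                   // hypothetical shape
  std::int64_t bytes = NumBytes(in_dim, sizeof(std::int64_t));   // 2 * 128 * 8
  return bytes == 2048 ? 0 : 1;
}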
@@ -395,9 +395,10 @@ class FusedFCElementwiseLayerNormOpKernel : public framework::OpKernel<T> {

const T* x_data = x->data<T>();
const T* w_data = w->data<T>();
T* out_data = out->mutable_data<T>(ctx.GetPlace());

auto& dev_ctx = ctx.template device_context<phi::GPUContext>();
auto* out_data = dev_ctx.template Alloc<T>(out, out->numel() * sizeof(T));

auto blas = phi::funcs::GetBlas<phi::GPUContext, T>(dev_ctx);
blas.GEMM(false,
false,
@@ -425,9 +426,12 @@ class FusedFCElementwiseLayerNormOpKernel : public framework::OpKernel<T> {
auto* mean = ctx.Output<framework::Tensor>("Mean");
auto* variance = ctx.Output<framework::Tensor>("Variance");

T* mean_data = mean ? mean->mutable_data<T>(ctx.GetPlace()) : nullptr;
T* variance_data =
variance ? variance->mutable_data<T>(ctx.GetPlace()) : nullptr;
T* mean_data =
mean ? dev_ctx.template Alloc<T>(mean, mean->numel() * sizeof(T))
: nullptr;
T* variance_data = variance ? dev_ctx.template Alloc<T>(
variance, variance->numel() * sizeof(T))
: nullptr;

bool with_relu =
(ctx.Attr<std::string>("activation_type") == "relu") ? true : false;
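In the second hunk above, Mean and Variance are optional outputs, so the allocation stays behind a ternary: Alloc only runs when the output tensor was actually provided. A standalone sketch of that guard — TensorLike and AllocIfPresent are hypothetical names, and a std::vector stands in for device memory:

#include <cstddef>
#include <vector>

struct TensorLike {
  std::vector<float> data;   // stand-in for device memory
  std::size_t numel = 0;
};

// Mirrors the ternary in the diff: allocate only when the caller wired up
// the optional output, otherwise hand back nullptr.
float* AllocIfPresent(TensorLike* t) {
  if (t == nullptr) return nullptr;  // optional output not requested
  t->data.resize(t->numel);          // context Alloc stand-in
  return t->data.data();
}

int main() {
  TensorLike mean;
  mean.numel = 64;
  float* mean_data = AllocIfPresent(&mean);         // allocated
  float* variance_data = AllocIfPresent(nullptr);   // stays nullptr
  return (mean_data != nullptr && variance_data == nullptr) ? 0 : 1;
}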
17 changes: 11 additions & 6 deletions paddle/fluid/operators/fused/multihead_matmul_op.cu
@@ -287,7 +287,8 @@ class MultiHeadMatMulV2Kernel : public framework::OpKernel<T> {
// if bias_qk is[batch, 1, 1, seq_len], the bias_qk_d need to be broadcasted
if (bias_qk && bias_qk->numel() == (batch * seq_len)) {
temp_bias_tensor.Resize({batch * head_number * seq_len * seq_len});
auto *temp_qk_bias = temp_bias_tensor.mutable_data<T>(context.GetPlace());
auto *temp_qk_bias = device_ctx.template Alloc<T>(
&temp_bias_tensor, temp_bias_tensor.numel() * sizeof(T));
int grid = batch * head_number * seq_len;
int block = round_up(seq_len);
broadcast<<<grid, block, 0, stream>>>(
@@ -297,7 +298,8 @@ class MultiHeadMatMulV2Kernel : public framework::OpKernel<T> {
if (!bias_qk) {
int size = batch * head_number * seq_len * seq_len;
temp_bias_tensor.Resize({size});
auto *temp_qk_bias = temp_bias_tensor.mutable_data<T>(context.GetPlace());
auto *temp_qk_bias = device_ctx.template Alloc<T>(
&temp_bias_tensor, temp_bias_tensor.numel() * sizeof(T));
#ifdef PADDLE_WITH_HIP
hipMemset(temp_qk_bias, 0, sizeof(float) * size);
#else
@@ -310,7 +312,8 @@ class MultiHeadMatMulV2Kernel : public framework::OpKernel<T> {

auto *out = context.Output<framework::Tensor>("Out");
out->Resize({batch, seq_len, all_head_size});
auto *output_d = out->mutable_data<T>(context.GetPlace());
auto *output_d =
device_ctx.template Alloc<T>(out, out->numel() * sizeof(T));

// (B*S, hidden)
const Tensor input_matrix =
@@ -324,7 +327,8 @@ class MultiHeadMatMulV2Kernel : public framework::OpKernel<T> {
phi::make_ddim({batch, seq_len, 3, head_number, head_size});
temp_out_tensor.Resize(
{batch * seq_len, phi::product(temp_out_dims) / (batch * seq_len)});
auto *temp_out_data = temp_out_tensor.mutable_data<T>(context.GetPlace());
auto *temp_out_data = device_ctx.template Alloc<T>(
&temp_out_tensor, temp_out_tensor.numel() * sizeof(T));

// (B * S, hidden) * (hidden, 3 * N * H) -> (B * S * 3 * N * H)
auto blas = phi::funcs::GetBlas<phi::GPUContext, T>(device_ctx);
@@ -336,8 +340,9 @@ class MultiHeadMatMulV2Kernel : public framework::OpKernel<T> {
// B * head_number * S * S * 1 + B * S * 3 * N * H
int scratch_size = batch * head_number * seq_len * seq_len * 1;
multihead_temp_tensor.Resize({scratch_size + temp_out_tensor.numel()});
auto *multihead_temp_data =
multihead_temp_tensor.mutable_data<T>(context.GetPlace());
auto *multihead_temp_data = device_ctx.template Alloc<T>(
&multihead_temp_tensor, multihead_temp_tensor.numel() * sizeof(T));

auto *qkptr = multihead_temp_data;
auto *tptr = multihead_temp_data + scratch_size;

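multihead_temp_tensor in the last hunk above is one allocation carved into two logical regions: the first scratch_size elements sit behind qkptr and the remaining temp_out_tensor.numel() elements behind tptr = qkptr + scratch_size. A tiny standalone sketch of that partitioning, with made-up sizes standing in for the batch/head/sequence arithmetic:

#include <vector>

int main() {
  // Hypothetical sizes standing in for B * N * S * S and B * S * 3 * N * H.
  int scratch_size = 4 * 8 * 16 * 16;
  int temp_out_numel = 4 * 16 * 3 * 8 * 32;

  // One allocation, two logical regions, as with multihead_temp_tensor:
  std::vector<float> multihead_temp(scratch_size + temp_out_numel);
  float* qkptr = multihead_temp.data();                // first region
  float* tptr = multihead_temp.data() + scratch_size;  // second region starts after it
  return (tptr - qkptr) == scratch_size ? 0 : 1;
}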
4 changes: 3 additions & 1 deletion paddle/fluid/operators/matmul_op.cc
@@ -65,7 +65,9 @@ class MatMulKernel : public framework::OpKernel<T> {
auto &y = GET_DATA_SAFELY(
context.Input<framework::Tensor>("Y"), "Input", "Y", "MatMul");
auto *out = context.Output<framework::Tensor>("Out");
out->mutable_data<T>(context.GetPlace());

auto &dev_ctx = context.template device_context<DeviceContext>();
dev_ctx.template Alloc<T>(out, out->numel() * sizeof(T));

auto blas = phi::funcs::GetBlas<DeviceContext, T>(context);
auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(
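MatMulKernel (like FCOpKernel earlier) is templated on DeviceContext, so the same dev_ctx.template Alloc<T>(out, out->numel() * sizeof(T)) line compiles for CPU and GPU contexts alike; the .template keyword is required because dev_ctx has a dependent type inside the kernel template. A standalone mock of that shape — CpuCtxLike, GpuCtxLike, and TensorLike are invented names, not Paddle classes:

#include <cstddef>
#include <cstdint>
#include <vector>

struct TensorLike {
  std::vector<std::byte> buf;  // stand-in for device memory
  std::int64_t numel = 0;
};

struct CpuCtxLike {
  template <typename T>
  T* Alloc(TensorLike* t, std::size_t bytes) {
    t->buf.resize(bytes);
    return reinterpret_cast<T*>(t->buf.data());
  }
};

struct GpuCtxLike {
  template <typename T>
  T* Alloc(TensorLike* t, std::size_t bytes) {
    t->buf.resize(bytes);  // a real GPU context would allocate device memory here
    return reinterpret_cast<T*>(t->buf.data());
  }
};

template <typename DeviceCtx, typename T>
void RunMatMulLike(DeviceCtx& ctx, TensorLike* out) {
  // Dependent context type, hence ctx.template Alloc<T>(...).
  T* out_data = ctx.template Alloc<T>(out, out->numel * sizeof(T));
  for (std::int64_t i = 0; i < out->numel; ++i) out_data[i] = T(0);  // GEMM would write here
}

int main() {
  TensorLike out;
  out.numel = 6;
  CpuCtxLike cpu;
  GpuCtxLike gpu;
  RunMatMulLike<CpuCtxLike, float>(cpu, &out);
  RunMatMulLike<GpuCtxLike, float>(gpu, &out);
  return 0;
}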