-
Couldn't load subscription status.
- Fork 5.9k
Fix xpu2 kp compile error #53548
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fix xpu2 kp compile error #53548
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -110,10 +110,29 @@ struct BroadcastDataLoader { | |
| const Array3 &use_broadcast, | ||
| const int block_offset, | ||
| const int num, | ||
| const uint32_t numel) { | ||
| const uint32_t numel, | ||
| int read_lens) { | ||
| using Type = std::tuple_element_t<Index, ArgsT>; | ||
| #ifdef PADDLE_WITH_XPU_KP | ||
| kps::Init<Type, ArgsT, Index, VecSize>( | ||
| args, static_cast<Type>(1.0f), read_lens); | ||
| if (use_broadcast[Index]) { | ||
| kps::ReadDataBc<Type, VecSize, 1, ArgsT, Index, IsBoundary>( | ||
| args, | ||
| reinterpret_cast<const _ptr_ Type *>(ins[Index]), | ||
| block_offset, | ||
| configs[Index], | ||
| numel, | ||
| read_lens); | ||
| } else { | ||
| kps::ReadData<Type, VecSize, 1, ArgsT, Index, IsBoundary>( | ||
| args, | ||
| reinterpret_cast<const _ptr_ Type *>(ins[Index]) + block_offset, | ||
| num, | ||
| read_lens); | ||
| } | ||
| #else | ||
| kps::Init<Type, ArgsT, Index, VecSize>(args, static_cast<Type>(1.0f)); | ||
|
|
||
| if (use_broadcast[Index]) { | ||
| kps::ReadDataBc<Type, VecSize, 1, ArgsT, Index, IsBoundary>( | ||
| args, | ||
|
|
@@ -133,6 +152,7 @@ struct BroadcastDataLoader { | |
| num, | ||
| VecSize); | ||
| } | ||
| #endif | ||
| } | ||
| }; | ||
|
|
||
|
|
@@ -148,7 +168,8 @@ struct BroadcastDataLoader<Index, VecSize, true, kElementwise> { | |
| const Array3 &use_broadcast, | ||
| const int block_offset, | ||
| const int num, | ||
| const uint32_t numel) { | ||
| const uint32_t numel, | ||
| int read_lens) { | ||
| using Type = std::tuple_element_t<Index, ArgsT>; | ||
| int thread_offset = threadIdx.x * VecSize + block_offset; | ||
| #pragma unroll | ||
|
|
@@ -173,7 +194,8 @@ struct BroadcastDataLoader<Index, VecSize, false, kElementwise> { | |
| const Array3 &use_broadcast, | ||
| const int block_offset, | ||
| const int num, | ||
| const uint32_t numel) { | ||
| const uint32_t numel, | ||
| int read_lens) { | ||
| using Type = std::tuple_element_t<Index, ArgsT>; | ||
| using VecType = phi::kps::details::VectorType<Type, VecSize>; | ||
| VecType vec_temp; | ||
|
|
@@ -269,6 +291,10 @@ __device__ void VectorizedBroadcastKernelImpl( | |
| __simd__ ArgsT args[VecSize]; | ||
| __simd__ ConditionalT<OutT, NumOuts> result[VecSize]; | ||
|
|
||
| #ifdef PADDLE_WITH_XPU_KP | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 这里为什么要单独区分kp There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. XPUKP 在broadcast的功能与GPU是一样的呀 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 这里之前铭书对GPU的broadcast进行了特化优化(减少了其中重复的fast_divmod计算),这里为了保持其优化效果,就需要单独拿出来 |
||
| BcUnroller<BroadcastDataLoader, IsBoundary, LoadType, VecSize, Arity>::step( | ||
| ins, args, configs, use_broadcast, block_offset, num, numel, read_lens); | ||
| #else | ||
| if (LoadType == kBroadcast) { | ||
| uint32_t index_bc[Arity][VecSize] = {0}; | ||
| Unroller<BroadcastDataInit, VecSize, Arity>::step(args); | ||
|
|
@@ -291,9 +317,9 @@ __device__ void VectorizedBroadcastKernelImpl( | |
| Unroller<BroadcastDataSetter, VecSize, Arity>::step(ins, args, index_bc); | ||
| } else { | ||
| BcUnroller<BroadcastDataLoader, IsBoundary, LoadType, VecSize, Arity>::step( | ||
| ins, args, configs, use_broadcast, block_offset, num, numel); | ||
| ins, args, configs, use_broadcast, block_offset, num, numel, read_lens); | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. read_lens是给XPU KP 使用的,此处代码已经被else包含为什么还要添加read_lens There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 上一个 comment 中,gpu部分做了特化,但 kp和 gpu 使用的是相同的非特化函数,参数就需要保持一致了 |
||
| } | ||
|
|
||
| #endif | ||
| SameDimsElementwisePrimitiveCaller<ConditionalT<OutT, NumOuts>, | ||
| VecSize, | ||
| Functor, | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
89行的vec_size只与out有关吗? 看你修改前的代码是与in/out同时有关的,不确定这里会不会隐藏性能问题
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
参考elementwise也是只取了out的vec_size
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
elementwise是因为dim是相同的,而broadcast 输入输出的dim可能是不同的……
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
vec_size 原本是取 min( in out 4), 现在是取min( out 4),那应该值是>=之前的值,所以应该不会造成性能下降,有其他原因考虑需要加上吗