Optimizing the performance of fused_layer_norm and top_p_sampling operators (PaddlePaddle#65711)

* optim fused_layer_norm and top_p_sampling

* update

* update

* update

* support hip

* fix comment

* update
yuanlehome authored Jul 5, 2024
1 parent 49772bc commit a14bb2f
Showing 3 changed files with 188 additions and 410 deletions.
5 changes: 4 additions & 1 deletion paddle/phi/kernels/fusion/gpu/blha_get_max_len.cu
@@ -65,4 +65,7 @@ PD_REGISTER_KERNEL(blha_get_max_len,
                    ALL_LAYOUT,
                    phi::fusion::BlhaGetMaxLenKernel,
                    int,
-                   int64_t) {}
+                   int64_t) {
+  kernel->OutputAt(0).SetBackend(phi::Backend::CPU);
+  kernel->OutputAt(1).SetBackend(phi::Backend::CPU);
+}
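This registration change pins both outputs of blha_get_max_len to the CPU backend, so the max-length scalars land in host memory where downstream host-side logic (e.g. picking launch configurations) can read them without issuing its own device-to-host copy and sync at every use site. Below is a minimal standalone sketch of that pattern; it is an illustration under assumed semantics, not PaddlePaddle's actual code, and the names MaxLenKernel and h_max are hypothetical.

// max_len_sketch.cu -- reduce GPU-resident sequence lengths to a single max
// and copy that scalar to the host once, mirroring the effect of declaring
// the kernel's outputs as phi::Backend::CPU.
#include <cstdio>
#include <cuda_runtime.h>

// Block-wide max reduction over the sequence-length array.
__global__ void MaxLenKernel(const int* seq_lens, int n, int* out) {
  __shared__ int smem[256];
  int tid = threadIdx.x;
  int v = 0;
  for (int i = tid; i < n; i += blockDim.x) v = max(v, seq_lens[i]);
  smem[tid] = v;
  __syncthreads();
  for (int s = blockDim.x / 2; s > 0; s >>= 1) {
    if (tid < s) smem[tid] = max(smem[tid], smem[tid + s]);
    __syncthreads();
  }
  if (tid == 0) *out = smem[0];
}

int main() {
  const int n = 4;
  int h_lens[n] = {7, 19, 3, 11};
  int *d_lens, *d_max;
  cudaMalloc(&d_lens, n * sizeof(int));
  cudaMalloc(&d_max, sizeof(int));
  cudaMemcpy(d_lens, h_lens, n * sizeof(int), cudaMemcpyHostToDevice);
  MaxLenKernel<<<1, 256>>>(d_lens, n, d_max);
  // The one unavoidable sync: fetch the scalar to the host here, once,
  // rather than in every consumer that needs max_len.
  int h_max = 0;
  cudaMemcpy(&h_max, d_max, sizeof(int), cudaMemcpyDeviceToHost);
  printf("max_len = %d\n", h_max);
  cudaFree(d_lens);
  cudaFree(d_max);
  return 0;
}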
2 changes: 1 addition & 1 deletion paddle/phi/kernels/fusion/gpu/fused_layernorm_kernel.cu
@@ -537,7 +537,7 @@ inline GPU(Error_t)
   // Note(Zhengzekang): We choose a fixed blocksize to avoid layernorm diff, by
   // RichardWooSJTU.
 
-  constexpr int block_size_conf_1 = 128;
+  constexpr int block_size_conf_1 = 512;
 
   int dev = 0;
   {
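This one-line tuning raises the first fixed block-size configuration of the fused LayerNorm kernel from 128 to 512 threads. Keeping the block size fixed keeps the shape of the block-wide reduction tree, and hence the floating-point summation order, stable across input shapes (the "layernorm diff" the comment refers to), while a larger block gives each row more threads for its strided loads and reduction. The self-contained sketch below shows this row-per-block pattern with a compile-time block size; LayerNormRow and kBlockSize are illustrative names, not Paddle's implementation.

// layernorm_block_sketch.cu -- simplified row-per-block LayerNorm with a
// fixed, compile-time block size, illustrating why the constant matters.
#include <cstdio>
#include <cuda_runtime.h>

constexpr int kBlockSize = 512;  // mirrors block_size_conf_1 after this change

__global__ void LayerNormRow(const float* x, float* y, int cols, float eps) {
  __shared__ float smem[kBlockSize];
  const float* row = x + blockIdx.x * cols;
  float* out = y + blockIdx.x * cols;

  // Pass 1: block-wide sum for the mean; the reduction tree depends only on
  // kBlockSize, so the summation order is identical for every input shape.
  float sum = 0.f;
  for (int i = threadIdx.x; i < cols; i += kBlockSize) sum += row[i];
  smem[threadIdx.x] = sum;
  __syncthreads();
  for (int s = kBlockSize / 2; s > 0; s >>= 1) {
    if (threadIdx.x < s) smem[threadIdx.x] += smem[threadIdx.x + s];
    __syncthreads();
  }
  float mean = smem[0] / cols;
  __syncthreads();  // all threads read smem[0] before it is overwritten

  // Pass 2: block-wide sum of squared deviations for the variance.
  float sq = 0.f;
  for (int i = threadIdx.x; i < cols; i += kBlockSize) {
    float d = row[i] - mean;
    sq += d * d;
  }
  smem[threadIdx.x] = sq;
  __syncthreads();
  for (int s = kBlockSize / 2; s > 0; s >>= 1) {
    if (threadIdx.x < s) smem[threadIdx.x] += smem[threadIdx.x + s];
    __syncthreads();
  }
  float inv_std = rsqrtf(smem[0] / cols + eps);

  // Pass 3: normalize the row.
  for (int i = threadIdx.x; i < cols; i += kBlockSize)
    out[i] = (row[i] - mean) * inv_std;
}

int main() {
  const int rows = 2, cols = 8;
  float h_x[rows * cols], h_y[rows * cols];
  for (int i = 0; i < rows * cols; ++i) h_x[i] = float(i % cols);
  float *d_x, *d_y;
  cudaMalloc(&d_x, sizeof(h_x));
  cudaMalloc(&d_y, sizeof(h_y));
  cudaMemcpy(d_x, h_x, sizeof(h_x), cudaMemcpyHostToDevice);
  LayerNormRow<<<rows, kBlockSize>>>(d_x, d_y, cols, 1e-5f);  // one block per row
  cudaMemcpy(h_y, d_y, sizeof(h_y), cudaMemcpyDeviceToHost);
  printf("row 0: %.3f %.3f ... %.3f\n", h_y[0], h_y[1], h_y[cols - 1]);
  cudaFree(d_x);
  cudaFree(d_y);
  return 0;
}

The likely motivation for the larger constant, though the commit message does not spell it out, is that a 512-thread block keeps four times as many loads in flight per row for the large hidden sizes typical of the fused transformer path, at the cost of somewhat higher shared-memory and scheduling pressure per block.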
