
Commit a14bb2f

Optimizing the performance of fused_layer_norm and top_p_sampling operators (#65711)
* optim fused_layer_norm and top_p_sampling
* update
* update
* update
* support hip
* fix comment
* update
1 parent 49772bc commit a14bb2f

3 files changed: +188 −410 lines changed


paddle/phi/kernels/fusion/gpu/blha_get_max_len.cu

Lines changed: 4 additions & 1 deletion
@@ -65,4 +65,7 @@ PD_REGISTER_KERNEL(blha_get_max_len,
                    ALL_LAYOUT,
                    phi::fusion::BlhaGetMaxLenKernel,
                    int,
-                   int64_t) {}
+                   int64_t) {
+  kernel->OutputAt(0).SetBackend(phi::Backend::CPU);
+  kernel->OutputAt(1).SetBackend(phi::Backend::CPU);
+}
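
The new registration body pins both outputs of blha_get_max_len to the CPU backend. A plausible reading, not stated in the diff itself: the max sequence lengths are small scalars that host-side launch logic needs to branch on, so materializing them on the CPU spares each consumer its own blocking device-to-host read-back. A minimal standalone CUDA sketch of that round trip (hypothetical names, not Paddle code):

#include <cstdio>
#include <cuda_runtime.h>

// Hypothetical illustration: a kernel that writes a scalar result to
// device memory. Single-thread max for brevity; real kernels reduce in
// parallel.
__global__ void max_len_kernel(const int* seq_lens, int n, int* d_max) {
  if (blockIdx.x == 0 && threadIdx.x == 0) {
    int m = 0;
    for (int i = 0; i < n; ++i) m = max(m, seq_lens[i]);
    *d_max = m;
  }
}

int main() {
  const int n = 4;
  int h_lens[n] = {3, 7, 5, 1};
  int *d_lens, *d_max;
  cudaMalloc(&d_lens, n * sizeof(int));
  cudaMalloc(&d_max, sizeof(int));
  cudaMemcpy(d_lens, h_lens, n * sizeof(int), cudaMemcpyHostToDevice);
  max_len_kernel<<<1, 1>>>(d_lens, n, d_max);
  // With a GPU-resident output, every host-side consumer needs an explicit
  // device-to-host copy, which also blocks until the kernel finishes:
  int h_max = 0;
  cudaMemcpy(&h_max, d_max, sizeof(int), cudaMemcpyDeviceToHost);
  printf("max len = %d\n", h_max);
  cudaFree(d_lens);
  cudaFree(d_max);
  return 0;
}

Registering the outputs on the CPU backend instead lets the framework hand callers host memory directly, so this copy is not repeated by each downstream use.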

paddle/phi/kernels/fusion/gpu/fused_layernorm_kernel.cu

Lines changed: 1 addition & 1 deletion
@@ -537,7 +537,7 @@ inline GPU(Error_t)
   // Note(Zhengzekang): We choose a fixed blocksize to avoid layernorm diff, by
   // RichardWooSJTU.
 
-  constexpr int block_size_conf_1 = 128;
+  constexpr int block_size_conf_1 = 512;
 
   int dev = 0;
   {
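
Raising block_size_conf_1 from 128 to 512 gives each block four times as many threads for the reduction, which can improve throughput on wide rows. The retained Note also explains why the value stays fixed rather than tuned per shape: the operand pairing in a block reduction depends on the block size, so a different configuration can yield slightly different (though equally valid) floating-point results. A standalone CUDA sketch of a block-size-dependent reduction launched with one fixed configuration (hypothetical names, not the Paddle kernel):

#include <cstdio>
#include <cuda_runtime.h>

template <int kBlockSize>
__global__ void block_sum(const float* x, int n, float* out) {
  __shared__ float smem[kBlockSize];
  float acc = 0.f;
  // Grid-stride loop: each thread accumulates a private partial sum.
  for (int i = blockIdx.x * kBlockSize + threadIdx.x; i < n;
       i += gridDim.x * kBlockSize) {
    acc += x[i];
  }
  smem[threadIdx.x] = acc;
  __syncthreads();
  // Shared-memory tree reduction; the pairing of operands (and therefore
  // the floating-point rounding) changes with kBlockSize.
  for (int s = kBlockSize / 2; s > 0; s >>= 1) {
    if (threadIdx.x < s) smem[threadIdx.x] += smem[threadIdx.x + s];
    __syncthreads();
  }
  if (threadIdx.x == 0) atomicAdd(out, smem[0]);
}

int main() {
  const int n = 1 << 20;
  float *d_x, *d_out;
  cudaMalloc(&d_x, n * sizeof(float));
  cudaMalloc(&d_out, sizeof(float));
  cudaMemset(d_x, 0, n * sizeof(float));
  cudaMemset(d_out, 0, sizeof(float));
  // Fixed launch configuration, mirroring block_size_conf_1 = 512.
  block_sum<512><<<(n + 511) / 512, 512>>>(d_x, n, d_out);
  cudaDeviceSynchronize();
  float h_out = 0.f;
  cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
  printf("sum = %f\n", h_out);
  cudaFree(d_x);
  cudaFree(d_out);
  return 0;
}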
