@@ -151,21 +151,37 @@ static __global__ void soft_max_back_f32(
 }
 }
 
-template <int... Ns>
-void increase_shared_mem_limits(std::size_t smpbo)
+template <int... Ns, typename T>
+static void launch_soft_max_kernels(int ncols_x, const float * x, const T * mask, float * dst,
+        int ncols_param, int nrows_y, float scale, float max_bias,
+        float m0, float m1, uint32_t n_head_log2, dim3 block_nums,
+        dim3 block_dims, size_t nbytes_shared, cudaStream_t stream)
 {
-    auto apply_limit = [smpbo](auto I) {
-        constexpr int ncols = decltype(I)::value;
-        constexpr int block = (ncols > 1024 ? 1024 : ncols);
-
-        CUDA_SET_SHARED_MEMORY_LIMIT(
-            (soft_max_f32<true, ncols, block, half>), smpbo);
-        CUDA_SET_SHARED_MEMORY_LIMIT(
-            (soft_max_f32<true, ncols, block, float>), smpbo);
+    const int    id    = ggml_cuda_get_device();
+    const size_t smpbo = ggml_cuda_info().devices[id].smpbo;
+
+    auto launch_kernel = [=](auto I) -> bool {
+        constexpr int ncols = decltype(I)::value;
+        constexpr int block = (ncols > 1024 ? 1024 : ncols);
+
+        if (ncols_x == ncols) {
+            CUDA_SET_SHARED_MEMORY_LIMIT((soft_max_f32<true, ncols, block, T>), smpbo);
+            soft_max_f32<true, ncols, block><<<block_nums, block_dims, nbytes_shared, stream>>>
+                (x, mask, dst, ncols_param, nrows_y, scale, max_bias, m0, m1, n_head_log2);
+            return true;
+        }
+        return false;
     };
 
-    // unary fold
-    (apply_limit(std::integral_constant<int, Ns>{}), ...);
+    // unary fold over launch_kernel
+    if ((launch_kernel(std::integral_constant<int, Ns>{}) || ...)) {
+        return;
+    }
+
+    // default case
+    CUDA_SET_SHARED_MEMORY_LIMIT((soft_max_f32<true, 0, 0, T>), smpbo);
+    soft_max_f32<true, 0, 0><<<block_nums, block_dims, nbytes_shared, stream>>>
+        (x, mask, dst, ncols_param, nrows_y, scale, max_bias, m0, m1, n_head_log2);
 }
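The load-bearing piece of this hunk is the short-circuiting fold expression: `launch_kernel` is invoked once per compile-time candidate in `Ns`, and `||` stops at the first candidate whose value matches the runtime `ncols_x`. A minimal standalone sketch of the same pattern, with no CUDA dependency (the names `dispatch` and `try_one` are illustrative, not from the patch):

```cpp
#include <cstdio>
#include <type_traits>

// Tries each compile-time candidate in Ns against the runtime value n.
template <int... Ns>
static void dispatch(int n) {
    auto try_one = [n](auto I) -> bool {
        // the candidate arrives as std::integral_constant, so its value
        // is usable as a constant expression (e.g. as a template argument)
        constexpr int cand = decltype(I)::value;
        if (n == cand) {
            std::printf("compile-time case %d\n", cand);
            return true;  // short-circuits the fold below
        }
        return false;
    };

    // unary fold over ||: candidates are tested left to right and
    // evaluation stops at the first try_one(...) that returns true
    if ((try_one(std::integral_constant<int, Ns>{}) || ...)) {
        return;
    }
    std::printf("fallback case for %d\n", n);  // generic/runtime path
}

int main() {
    dispatch<32, 64, 128>(64);   // -> compile-time case 64
    dispatch<32, 64, 128>(100);  // -> fallback case for 100
}
```

Because each candidate is a constant expression inside the lambda, it can be passed back in as a template argument; that is what lets the patch fold the old `switch (ncols_x)` below into one variadic call while still instantiating each `soft_max_f32<true, ncols, block>` specialization.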
@@ -189,47 +205,8 @@ static void soft_max_f32_cuda(const float * x, const T * mask, float * dst, cons
     if (nbytes_shared <= smpbo) {
-
-        increase_shared_mem_limits<0, 32, 64, 128, 256, 512, 1024, 2048, 4096>(smpbo);
-
-        switch (ncols_x) {
-            case 32:
-                soft_max_f32<true, 32, 32><<<block_nums, block_dims, nbytes_shared, stream>>>
-                    (x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
-                break;
-            case 64:
-                soft_max_f32<true, 64, 64><<<block_nums, block_dims, nbytes_shared, stream>>>
-                    (x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
-                break;
-            case 128:
-                soft_max_f32<true, 128, 128><<<block_nums, block_dims, nbytes_shared, stream>>>
-                    (x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
-                break;
-            case 256:
-                soft_max_f32<true, 256, 256><<<block_nums, block_dims, nbytes_shared, stream>>>
-                    (x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
-                break;
-            case 512:
-                soft_max_f32<true, 512, 512><<<block_nums, block_dims, nbytes_shared, stream>>>
-                    (x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
-                break;
-            case 1024:
-                soft_max_f32<true, 1024, 1024><<<block_nums, block_dims, nbytes_shared, stream>>>
-                    (x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
-                break;
-            case 2048:
-                soft_max_f32<true, 2048, 1024><<<block_nums, block_dims, nbytes_shared, stream>>>
-                    (x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
-                break;
-            case 4096:
-                soft_max_f32<true, 4096, 1024><<<block_nums, block_dims, nbytes_shared, stream>>>
-                    (x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
-                break;
-            default:
-                soft_max_f32<true, 0, 0><<<block_nums, block_dims, nbytes_shared, stream>>>
-                    (x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
-                break;
-        }
+        launch_soft_max_kernels<32, 64, 128, 256, 512, 1024, 2048, 4096>(
+            ncols_x, x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2, block_nums, block_dims, nbytes_shared, stream);
     } else {
         const size_t nbytes_shared_low = WARP_SIZE*sizeof(float);
         soft_max_f32<false, 0, 0><<<block_nums, block_dims, nbytes_shared_low, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
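For context on the `nbytes_shared <= smpbo` guard: ggml's device info caches the per-block opt-in shared memory maximum, and `CUDA_SET_SHARED_MEMORY_LIMIT` raises a kernel's dynamic shared memory cap before launch. A hedged sketch of the plain CUDA runtime calls presumably behind this (the toy kernel is illustrative; the exact macro body may differ between ggml versions):

```cuda
#include <cstdio>
#include <cuda_runtime.h>

// Toy kernel using dynamic shared memory, standing in for soft_max_f32.
__global__ void toy_kernel(float * out) {
    extern __shared__ float buf[];
    buf[threadIdx.x] = (float) threadIdx.x;
    __syncthreads();
    if (threadIdx.x == 0) {
        out[0] = buf[0];
    }
}

int main() {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);

    // sharedMemPerBlockOptin is the opt-in per-block maximum; this is the
    // quantity ggml stores as devices[id].smpbo
    const size_t smpbo = prop.sharedMemPerBlockOptin;
    std::printf("opt-in shared memory per block: %zu bytes\n", smpbo);

    // A kernel must opt in before it may be launched with more dynamic
    // shared memory than the default limit (48 KiB on most GPUs).
    cudaFuncSetAttribute(toy_kernel,
                         cudaFuncAttributeMaxDynamicSharedMemorySize,
                         (int) smpbo);

    float * out = nullptr;
    cudaMalloc(&out, sizeof(float));
    toy_kernel<<<1, 256, smpbo>>>(out);  // launch with the raised limit
    cudaDeviceSynchronize();
    cudaFree(out);
}
```

If the required bytes exceed even the opt-in maximum, no attribute call can help, which is why the `else` branch above falls back to the `soft_max_f32<false, 0, 0>` path with only `WARP_SIZE*sizeof(float)` bytes of shared memory.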