
Commit 4e1834f

ngimel authored and pytorchmergebot committed
use cooperative schedule in scaled_mm for fast_accum=false (#144809)
This improves perf for large matrices by more than 2x; a more detailed benchmark is coming.

On master: ![image](https://github.com/user-attachments/assets/fc6a0987-5b82-475d-a2ff-b46641bb17dc)

On this branch: ![image](https://github.com/user-attachments/assets/7f55152b-1110-45e4-b2ea-6f274d543869)

A plot similar to the one in pytorch/ao#1325 (comment).

Benchmarking code:

```python
import itertools

import torch
from triton.testing import do_bench


def fn_aten_scales(a, b, scale_a, scale_b, use_fast_accum=False):
    # Row-wise scaled fp8 matmul: per-row scale for a, per-column scale for b.t().
    return torch._scaled_mm(a, b.t(), scale_a.view(-1, 1), scale_b.view(1, -1),
                            use_fast_accum=use_fast_accum, out_dtype=torch.bfloat16)


def fn_aten(a, b, scale, use_fast_accum=False):
    # Tensor-wise scaled fp8 matmul: a single scalar scale for both operands.
    return torch._scaled_mm(a, b.t(), scale, scale,
                            use_fast_accum=use_fast_accum, out_dtype=torch.bfloat16)


# Sweep M, N, K over powers of two from 2**9 to 2**14.
for i, j, k in itertools.product(range(9, 15), range(9, 15), range(9, 15)):
    m = 2**i
    n = 2**j
    k = 2**k  # rebind k from exponent to actual dimension size
    a = torch.randn(m, k, device="cuda").to(dtype=torch.float8_e4m3fn)
    b = torch.randn(n, k, device="cuda").to(dtype=torch.float8_e4m3fn)
    scale_a = torch.randint(1, 11, (a.shape[0],), device="cuda", dtype=torch.float32)
    scale_b = torch.randint(1, 11, (b.shape[0],), device="cuda", dtype=torch.float32)
    scale_0 = torch.randn((), device="cuda", dtype=torch.float32)
    ms_rowwise_fast = do_bench(lambda: fn_aten_scales(a, b, scale_a, scale_b, use_fast_accum=True), warmup=25, rep=50)
    ms_rowwise_slow = do_bench(lambda: fn_aten_scales(a, b, scale_a, scale_b, use_fast_accum=False), warmup=25, rep=50)
    ms_tensor_fast = do_bench(lambda: fn_aten(a, b, scale_0, use_fast_accum=True), warmup=25, rep=50)
    ms_tensor_slow = do_bench(lambda: fn_aten(a, b, scale_0, use_fast_accum=False), warmup=25, rep=50)
    print(f"m={m}, n={n}, k={k}, fast={ms_rowwise_fast}, slow={ms_rowwise_slow}, "
          f"ratio_tw={ms_tensor_slow / ms_tensor_fast}, ratio_rw={ms_rowwise_slow / ms_rowwise_fast}")
```

Higher N/K values still carry about a 40% penalty; some additional heuristics tweaks may be useful.

Pull Request resolved: #144809
Approved by: https://github.com/drisspg
1 parent 0f051ea · commit 4e1834f


aten/src/ATen/native/cuda/RowwiseScaledMM.cu

Lines changed: 17 additions & 11 deletions
```diff
@@ -105,27 +105,34 @@ using Cast = cutlass::epilogue::fusion::Sm90Compute<
     DtypeEpilogue,
     cutlass::FloatRoundStyle::round_to_nearest>;
 
-template <bool PingPong, bool FastAccum>
+template <bool LargeTile, bool FastAccum>
 struct Schedule;
 
 template <>
-struct Schedule</*PingPong=*/false, /*FastAccum=*/false> {
+struct Schedule</*LargeTile=*/false, /*FastAccum=*/false> {
   using type = cutlass::gemm::KernelTmaWarpSpecialized;
+  using epilogue_type = cutlass::epilogue::TmaWarpSpecialized;
 };
 
 template <>
-struct Schedule</*PingPong=*/true, /*FastAccum=*/false> {
-  using type = cutlass::gemm::KernelTmaWarpSpecializedPingpong;
+struct Schedule</*LargeTile=*/true, /*FastAccum=*/false> {
+  // For a 128x128x128 tile with fastAccum = false, using
+  // pingpong schedule will lead to spilling, and WarpSpecialized w/o pingpong
+  // is slow
+  using type = cutlass::gemm::KernelTmaWarpSpecializedCooperative;
+  using epilogue_type = cutlass::epilogue::TmaWarpSpecializedCooperative;
 };
 
 template <>
-struct Schedule</*PingPong=*/false, /*FastAccum=*/true> {
+struct Schedule</*LargeTile=*/false, /*FastAccum=*/true> {
   using type = cutlass::gemm::KernelTmaWarpSpecializedFP8FastAccum;
+  using epilogue_type = cutlass::epilogue::TmaWarpSpecialized;
 };
 
 template <>
-struct Schedule</*PingPong=*/true, /*FastAccum=*/true> {
+struct Schedule</*LargeTile=*/true, /*FastAccum=*/true> {
   using type = cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum;
+  using epilogue_type = cutlass::epilogue::TmaWarpSpecialized;
 };
 
 int ceildiv(int a, int b) {
@@ -140,7 +147,6 @@ int round_up_to_nearest_multiple(int a, int b) {
 template <
     typename TileShape,
     typename ClusterShape,
-    typename PingPong,
     typename Transposed,
     typename FastAccum,
     typename DtypeA,
@@ -226,6 +232,8 @@ void f8f8bf16_rowwise_impl(
           Bias,
           AccumScale>>;
 
+  constexpr bool large_tile = std::is_same_v<TileShape, cute::Shape<cute::_128, cute::_128, cute::_128>>;
+
   using CollectiveEpilogue =
       typename cutlass::epilogue::collective::CollectiveBuilder<
           ArchTag,
@@ -241,7 +249,7 @@ void f8f8bf16_rowwise_impl(
           DtypeOutput,
           LayoutOutput,
           AlignmentOutput,
-          cutlass::epilogue::TmaWarpSpecialized,
+          typename Schedule<large_tile, FastAccum::value>::epilogue_type,
           EpilogueEVT>::CollectiveOp;
 
   using CollectiveMainloop =
@@ -259,7 +267,7 @@ void f8f8bf16_rowwise_impl(
          ClusterShape,
          cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(
              sizeof(typename CollectiveEpilogue::SharedStorage))>,
-          typename Schedule<PingPong::value, FastAccum::value>::type>::
+          typename Schedule<large_tile, FastAccum::value>::type>::
          CollectiveOp;
 
   using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
@@ -370,13 +378,11 @@ void dispatch_fp8_rowwise_kernel_on_tile_size(
     return f8f8bf16_rowwise_impl<
         /*TileShape=*/cute::Shape<cute::_64, cute::_128, cute::_128>,
         ClusterShape,
-        /*PingPong=*/std::false_type,
         Types...>(XQ, WQ, x_scale, w_scale, bias, out, swizzle);
   } else {
     return f8f8bf16_rowwise_impl<
         /*TileShape=*/cute::Shape<cute::_128, cute::_128, cute::_128>,
         ClusterShape,
-        /*PingPong=*/std::true_type,
         Types...>(XQ, WQ, x_scale, w_scale, bias, out, swizzle);
   }
 }
```
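For readers less familiar with the CUTLASS builder plumbing, the sketch below re-creates, in a self-contained form, the compile-time dispatch this diff performs. The tag structs, `TileShape`, and `PickSchedule` are stand-ins invented for illustration; the real code uses the `cutlass::gemm::*` / `cutlass::epilogue::*` schedule types and `cute::Shape`, and selects the schedule inside `f8f8bf16_rowwise_impl`.

```cpp
#include <type_traits>

// Stand-in tags for the CUTLASS kernel/epilogue schedules named in the diff.
struct KernelTmaWarpSpecialized {};
struct KernelTmaWarpSpecializedCooperative {};
struct KernelTmaWarpSpecializedFP8FastAccum {};
struct KernelTmaWarpSpecializedPingpongFP8FastAccum {};
struct EpilogueTmaWarpSpecialized {};
struct EpilogueTmaWarpSpecializedCooperative {};

// Stand-in for cute::Shape<cute::_M, cute::_N, cute::_K>.
template <int M, int N, int K>
struct TileShape {};

// The Schedule trait is now keyed on (LargeTile, FastAccum) instead of
// (PingPong, FastAccum).
template <bool LargeTile, bool FastAccum>
struct Schedule;

template <>
struct Schedule</*LargeTile=*/false, /*FastAccum=*/false> {
  using type = KernelTmaWarpSpecialized;
  using epilogue_type = EpilogueTmaWarpSpecialized;
};

template <>
struct Schedule</*LargeTile=*/true, /*FastAccum=*/false> {
  // 128x128x128 tiles with slow accumulation: pingpong spills registers and
  // plain warp-specialized is slow, so the cooperative schedule is chosen.
  using type = KernelTmaWarpSpecializedCooperative;
  using epilogue_type = EpilogueTmaWarpSpecializedCooperative;
};

template <>
struct Schedule</*LargeTile=*/false, /*FastAccum=*/true> {
  using type = KernelTmaWarpSpecializedFP8FastAccum;
  using epilogue_type = EpilogueTmaWarpSpecialized;
};

template <>
struct Schedule</*LargeTile=*/true, /*FastAccum=*/true> {
  using type = KernelTmaWarpSpecializedPingpongFP8FastAccum;
  using epilogue_type = EpilogueTmaWarpSpecialized;
};

// Mirrors the `constexpr bool large_tile = std::is_same_v<...>` line in the
// diff: the kernel derives LargeTile from the tile shape it was given.
template <typename Tile, bool FastAccum>
struct PickSchedule {
  static constexpr bool large_tile =
      std::is_same_v<Tile, TileShape<128, 128, 128>>;
  using mainloop_schedule = typename Schedule<large_tile, FastAccum>::type;
  using epilogue_schedule = typename Schedule<large_tile, FastAccum>::epilogue_type;
};

// Large tile + fast_accum=false now resolves to the cooperative schedule ...
static_assert(std::is_same_v<
    PickSchedule<TileShape<128, 128, 128>, /*FastAccum=*/false>::mainloop_schedule,
    KernelTmaWarpSpecializedCooperative>);
// ... while the small tile keeps the plain warp-specialized schedule.
static_assert(std::is_same_v<
    PickSchedule<TileShape<64, 128, 128>, /*FastAccum=*/false>::mainloop_schedule,
    KernelTmaWarpSpecialized>);

int main() { return 0; }
```

Keying `Schedule` on the tile size rather than on a separately passed `PingPong` flag is what lets `dispatch_fp8_rowwise_kernel_on_tile_size` drop the extra template argument: the kernel now infers the schedule directly from the tile shape it is instantiated with.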
