
Commit 5658f48

mxz297 authored and facebook-github-bot committed
new tuning for fp8 rowwise (pytorch#838)
Summary:
X-link: pytorch#3756

Pull Request resolved: facebookresearch/FBGEMM#838

As title

Reviewed By: jwfromm

Differential Revision: D70494396

fbshipit-source-id: d1d77676138ead6c9653928c28e873fed5c56e59
1 parent fd7dc4d commit 5658f48

File tree

30 files changed: +1699 -255 lines changed

fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/fp8_rowwise/fp8_rowwise_gemm.hip

Lines changed: 73 additions & 1 deletion
@@ -327,14 +327,86 @@ static const std::map<int, RowwiseKernel> N_5120_K_640_dispatch_table = {
     { 5984, fp8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3},
 };
 
+static const std::map<int, RowwiseKernel> N_4096_K_5120_dispatch_table = {
+    { 16, fp8_rowwise_128x16x32x512_16x16_1x1_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_interwave_v2},
+    { 32, fp8_rowwise_128x32x16x512_16x16_1x1_32x4x1_32x4x1_1x32x1x4_4x4x1_1x1_interwave_v2},
+    { 48, fp8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v3},
+    { 128, fp8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3},
+    { 256, fp8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3},
+    { 288, fp8_rowwise_256x32x128x256_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3},
+    { 576, fp8_rowwise_256x64x128x256_32x32_1x2_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3},
+    { 896, fp8_rowwise_256x128x96x128_16x16_4x3_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3},
+    { 1152, fp8_rowwise_256x128x128x128_16x16_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3},
+    { 1392, fp8_rowwise_256x128x160x128_16x16_4x5_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3},
+    { 1440, fp8_rowwise_256x160x128x128_16x16_5x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3},
+    { 1776, fp8_rowwise_256x128x96x128_16x16_4x3_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3},
+    { 1824, fp8_rowwise_256x96x128x128_16x16_3x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3},
+    { 2240, fp8_rowwise_256x160x96x128_16x16_5x3_8x32x1_8x32x1_1x32x1x8_4x4x1_1x1_intrawave_v3},
+    { 2496, fp8_rowwise_256x192x192x128_16x16_6x6_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3},
+    { 2816, fp8_rowwise_256x256x160x128_16x16_8x5_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3},
+    { 2896, fp8_rowwise_256x224x192x128_16x16_7x6_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3},
+    { 3040, fp8_rowwise_256x160x256x128_16x16_5x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3},
+    { 3072, fp8_rowwise_256x192x224x128_16x16_6x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3},
+    { 3328, fp8_rowwise_256x256x192x128_16x16_8x6_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3},
+    { 3648, fp8_rowwise_256x192x256x128_16x16_6x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3},
+    { 4096, fp8_rowwise_256x256x224x128_16x16_8x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3},
+    { 4256, fp8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3},
+    { 4832, fp8_rowwise_256x256x256x64_32x32_4x4_4x64x1_4x64x1_1x32x1x8_8x8x1_1x1_intrawave_v4},
+    { 4864, fp8_rowwise_256x256x256x128_16x16_8x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3},
+    { 5152, fp8_rowwise_256x224x160x128_16x16_7x5_8x32x1_8x32x1_1x32x1x8_4x4x1_1x1_intrawave_v3},
+    { 5184, fp8_rowwise_256x192x192x128_16x16_6x6_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3},
+    { 5888, fp8_rowwise_256x256x160x128_16x16_8x5_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3},
+    { 5920, fp8_rowwise_256x160x256x128_16x16_5x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3},
+    { 5984, fp8_rowwise_256x224x192x128_16x16_7x6_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3},
+};
+
+static const std::map<int, RowwiseKernel> N_5120_K_2048_dispatch_table = {
+    { 48, fp8_rowwise_256x16x64x512_16x16_1x1_32x8x1_32x8x1_1x16x1x16_4x4x1_1x1_intrawave_v2},
+    { 96, fp8_rowwise_256x32x64x512_16x16_1x2_32x8x1_32x8x1_1x32x1x8_8x8x1_1x2_intrawave_v3},
+    { 192, fp8_rowwise_256x64x64x512_32x32_1x1_32x8x1_32x8x1_1x32x1x8_8x8x1_1x1_intrawave_v3},
+    { 224, fp8_rowwise_256x32x128x256_32x32_1x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3},
+    { 384, fp8_rowwise_256x128x64x256_32x32_2x1_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3},
+    { 448, fp8_rowwise_256x64x128x256_32x32_1x2_16x16x1_16x16x1_1x32x1x8_8x8x1_1x1_intrawave_v3},
+    { 560, fp8_rowwise_256x80x128x256_16x16_5x2_16x16x1_16x16x1_1x16x1x16_8x8x1_1x2_intrawave_v3},
+    { 608, fp8_rowwise_256x128x96x128_16x16_4x3_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3},
+    { 672, fp8_rowwise_256x96x128x128_16x16_3x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3},
+    { 896, fp8_rowwise_256x128x128x128_16x16_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3},
+    { 1008, fp8_rowwise_256x128x160x128_16x16_4x5_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3},
+    { 1120, fp8_rowwise_256x160x128x128_16x16_5x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3},
+    { 1408, fp8_rowwise_256x128x96x128_16x16_4x3_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3},
+    { 1440, fp8_rowwise_256x96x128x128_16x16_3x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3},
+    { 1536, fp8_rowwise_256x128x128x128_16x16_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3},
+    { 1600, fp8_rowwise_256x160x96x128_16x16_5x3_8x32x1_8x32x1_1x32x1x8_4x4x1_1x1_intrawave_v3},
+    { 1920, fp8_rowwise_256x128x128x128_16x16_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3},
+    { 2112, fp8_rowwise_256x192x192x128_16x16_6x6_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3},
+    { 2400, fp8_rowwise_256x160x256x128_16x16_5x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3},
+    { 2464, fp8_rowwise_256x224x192x128_16x16_7x6_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3},
+    { 2496, fp8_rowwise_256x192x224x128_16x16_6x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3},
+    { 2816, fp8_rowwise_256x256x192x128_16x16_8x6_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3},
+    { 2880, fp8_rowwise_256x192x256x128_16x16_6x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3},
+    { 3328, fp8_rowwise_256x256x224x128_16x16_8x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3},
+    { 3360, fp8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3},
+    { 3840, fp8_rowwise_256x256x256x128_16x16_8x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3},
+    { 4224, fp8_rowwise_256x192x192x128_16x16_6x6_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3},
+    { 4736, fp8_rowwise_256x128x128x128_16x16_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3},
+    { 4864, fp8_rowwise_256x256x160x128_16x16_8x5_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3},
+    { 4928, fp8_rowwise_256x224x192x128_16x16_7x6_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3},
+    { 4992, fp8_rowwise_256x192x224x128_16x16_6x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3},
+    { 5632, fp8_rowwise_256x256x192x128_16x16_8x6_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3},
+    { 5760, fp8_rowwise_256x192x256x128_16x16_6x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3},
+    { 5984, fp8_rowwise_256x256x224x128_16x16_8x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3},
+};
+
 static const std::unordered_map<std::tuple<int, int>, NKLookupTableType, IntTupleHash> NK_lookup_table = {
     {{7168, 8192}, N_7168_K_8192_dispatch_table},
     {{8192, 3584}, N_8192_K_3584_dispatch_table},
     {{1024, 5120}, N_1024_K_5120_dispatch_table},
     {{5120, 1024}, N_5120_K_1024_dispatch_table},
     {{2048, 5120}, N_2048_K_5120_dispatch_table},
     {{896, 5120}, N_896_K_5120_dispatch_table},
-    {{5120, 640}, N_5120_K_640_dispatch_table}
+    {{5120, 640}, N_5120_K_640_dispatch_table},
+    {{4096, 5120}, N_4096_K_5120_dispatch_table},
+    {{5120, 2048}, N_5120_K_2048_dispatch_table}
 };
 
 RowwiseKernel rowwise_nk_lookup(int M, const NKLookupTableType& table) {
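Each new table maps an M threshold to the kernel tuned for that (N, K) shape, and rowwise_nk_lookup picks an entry from the table selected by (N, K). As a minimal sketch of how such a threshold table can be consumed (assuming, from the ascending keys, that each key is an upper bound on M and that the largest entry doubles as the fallback; rowwise_nk_lookup_sketch and the stand-in types below are illustrative, not the repo's exact implementation):

#include <map>

// Illustrative stand-ins: in the real code RowwiseKernel is a callable that
// launches a CK GEMM instance, and NKLookupTableType aliases the map type
// used in the dispatch tables above.
using RowwiseKernel = void (*)();
using NKLookupTableType = std::map<int, RowwiseKernel>;

// Sketch of an M-threshold lookup: lower_bound finds the first key >= M;
// if M exceeds every tuned size, fall back to the largest entry.
RowwiseKernel rowwise_nk_lookup_sketch(int M, const NKLookupTableType& table) {
  auto it = table.lower_bound(M);
  return (it != table.end()) ? it->second : table.rbegin()->second;
}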

fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/fp8_rowwise/kernels/fp8_rowwise_128x16x32x512_16x16_1x1_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_interwave_v2.hip

Lines changed: 45 additions & 20 deletions
@@ -15,25 +15,50 @@ fp8_rowwise_128x16x32x512_16x16_1x1_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_interwave_v
     at::Tensor x_scale,
     at::Tensor w_scale,
     at::Tensor Y) {
-  using DeviceGemmInstance = DeviceGemmHelper<
-      128,
-      16,
-      32,
-      512,
-      16,
-      16,
-      1,
-      1,
-      S<32, 4, 1>,
-      S<32, 4, 1>,
-      S<1, 16, 1, 8>,
-      S<4, 4, 1>,
-      1,
-      1,
-      ck::BlockGemmPipelineScheduler::Interwave,
-      ck::BlockGemmPipelineVersion::v2,
-      ck::tensor_operation::device::GemmSpecialization::Default>;
-  // Run kernel instance.
-  return f8f8bf16_rowwise_impl<DeviceGemmInstance>(XQ, WQ, x_scale, w_scale, Y);
+  int K = WQ.size(1);
+  bool kpad = (K % 512 != 0);
+  if (kpad) {
+    using DeviceGemmInstance = DeviceGemmHelper<
+        128,
+        16,
+        32,
+        512,
+        16,
+        16,
+        1,
+        1,
+        S<32, 4, 1>,
+        S<32, 4, 1>,
+        S<1, 16, 1, 8>,
+        S<4, 4, 1>,
+        1,
+        1,
+        ck::BlockGemmPipelineScheduler::Interwave,
+        ck::BlockGemmPipelineVersion::v2,
+        ck::tensor_operation::device::GemmSpecialization::KPadding>;
+    // Run kernel instance.
+    return f8f8bf16_rowwise_impl<DeviceGemmInstance>(XQ, WQ, x_scale, w_scale, Y);
+  } else {
+    using DeviceGemmInstance = DeviceGemmHelper<
+        128,
+        16,
+        32,
+        512,
+        16,
+        16,
+        1,
+        1,
+        S<32, 4, 1>,
+        S<32, 4, 1>,
+        S<1, 16, 1, 8>,
+        S<4, 4, 1>,
+        1,
+        1,
+        ck::BlockGemmPipelineScheduler::Interwave,
+        ck::BlockGemmPipelineVersion::v2,
+        ck::tensor_operation::device::GemmSpecialization::Default>;
+    // Run kernel instance.
+    return f8f8bf16_rowwise_impl<DeviceGemmInstance>(XQ, WQ, x_scale, w_scale, Y);
+  }
 }
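This kernel (and the others updated in this commit) now checks whether K is a multiple of its K tile (512 for this instance) and builds the CK instance with GemmSpecialization::KPadding when it is not, keeping the cheaper Default specialization for the evenly divisible case. A condensed sketch of that decision (the enum and pick_spec below are illustrative stand-ins; in the real kernels the specialization is a compile-time template argument, which is why the diff instantiates two full DeviceGemmHelper types and branches at runtime):

// Illustrative stand-in for ck::tensor_operation::device::GemmSpecialization.
enum class GemmSpecialization { Default, KPadding };

// Sketch: pad K only when it is ragged relative to the kernel's K tile,
// since padding support costs some performance on aligned shapes.
GemmSpecialization pick_spec(int K, int kTileK) {
  return (K % kTileK != 0) ? GemmSpecialization::KPadding
                           : GemmSpecialization::Default;
}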

fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/fp8_rowwise/kernels/fp8_rowwise_128x32x16x512_16x16_1x1_32x4x1_32x4x1_1x32x1x4_4x4x1_1x1_interwave_v2.hip

Lines changed: 45 additions & 20 deletions
@@ -15,25 +15,50 @@ fp8_rowwise_128x32x16x512_16x16_1x1_32x4x1_32x4x1_1x32x1x4_4x4x1_1x1_interwave_v
     at::Tensor x_scale,
     at::Tensor w_scale,
     at::Tensor Y) {
-  using DeviceGemmInstance = DeviceGemmHelper<
-      128,
-      32,
-      16,
-      512,
-      16,
-      16,
-      1,
-      1,
-      S<32, 4, 1>,
-      S<32, 4, 1>,
-      S<1, 32, 1, 4>,
-      S<4, 4, 1>,
-      1,
-      1,
-      ck::BlockGemmPipelineScheduler::Interwave,
-      ck::BlockGemmPipelineVersion::v2,
-      ck::tensor_operation::device::GemmSpecialization::Default>;
-  // Run kernel instance.
-  return f8f8bf16_rowwise_impl<DeviceGemmInstance>(XQ, WQ, x_scale, w_scale, Y);
+  int K = WQ.size(1);
+  bool kpad = (K % 512 != 0);
+  if (kpad) {
+    using DeviceGemmInstance = DeviceGemmHelper<
+        128,
+        32,
+        16,
+        512,
+        16,
+        16,
+        1,
+        1,
+        S<32, 4, 1>,
+        S<32, 4, 1>,
+        S<1, 32, 1, 4>,
+        S<4, 4, 1>,
+        1,
+        1,
+        ck::BlockGemmPipelineScheduler::Interwave,
+        ck::BlockGemmPipelineVersion::v2,
+        ck::tensor_operation::device::GemmSpecialization::KPadding>;
+    // Run kernel instance.
+    return f8f8bf16_rowwise_impl<DeviceGemmInstance>(XQ, WQ, x_scale, w_scale, Y);
+  } else {
+    using DeviceGemmInstance = DeviceGemmHelper<
+        128,
+        32,
+        16,
+        512,
+        16,
+        16,
+        1,
+        1,
+        S<32, 4, 1>,
+        S<32, 4, 1>,
+        S<1, 32, 1, 4>,
+        S<4, 4, 1>,
+        1,
+        1,
+        ck::BlockGemmPipelineScheduler::Interwave,
+        ck::BlockGemmPipelineVersion::v2,
+        ck::tensor_operation::device::GemmSpecialization::Default>;
+    // Run kernel instance.
+    return f8f8bf16_rowwise_impl<DeviceGemmInstance>(XQ, WQ, x_scale, w_scale, Y);
+  }
 }

fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/fp8_rowwise/kernels/fp8_rowwise_256x128x128x128_16x16_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3.hip (new file; path inferred from the kernel naming convention)

Lines changed: 64 additions & 0 deletions

@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include "fp8_rowwise_common.h"
+
+at::Tensor
+fp8_rowwise_256x128x128x128_16x16_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3(
+    at::Tensor XQ,
+    at::Tensor WQ,
+    at::Tensor x_scale,
+    at::Tensor w_scale,
+    at::Tensor Y) {
+  int K = WQ.size(1);
+  bool kpad = (K % 128 != 0);
+  if (kpad) {
+    using DeviceGemmInstance = DeviceGemmHelper<
+        256,
+        128,
+        128,
+        128,
+        16,
+        16,
+        4,
+        4,
+        S<8, 32, 1>,
+        S<8, 32, 1>,
+        S<1, 32, 1, 8>,
+        S<8, 8, 1>,
+        1,
+        2,
+        ck::BlockGemmPipelineScheduler::Intrawave,
+        ck::BlockGemmPipelineVersion::v3,
+        ck::tensor_operation::device::GemmSpecialization::KPadding>;
+    // Run kernel instance.
+    return f8f8bf16_rowwise_impl<DeviceGemmInstance>(XQ, WQ, x_scale, w_scale, Y);
+  } else {
+    using DeviceGemmInstance = DeviceGemmHelper<
+        256,
+        128,
+        128,
+        128,
+        16,
+        16,
+        4,
+        4,
+        S<8, 32, 1>,
+        S<8, 32, 1>,
+        S<1, 32, 1, 8>,
+        S<8, 8, 1>,
+        1,
+        2,
+        ck::BlockGemmPipelineScheduler::Intrawave,
+        ck::BlockGemmPipelineVersion::v3,
+        ck::tensor_operation::device::GemmSpecialization::Default>;
+    // Run kernel instance.
+    return f8f8bf16_rowwise_impl<DeviceGemmInstance>(XQ, WQ, x_scale, w_scale, Y);
+  }
+}
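Reading this new file against its name shows the naming convention: each field of the kernel name restates a DeviceGemmHelper template argument, in order. The annotation below is a hedged decoding of that mapping; the field-to-argument correspondence is visible in the file itself, but the semantic labels are inferred from CK's usual tile nomenclature rather than stated in this diff.

// fp8_rowwise_256x128x128x128_16x16_4x4_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3
//   256          -> block size (threads per workgroup)
//   128x128x128  -> M x N x K tile computed per block
//   16x16        -> XDL (MFMA) wave tile in M x N
//   4x4          -> XDL tiles per wave in M x N
//   8x32x1       -> A block-transfer thread cluster, S<8, 32, 1>
//   8x32x1       -> B block-transfer thread cluster, S<8, 32, 1>
//   1x32x1x8     -> C-shuffle thread cluster, S<1, 32, 1, 8>
//   8x8x1        -> C-shuffle scalars per vector, S<8, 8, 1>
//   1x2          -> the trailing "1, 2" shuffle-stage arguments
//   intrawave_v3 -> BlockGemmPipelineScheduler::Intrawave, PipelineVersion::v3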
fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/fp8_rowwise/kernels/fp8_rowwise_256x128x160x128_16x16_4x5_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3.hip (new file; path inferred from the kernel naming convention)

Lines changed: 64 additions & 0 deletions

@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include "fp8_rowwise_common.h"
+
+at::Tensor
+fp8_rowwise_256x128x160x128_16x16_4x5_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3(
+    at::Tensor XQ,
+    at::Tensor WQ,
+    at::Tensor x_scale,
+    at::Tensor w_scale,
+    at::Tensor Y) {
+  int K = WQ.size(1);
+  bool kpad = (K % 128 != 0);
+  if (kpad) {
+    using DeviceGemmInstance = DeviceGemmHelper<
+        256,
+        128,
+        160,
+        128,
+        16,
+        16,
+        4,
+        5,
+        S<8, 32, 1>,
+        S<8, 32, 1>,
+        S<1, 64, 1, 4>,
+        S<8, 8, 1>,
+        2,
+        1,
+        ck::BlockGemmPipelineScheduler::Intrawave,
+        ck::BlockGemmPipelineVersion::v3,
+        ck::tensor_operation::device::GemmSpecialization::KPadding>;
+    // Run kernel instance.
+    return f8f8bf16_rowwise_impl<DeviceGemmInstance>(XQ, WQ, x_scale, w_scale, Y);
+  } else {
+    using DeviceGemmInstance = DeviceGemmHelper<
+        256,
+        128,
+        160,
+        128,
+        16,
+        16,
+        4,
+        5,
+        S<8, 32, 1>,
+        S<8, 32, 1>,
+        S<1, 64, 1, 4>,
+        S<8, 8, 1>,
+        2,
+        1,
+        ck::BlockGemmPipelineScheduler::Intrawave,
+        ck::BlockGemmPipelineVersion::v3,
+        ck::tensor_operation::device::GemmSpecialization::Default>;
+    // Run kernel instance.
+    return f8f8bf16_rowwise_impl<DeviceGemmInstance>(XQ, WQ, x_scale, w_scale, Y);
+  }
+}
