Commit b18bf6a

Revert "ggml: CUDA: add head size 72 for flash-attn (ggml-org#16962)"

This reverts commit 622cd01.

1 parent: eb19aa3

File tree: 5 files changed (+5 lines, −44 lines)

ggml/src/ggml-cuda/fattn-tile.cu
Lines changed: 0 additions & 4 deletions

@@ -14,10 +14,6 @@ void ggml_cuda_flash_attn_ext_tile(ggml_backend_cuda_context & ctx, ggml_tensor
             GGML_ASSERT(V->ne[0] == K->ne[0]);
             ggml_cuda_flash_attn_ext_tile_case< 64, 64>(ctx, dst);
         } break;
-        case 72: {
-            GGML_ASSERT(V->ne[0] == K->ne[0]);
-            ggml_cuda_flash_attn_ext_tile_case< 72, 72>(ctx, dst);
-        } break;
         case 80: {
             GGML_ASSERT(V->ne[0] == K->ne[0]);
             ggml_cuda_flash_attn_ext_tile_case< 80, 80>(ctx, dst);
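For orientation, after this revert the head-size dispatch in ggml_cuda_flash_attn_ext_tile jumps straight from 64 to 80. A minimal sketch of how the surrounding switch reads, reconstructed from the context lines above and assuming (as in fattn.cu) that the switch is over K->ne[0]; this is not a verbatim copy of the file:

    switch (K->ne[0]) {
        // ... smaller head sizes ...
        case 64: {
            GGML_ASSERT(V->ne[0] == K->ne[0]);
            ggml_cuda_flash_attn_ext_tile_case< 64, 64>(ctx, dst);
        } break;
        // the case 72 block removed by this revert used to sit here
        case 80: {
            GGML_ASSERT(V->ne[0] == K->ne[0]);
            ggml_cuda_flash_attn_ext_tile_case< 80, 80>(ctx, dst);
        } break;
        // ... larger head sizes ...
    }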

ggml/src/ggml-cuda/fattn-tile.cuh
Lines changed: 2 additions & 29 deletions

@@ -6,7 +6,7 @@
 // nbatch_K == number of K columns to load in parallel for KQ calculation

 // TODO optimize kernel parameters for FP16 NVIDIA (P100)
-// TODO optimize kernel parameters for head sizes 40, 72, 80, 96, 112
+// TODO optimize kernel parameters for head sizes 40, 80, 96, 112

 // The ROCm compiler cannot handle templating in __launch_bounds__.
 // As a workaround, define a macro to package the kernel parameters as uint32_t:
@@ -32,12 +32,6 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_nv
     GGML_CUDA_FATTN_TILE_CONFIG_CASE( 64, 64, 16, 256, 2, 64, 64)
     GGML_CUDA_FATTN_TILE_CONFIG_CASE( 64, 64, 32, 256, 2, 64, 64)

-    GGML_CUDA_FATTN_TILE_CONFIG_CASE( 72, 72, 2, 64, 2, 64, 72)
-    GGML_CUDA_FATTN_TILE_CONFIG_CASE( 72, 72, 4, 128, 2, 64, 72)
-    GGML_CUDA_FATTN_TILE_CONFIG_CASE( 72, 72, 8, 256, 2, 64, 72)
-    GGML_CUDA_FATTN_TILE_CONFIG_CASE( 72, 72, 16, 256, 2, 64, 72)
-    GGML_CUDA_FATTN_TILE_CONFIG_CASE( 72, 72, 32, 256, 2, 64, 72)
-
     GGML_CUDA_FATTN_TILE_CONFIG_CASE( 80, 80, 2, 64, 2, 64, 40)
     GGML_CUDA_FATTN_TILE_CONFIG_CASE( 80, 80, 4, 128, 2, 64, 40)
     GGML_CUDA_FATTN_TILE_CONFIG_CASE( 80, 80, 8, 256, 2, 64, 40)
@@ -86,12 +80,6 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_nv
     GGML_CUDA_FATTN_TILE_CONFIG_CASE( 64, 64, 16, 128, 3, 64, 64)
     GGML_CUDA_FATTN_TILE_CONFIG_CASE( 64, 64, 32, 256, 2, 64, 64)

-    GGML_CUDA_FATTN_TILE_CONFIG_CASE( 72, 72, 2, 64, 2, 32, 72)
-    GGML_CUDA_FATTN_TILE_CONFIG_CASE( 72, 72, 4, 128, 2, 32, 72)
-    GGML_CUDA_FATTN_TILE_CONFIG_CASE( 72, 72, 8, 256, 2, 32, 72)
-    GGML_CUDA_FATTN_TILE_CONFIG_CASE( 72, 72, 16, 256, 2, 32, 72)
-    GGML_CUDA_FATTN_TILE_CONFIG_CASE( 72, 72, 32, 256, 2, 32, 72)
-
     GGML_CUDA_FATTN_TILE_CONFIG_CASE( 80, 80, 2, 64, 2, 32, 40)
     GGML_CUDA_FATTN_TILE_CONFIG_CASE( 80, 80, 4, 128, 2, 32, 40)
     GGML_CUDA_FATTN_TILE_CONFIG_CASE( 80, 80, 8, 256, 2, 32, 40)
@@ -142,13 +130,6 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_am
     GGML_CUDA_FATTN_TILE_CONFIG_CASE( 64, 64, 32, 256, 2, 64, 64)
     GGML_CUDA_FATTN_TILE_CONFIG_CASE( 64, 64, 64, 256, 2, 64, 64)

-    GGML_CUDA_FATTN_TILE_CONFIG_CASE( 72, 72, 2, 64, 2, 32, 72)
-    GGML_CUDA_FATTN_TILE_CONFIG_CASE( 72, 72, 4, 128, 2, 32, 72)
-    GGML_CUDA_FATTN_TILE_CONFIG_CASE( 72, 72, 8, 256, 2, 32, 72)
-    GGML_CUDA_FATTN_TILE_CONFIG_CASE( 72, 72, 16, 256, 2, 32, 72)
-    GGML_CUDA_FATTN_TILE_CONFIG_CASE( 72, 72, 32, 256, 2, 32, 72)
-    GGML_CUDA_FATTN_TILE_CONFIG_CASE( 72, 72, 64, 256, 2, 32, 72)
-
     GGML_CUDA_FATTN_TILE_CONFIG_CASE( 80, 80, 2, 64, 2, 32, 40)
     GGML_CUDA_FATTN_TILE_CONFIG_CASE( 80, 80, 4, 128, 2, 32, 40)
     GGML_CUDA_FATTN_TILE_CONFIG_CASE( 80, 80, 8, 256, 2, 32, 40)
@@ -204,13 +185,6 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_am
     GGML_CUDA_FATTN_TILE_CONFIG_CASE( 64, 64, 32, 128, 4, 64, 64)
     GGML_CUDA_FATTN_TILE_CONFIG_CASE( 64, 64, 64, 128, 5, 64, 64)

-    GGML_CUDA_FATTN_TILE_CONFIG_CASE( 72, 72, 2, 64, 2, 32, 72)
-    GGML_CUDA_FATTN_TILE_CONFIG_CASE( 72, 72, 4, 128, 2, 32, 72)
-    GGML_CUDA_FATTN_TILE_CONFIG_CASE( 72, 72, 8, 256, 2, 32, 72)
-    GGML_CUDA_FATTN_TILE_CONFIG_CASE( 72, 72, 16, 256, 2, 32, 72)
-    GGML_CUDA_FATTN_TILE_CONFIG_CASE( 72, 72, 32, 256, 2, 32, 72)
-    GGML_CUDA_FATTN_TILE_CONFIG_CASE( 72, 72, 64, 256, 2, 32, 72)
-
     GGML_CUDA_FATTN_TILE_CONFIG_CASE( 80, 80, 2, 64, 2, 32, 40)
     GGML_CUDA_FATTN_TILE_CONFIG_CASE( 80, 80, 4, 128, 2, 32, 40)
     GGML_CUDA_FATTN_TILE_CONFIG_CASE( 80, 80, 8, 256, 2, 32, 40)
@@ -749,7 +723,7 @@ static __global__ void flash_attn_tile(

     if (
 #ifdef GGML_USE_WMMA_FATTN
-        (ncols2 != 1 && DV != 40 && DV != 72 && DV != 512) ||
+        (ncols2 != 1 && DV != 40 && DV != 512) ||
 #endif // GGML_USE_WMMA_FATTN
         (use_logit_softcap && !(DV == 128 || DV == 256))
     ) {
@@ -1224,7 +1198,6 @@ void ggml_cuda_flash_attn_ext_tile(ggml_backend_cuda_context & ctx, ggml_tensor

 extern DECL_FATTN_TILE_CASE( 40, 40);
 extern DECL_FATTN_TILE_CASE( 64, 64);
-extern DECL_FATTN_TILE_CASE( 72, 72);
 extern DECL_FATTN_TILE_CASE( 80, 80);
 extern DECL_FATTN_TILE_CASE( 96, 96);
 extern DECL_FATTN_TILE_CASE(112, 112);
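The header comment quoted in the first hunk explains why these config cases exist at all: the ROCm compiler cannot handle templated expressions inside __launch_bounds__, so each configuration is packaged into a plain uint32_t. The following is only a hypothetical sketch of that packing idea; the field layout, bit widths, and helper names (pack_tile_config, tile_config_nthreads, tile_config_nblocks_per_sm) are illustrative assumptions, not the actual GGML_CUDA_FATTN_TILE_CONFIG_CASE definition:

    #include <cstdint>

    // Hypothetical packing: low byte = columns per block, next 12 bits = threads
    // per block, next 4 bits = target blocks per SM. Chosen only to illustrate
    // how a handful of small launch parameters can travel as one constant.
    static constexpr uint32_t pack_tile_config(uint32_t ncols, uint32_t nthreads, uint32_t nblocks_per_sm) {
        return (ncols & 0xFFu) | ((nthreads & 0xFFFu) << 8) | ((nblocks_per_sm & 0xFu) << 20);
    }

    // Unpack the thread count for use as the first __launch_bounds__ argument.
    static constexpr uint32_t tile_config_nthreads(uint32_t config) {
        return (config >> 8) & 0xFFFu;
    }

    // Unpack the minimum blocks-per-SM hint for the second __launch_bounds__ argument.
    static constexpr uint32_t tile_config_nblocks_per_sm(uint32_t config) {
        return (config >> 20) & 0xFu;
    }

The intent described by the comment is that a kernel then consumes such a plain constant in __launch_bounds__ instead of a template-dependent expression.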

ggml/src/ggml-cuda/fattn.cu
Lines changed: 2 additions & 3 deletions

@@ -229,7 +229,6 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
     switch (K->ne[0]) {
         case 40:
         case 64:
-        case 72:
         case 80:
         case 96:
         case 128:
@@ -282,7 +281,7 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
     const bool can_use_vector_kernel = Q->ne[0] <= 256 && Q->ne[0] % 64 == 0 && K->ne[1] % FATTN_KQ_STRIDE == 0;

     // If Turing tensor cores available, use them:
-    if (turing_mma_available(cc) && K->ne[1] % FATTN_KQ_STRIDE == 0 && Q->ne[0] != 40 && Q->ne[0] != 72) {
+    if (turing_mma_available(cc) && K->ne[1] % FATTN_KQ_STRIDE == 0 && Q->ne[0] != 40) {
         if (can_use_vector_kernel) {
             if (!ggml_is_quantized(K->type) && !ggml_is_quantized(V->type)) {
                 if (cc >= GGML_CUDA_CC_ADA_LOVELACE && Q->ne[1] == 1 && Q->ne[3] == 1 && !(gqa_ratio > 4 && K->ne[1] >= 8192)) {
@@ -313,7 +312,7 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
     }

     // Use the WMMA kernel if possible:
-    if (ggml_cuda_should_use_wmma_fattn(cc) && K->ne[1] % FATTN_KQ_STRIDE == 0 && Q->ne[0] != 40 && Q->ne[0] != 72 && Q->ne[0] != 576) {
+    if (ggml_cuda_should_use_wmma_fattn(cc) && K->ne[1] % FATTN_KQ_STRIDE == 0 && Q->ne[0] != 40 && Q->ne[0] != 576) {
         if (can_use_vector_kernel && Q->ne[1] <= 2) {
             return BEST_FATTN_KERNEL_VEC;
         }
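Taken together, these hunks restore the pre-#16962 selection logic: head size 72 is no longer listed in the supported switch at all, so the later kernel checks only need to exclude head size 40 (and 576 for WMMA). A condensed sketch of that flow, assuming the rest of ggml_cuda_get_best_fattn_kernel is unchanged; intermediate checks are abbreviated:

    // Condensed from the hunks above; most intermediate checks are omitted.
    if (turing_mma_available(cc) && K->ne[1] % FATTN_KQ_STRIDE == 0 && Q->ne[0] != 40) {
        // Turing+ tensor-core (MMA) path, possibly a vector kernel for small batches.
    }

    if (ggml_cuda_should_use_wmma_fattn(cc) && K->ne[1] % FATTN_KQ_STRIDE == 0 &&
        Q->ne[0] != 40 && Q->ne[0] != 576) {
        // WMMA path, tried after the MMA path above.
    }

    // Otherwise the tile kernel is considered; after this revert it no longer
    // has a head-size-72 instantiation.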

ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq72-dv72.cu
Lines changed: 0 additions & 5 deletions
This file was deleted.

ggml/src/ggml-cuda/template-instances/generate_cu_files.py
Lines changed: 1 addition & 3 deletions

@@ -3,7 +3,7 @@
 from glob import glob
 import os

-HEAD_SIZES_KQ = [40, 64, 72, 80, 96, 112, 128, 256, 576]
+HEAD_SIZES_KQ = [40, 64, 80, 96, 112, 128, 256, 576]

 TYPES_KV = ["GGML_TYPE_F16", "GGML_TYPE_Q4_0", "GGML_TYPE_Q4_1", "GGML_TYPE_Q5_0", "GGML_TYPE_Q5_1", "GGML_TYPE_Q8_0"]

@@ -81,8 +81,6 @@ def get_short_name(long_quant_name):
     for head_size_kq in HEAD_SIZES_KQ:
         if head_size_kq == 40:
             continue
-        if head_size_kq == 72:
-            continue
         if head_size_kq != 576 and ncols2 == 16:
             continue
         if head_size_kq == 576 and ncols2 != 16:

0 commit comments