@@ -633,6 +633,7 @@ struct vk_flash_attn_push_constants {
633
633
uint32_t nev2;
634
634
uint32_t nev3;
635
635
uint32_t nem1;
636
+ uint32_t nem2;
636
637
637
638
uint32_t nb01;
638
639
uint32_t nb02;
@@ -643,7 +644,6 @@ struct vk_flash_attn_push_constants {
643
644
uint32_t nb21;
644
645
uint32_t nb22;
645
646
uint32_t nb23;
646
- uint32_t nb31;
647
647
648
648
float scale;
649
649
float max_bias;
@@ -658,6 +658,7 @@ struct vk_flash_attn_push_constants {
658
658
uint32_t split_kv;
659
659
uint32_t k_num;
660
660
};
661
+ static_assert(sizeof(vk_flash_attn_push_constants) <= 128, "sizeof(vk_flash_attn_push_constants) must be <= 128");
661
662
662
663
struct vk_op_push_constants {
663
664
uint32_t KX;
@@ -756,6 +757,14 @@ struct vk_op_rope_push_constants {
756
757
struct vk_op_soft_max_push_constants {
757
758
uint32_t KX;
758
759
uint32_t KY;
760
+ uint32_t ne00;
761
+ uint32_t ne01;
762
+ uint32_t ne02;
763
+ uint32_t ne12;
764
+ uint32_t ne13;
765
+ uint32_t nb11;
766
+ uint32_t nb12;
767
+ uint32_t nb13;
759
768
float scale;
760
769
float max_bias;
761
770
float m0;
@@ -6040,7 +6049,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
6040
6049
GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
6041
6050
6042
6051
const uint32_t nem1 = mask ? mask->ne[1] : 0;
6043
- const uint32_t nbm1 = mask ? mask->nb[1 ] : 0;
6052
+ const uint32_t nem2 = mask ? mask->ne[2 ] : 0;
6044
6053
6045
6054
const uint32_t D = neq0;
6046
6055
uint32_t N = neq1;
@@ -6203,7 +6212,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
6203
6212
// Try to use split_k when KV is large enough to be worth the overhead
6204
6213
if (workgroups_x == 1 && shader_core_count > 0 && KV >= 512) {
6205
6214
// Try to run two workgroups per SM.
6206
- split_k = ctx->device->shader_core_count * 2 / workgroups_y;
6215
+ split_k = ctx->device->shader_core_count * 2 / ( workgroups_y * workgroups_z) ;
6207
6216
if (split_k > 1) {
6208
6217
// Try to evenly split KV into split_k chunks, but it needs to be a multiple
6209
6218
// of "align", so recompute split_k based on that.
@@ -6213,9 +6222,9 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
6213
6222
}
6214
6223
}
6215
6224
6216
- // Reserve space for split_k temporaries. For each split, we need to store the O matrix (D x ne1)
6217
- // and the per-row m and L values (ne1 rows).
6218
- const uint64_t split_k_size = split_k > 1 ? (D * ne1 * sizeof(float) + ne1 * sizeof(float) * 2) * split_k : 0;
6225
+ // Reserve space for split_k temporaries. For each split x batch, we need to store the O matrix (D x ne1)
6226
+ // and the per-row m and L values (ne1 rows). We store all the matrices first, followed by the rows.
6227
+ const uint64_t split_k_size = split_k > 1 ? (D * ne1 * sizeof(float) + ne1 * sizeof(float) * 2) * split_k * ne3 : 0;
6219
6228
if (split_k_size > ctx->device->max_memory_allocation_size) {
6220
6229
GGML_ABORT("Requested preallocation size is too large");
6221
6230
}
@@ -6307,11 +6316,10 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
6307
6316
(uint32_t)neq2, (uint32_t)neq3,
6308
6317
(uint32_t)nek2, (uint32_t)nek3,
6309
6318
(uint32_t)nev2, (uint32_t)nev3,
6310
- nem1,
6319
+ nem1, nem2,
6311
6320
q_stride, (uint32_t)nbq2, (uint32_t)nbq3,
6312
6321
k_stride, (uint32_t)nbk2, (uint32_t)nbk3,
6313
6322
v_stride, (uint32_t)nbv2, (uint32_t)nbv3,
6314
- nbm1,
6315
6323
scale, max_bias, logit_softcap,
6316
6324
mask != nullptr, n_head_log2, m0, m1,
6317
6325
gqa_ratio, split_kv, split_k };
@@ -6334,13 +6342,13 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
6334
6342
pc, { workgroups_x * pipeline->wg_denoms[0], workgroups_y, workgroups_z });
6335
6343
6336
6344
ggml_vk_sync_buffers(subctx);
6337
- const std::array<uint32_t, 3 > pc2 = { D, (uint32_t)ne1, split_k };
6345
+ const std::array<uint32_t, 4 > pc2 = { D, (uint32_t)ne1, (uint32_t)ne3 , split_k };
6338
6346
ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_flash_attn_split_k_reduce,
6339
6347
{
6340
6348
vk_subbuffer{ctx->prealloc_split_k, 0, VK_WHOLE_SIZE},
6341
6349
vk_subbuffer{d_D, d_buf_offset, VK_WHOLE_SIZE},
6342
6350
},
6343
- pc2, { (uint32_t)ne1, 1, 1 });
6351
+ pc2, { (uint32_t)ne1, 1, (uint32_t)ne3 });
6344
6352
} else {
6345
6353
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline,
6346
6354
{
@@ -7666,7 +7674,13 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context& subctx,
7666
7674
const uint32_t nrows_x = (uint32_t)ggml_nrows(src0);
7667
7675
const uint32_t nrows_y = (uint32_t)src0->ne[1];
7668
7676
7669
- const uint32_t n_head_kv = nrows_x/nrows_y;
7677
+ const uint32_t ne12 = src1 ? (uint32_t)(src1->ne[2]) : 0u;
7678
+ const uint32_t ne13 = src1 ? (uint32_t)(src1->ne[3]) : 0u;
7679
+ const uint32_t nb11 = src1 ? (uint32_t)(src1->nb[1] / src1->nb[0]) : 0u;
7680
+ const uint32_t nb12 = src1 ? (uint32_t)(src1->nb[2] / src1->nb[0]) : 0u;
7681
+ const uint32_t nb13 = src1 ? (uint32_t)(src1->nb[3] / src1->nb[0]) : 0u;
7682
+
7683
+ const uint32_t n_head_kv = src0->ne[2];
7670
7684
const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv));
7671
7685
7672
7686
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
@@ -7675,6 +7689,9 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context& subctx,
7675
7689
ggml_vk_op_f32<vk_op_soft_max_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_SOFT_MAX, {
7676
7690
ncols,
7677
7691
src1 != nullptr ? nrows_y : (uint32_t)0,
7692
+ (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],
7693
+ ne12, ne13,
7694
+ nb11, nb12, nb13,
7678
7695
scale, max_bias,
7679
7696
m0, m1,
7680
7697
n_head_log2,
@@ -10248,11 +10265,6 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
10248
10265
if (op->src[3] && op->src[3]->type != GGML_TYPE_F16) {
10249
10266
return false;
10250
10267
}
10251
- // TODO: support broadcast
10252
- // ref: https://github.com/ggml-org/llama.cpp/pull/14435
10253
- if (op->src[0]->ne[3] != 1) {
10254
- return false;
10255
- }
10256
10268
// It's straightforward to support different K/V dequant, but would
10257
10269
// significantly increase the number of pipelines
10258
10270
if (op->src[1]->type != op->src[2]->type) {
@@ -10413,13 +10425,6 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
10413
10425
case GGML_OP_DIAG_MASK_INF:
10414
10426
return true;
10415
10427
case GGML_OP_SOFT_MAX:
10416
- // TODO: support batching
10417
- if (op->src[0]->ne[3] != 1) {
10418
- return false;
10419
- }
10420
- // TODO: support broadcast
10421
- // ref: https://github.com/ggml-org/llama.cpp/pull/14435
10422
- return !op->src[1] || (op->src[1]->ne[2] == 1 && op->src[1]->ne[3] == 1);
10423
10428
case GGML_OP_SOFT_MAX_BACK:
10424
10429
case GGML_OP_ARGSORT:
10425
10430
case GGML_OP_SUM:
0 commit comments