@@ -2404,13 +2404,30 @@ static struct ggml_cgraph * llm_build_llama(
2404
2404
}
2405
2405
#endif // GGML_USE_CUBLAS
2406
2406
2407
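+ // KQ_scale - single-element F32 tensor holding 1/sqrt(n_embd/n_head); KQ is multiplied by it to produce KQ_scaled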
2407
2408
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d (ctx0, GGML_TYPE_F32, 1 );
2408
2409
ggml_allocr_alloc (lctx.alloc , KQ_scale);
2409
2410
if (!ggml_allocr_is_measure (lctx.alloc )) {
2410
2411
ggml_set_f32 (KQ_scale, 1 .0f /sqrtf (float (n_embd)/n_head));
2411
2412
}
2412
2413
ggml_set_name (KQ_scale, " 1/sqrt(n_embd_head)" );
2413
2414
2415
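+ // KQ_mask - additive F32 mask of shape [n_past + N, N, 1]: row j holds 0 for columns i <= n_past + j and -INFINITY for i > n_past + j; it is added to KQ_scaled before soft_max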
2416
+ struct ggml_tensor * KQ_mask = ggml_new_tensor_3d (ctx0, GGML_TYPE_F32, n_past + N, N, 1 );
2417
+ ggml_allocr_alloc (lctx.alloc , KQ_mask);
2418
+ if (!ggml_allocr_is_measure (lctx.alloc )) {
2419
+ float * data = (float *) KQ_mask->data ;
2420
+ memset (data, 0 , ggml_nbytes (KQ_mask));
2421
+
2422
+ for (int h = 0 ; h < 1 ; ++h) {
2423
+ for (int j = 0 ; j < N; ++j) {
2424
+ for (int i = n_past + j + 1 ; i < n_past + N; ++i) {
2425
+ data[h*(n_past + N)*N + j*(n_past + N) + i] = -INFINITY;
2426
+ }
2427
+ }
2428
+ }
2429
+ }
2430
+
2414
2431
for (int il = 0 ; il < n_layer; ++il) {
2415
2432
ggml_format_name (inpL, " layer_inp_%d" , il);
2416
2433
@@ -2447,11 +2464,11 @@ static struct ggml_cgraph * llm_build_llama(
2447
2464
offload_func_kq (tmpq);
2448
2465
ggml_set_name (tmpq, " tmpq" );
2449
2466
2450
- struct ggml_tensor * Kcur = ggml_rope_custom_inplace (ctx0, ggml_reshape_3d (ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0 , 0 , freq_base, freq_scale);
2467
+ struct ggml_tensor * Kcur = ggml_rope_custom (ctx0, ggml_reshape_3d (ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0 , 0 , freq_base, freq_scale);
2451
2468
offload_func_kq (Kcur);
2452
2469
ggml_set_name (Kcur, " Kcur" );
2453
2470
2454
- struct ggml_tensor * Qcur = ggml_rope_custom_inplace (ctx0, ggml_reshape_3d (ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0 , 0 , freq_base, freq_scale);
2471
+ struct ggml_tensor * Qcur = ggml_rope_custom (ctx0, ggml_reshape_3d (ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0 , 0 , freq_base, freq_scale);
2455
2472
offload_func_kq (Qcur);
2456
2473
ggml_set_name (Qcur, " Qcur" );
2457
2474
@@ -2502,17 +2519,18 @@ static struct ggml_cgraph * llm_build_llama(
2502
2519
2503
2520
// KQ_scaled = KQ / sqrt(n_embd_head)
2504
2521
// KQ_scaled shape [n_past + N, N, n_head, 1]
2505
- struct ggml_tensor * KQ_scaled = ggml_scale_inplace (ctx0, KQ, KQ_scale);
2522
+ struct ggml_tensor * KQ_scaled = ggml_scale (ctx0, KQ, KQ_scale);
2506
2523
offload_func_kq (KQ_scaled);
2507
2524
ggml_set_name (KQ_scaled, " KQ_scaled" );
2508
2525
2509
2526
// KQ_masked = mask_past(KQ_scaled)
2510
- struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace (ctx0, KQ_scaled, n_past);
2527
+ struct ggml_tensor * KQ_masked = ggml_add (ctx0, KQ_scaled, KQ_mask);
2528
+ // struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
2511
2529
offload_func_kq (KQ_masked);
2512
2530
ggml_set_name (KQ_masked, " KQ_masked" );
2513
2531
2514
2532
// KQ = soft_max(KQ_masked)
2515
- struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace (ctx0, KQ_masked);
2533
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max (ctx0, KQ_masked);
2516
2534
offload_func_v (KQ_soft_max);
2517
2535
ggml_set_name (KQ_soft_max, " KQ_soft_max" );
2518
2536
@@ -2783,8 +2801,8 @@ static struct ggml_cgraph * llm_build_baichaun(
2783
2801
struct ggml_tensor * Qcur;
2784
2802
switch (model.type ) {
2785
2803
case MODEL_7B:
2786
- Kcur = ggml_rope_custom_inplace (ctx0, ggml_reshape_3d (ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0 , 0 , freq_base, freq_scale);
2787
- Qcur = ggml_rope_custom_inplace (ctx0, ggml_reshape_3d (ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0 , 0 , freq_base, freq_scale);
2804
+ Kcur = ggml_rope_custom (ctx0, ggml_reshape_3d (ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0 , 0 , freq_base, freq_scale);
2805
+ Qcur = ggml_rope_custom (ctx0, ggml_reshape_3d (ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0 , 0 , freq_base, freq_scale);
2788
2806
break ;
2789
2807
case MODEL_13B:
2790
2808
Kcur = ggml_reshape_3d (ctx0, tmpk, n_embd/n_head, n_head, N);
@@ -2847,7 +2865,7 @@ static struct ggml_cgraph * llm_build_baichaun(
2847
2865
2848
2866
// KQ_scaled = KQ / sqrt(n_embd_head)
2849
2867
// KQ_scaled shape [n_past + N, N, n_head, 1]
2850
- struct ggml_tensor * KQ_scaled = ggml_scale_inplace (ctx0, KQ, KQ_scale);
2868
+ struct ggml_tensor * KQ_scaled = ggml_scale (ctx0, KQ, KQ_scale);
2851
2869
offload_func_kq (KQ_scaled);
2852
2870
ggml_set_name (KQ_scaled, " KQ_scaled" );
2853
2871
@@ -2856,7 +2874,7 @@ static struct ggml_cgraph * llm_build_baichaun(
2856
2874
2857
2875
switch (model.type ) {
2858
2876
case MODEL_7B:
2859
- KQ_masked = ggml_diag_mask_inf_inplace (ctx0, KQ_scaled, n_past);
2877
+ KQ_masked = ggml_diag_mask_inf (ctx0, KQ_scaled, n_past);
2860
2878
break ;
2861
2879
case MODEL_13B:
2862
2880
KQ_scaled_alibi =ggml_alibi (ctx0, KQ_scaled, n_past, n_head, 8 );
@@ -2867,13 +2885,13 @@ static struct ggml_cgraph * llm_build_baichaun(
2867
2885
GGML_ASSERT (false );
2868
2886
}
2869
2887
// KQ_masked = mask_past(KQ_scaled)
2870
- // struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace (ctx0, KQ_scaled, n_past);
2888
+ // struct ggml_tensor * KQ_masked = ggml_diag_mask_inf (ctx0, KQ_scaled, n_past);
2871
2889
// struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled_alibi, n_past);
2872
2890
// offload_func_kq(KQ_masked);
2873
2891
// ggml_set_name(KQ_masked, "KQ_masked");
2874
2892
2875
2893
// KQ = soft_max(KQ_masked)
2876
- struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace (ctx0, KQ_masked);
2894
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max (ctx0, KQ_masked);
2877
2895
offload_func_v (KQ_soft_max);
2878
2896
ggml_set_name (KQ_soft_max, " KQ_soft_max" );
2879
2897
@@ -3179,9 +3197,9 @@ static struct ggml_cgraph * llm_build_falcon(
3179
3197
offload_func_v (tmpv);
3180
3198
3181
3199
// using mode = 2 for neox mode
3182
- struct ggml_tensor * Qcur = ggml_rope_custom_inplace (ctx0, tmpq, n_past, n_embd_head, 2 , 0 , freq_base, freq_scale);
3200
+ struct ggml_tensor * Qcur = ggml_rope_custom (ctx0, tmpq, n_past, n_embd_head, 2 , 0 , freq_base, freq_scale);
3183
3201
offload_func_kq (Qcur);
3184
- struct ggml_tensor * Kcur = ggml_rope_custom_inplace (ctx0, tmpk, n_past, n_embd_head, 2 , 0 , freq_base, freq_scale);
3202
+ struct ggml_tensor * Kcur = ggml_rope_custom (ctx0, tmpk, n_past, n_embd_head, 2 , 0 , freq_base, freq_scale);
3185
3203
offload_func_kq (Kcur);
3186
3204
3187
3205
{
@@ -3220,15 +3238,15 @@ static struct ggml_cgraph * llm_build_falcon(
3220
3238
offload_func_kq (KQ);
3221
3239
ggml_set_name (KQ, " KQ" );
3222
3240
3223
- struct ggml_tensor * KQ_scaled = ggml_scale_inplace (ctx0, KQ, KQ_scale);
3241
+ struct ggml_tensor * KQ_scaled = ggml_scale (ctx0, KQ, KQ_scale);
3224
3242
offload_func_kq (KQ_scaled);
3225
3243
ggml_set_name (KQ_scaled, " KQ_scaled" );
3226
3244
3227
- struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace (ctx0, KQ_scaled, n_past);
3245
+ struct ggml_tensor * KQ_masked = ggml_diag_mask_inf (ctx0, KQ_scaled, n_past);
3228
3246
offload_func_kq (KQ_masked);
3229
3247
ggml_set_name (KQ_masked, " KQ_masked" );
3230
3248
3231
- struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace (ctx0, KQ_masked);
3249
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max (ctx0, KQ_masked);
3232
3250
offload_func_v (KQ_soft_max);
3233
3251
ggml_set_name (KQ_soft_max, " KQ_soft_max" );
3234
3252
0 commit comments