
Commit 9999fa2

Nexesenex and ikawrakow committed

Allow q8_0 KV cache for head size 256 #330

Co-Authored-By: Kawrakow <iwankawrakow@gmail.com>

1 parent 99622da · commit 9999fa2

Showing 8 changed files with 47 additions and 0 deletions.
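This commit enables a Q8_0-quantized KV cache on the CUDA FlashAttention path for models whose attention head size is 256 (for example some Gemma models). As a usage note: in llama.cpp-derived builds such as this one, the combination is typically requested at run time with the flash-attention and cache-type flags, e.g. "llama-cli -m model.gguf -fa -ctk q8_0 -ctv q8_0" (the short forms of --cache-type-k/--cache-type-v). Flag names can vary between forks, so check --help for the build at hand.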

ggml/src/ggml-cuda/fattn.cu (12 additions, 0 deletions)
@@ -185,6 +185,9 @@ static void ggml_cuda_flash_attn_ext_vec_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
     FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16)
     //FATTN_VEC_F16_CASE(128, GGML_TYPE_IQ4_NL, GGML_TYPE_F16)

+    FATTN_VEC_F16_CASE(256, GGML_TYPE_Q8_0, GGML_TYPE_Q8_0)
+    // FATTN_VEC_F16_CASE(256, GGML_TYPE_Q8_0, GGML_TYPE_F16)
+    FATTN_VEC_F16_CASE(256, GGML_TYPE_F16, GGML_TYPE_Q8_0)
     FATTN_VEC_F16_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16)

     //FATTN_VEC_F16_CASE(256, GGML_TYPE_F16, GGML_TYPE_IQ4_NL)
@@ -199,6 +202,9 @@ static void ggml_cuda_flash_attn_ext_vec_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
     FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_F16)
     FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q8_0)
     FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16)
+    FATTN_VEC_F16_CASE(256, GGML_TYPE_Q8_0, GGML_TYPE_Q8_0)
+    // FATTN_VEC_F16_CASE(256, GGML_TYPE_Q8_0, GGML_TYPE_F16)
+    FATTN_VEC_F16_CASE(256, GGML_TYPE_F16, GGML_TYPE_Q8_0)
     FATTN_VEC_F16_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16)

     //FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_IQ4_NL)
@@ -314,6 +320,9 @@ static void ggml_cuda_flash_attn_ext_vec_f32(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
     FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16)
     //FATTN_VEC_F32_CASE(128, GGML_TYPE_IQ4_NL, GGML_TYPE_F16)

+    FATTN_VEC_F32_CASE(256, GGML_TYPE_Q8_0, GGML_TYPE_Q8_0)
+    // FATTN_VEC_F32_CASE(256, GGML_TYPE_Q8_0, GGML_TYPE_F16)
+    FATTN_VEC_F32_CASE(256, GGML_TYPE_F16, GGML_TYPE_Q8_0)
     FATTN_VEC_F32_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16)

     //FATTN_VEC_F32_CASE(256, GGML_TYPE_F16, GGML_TYPE_IQ4_NL)
@@ -327,6 +336,9 @@ static void ggml_cuda_flash_attn_ext_vec_f32(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
     FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_F16)
     FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q8_0)
     FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16)
+    FATTN_VEC_F32_CASE(256, GGML_TYPE_Q8_0, GGML_TYPE_Q8_0)
+    // FATTN_VEC_F32_CASE(256, GGML_TYPE_Q8_0, GGML_TYPE_F16)
+    FATTN_VEC_F32_CASE(256, GGML_TYPE_F16, GGML_TYPE_Q8_0)
     FATTN_VEC_F32_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16)

     //FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_IQ4_NL)
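For context, FATTN_VEC_F16_CASE and its F32 twin are dispatch macros: each line compares the requested head size and the K/V cache tensor types against one prebuilt kernel instantiation and, on a match, launches it and returns. A minimal compilable sketch of the pattern, using stand-in names rather than the exact ggml definitions:

// Sketch only (illustrative names, not the real ggml types/functions): how a
// FATTN_VEC_F16_CASE-style dispatcher picks one prebuilt kernel instantiation.
#include <cstdint>
#include <cstdio>

enum sketch_type { TYPE_F16, TYPE_Q8_0 };

struct sketch_tensor {
    int64_t     ne0;   // head size in this sketch
    sketch_type type;  // cache quantization type
};

// Stand-in for the templated launcher the real macro calls.
template <int D, sketch_type type_K, sketch_type type_V>
static void flash_attn_vec_case() {
    printf("dispatching head-size-%d kernel\n", D);
}

#define FATTN_VEC_CASE(D, tK, tV)                           \
    if (Q.ne0 == (D) && K.type == (tK) && V.type == (tV)) { \
        flash_attn_vec_case<D, tK, tV>();                   \
        return;                                             \
    }

static void dispatch(const sketch_tensor & Q, const sketch_tensor & K, const sketch_tensor & V) {
    FATTN_VEC_CASE(256, TYPE_Q8_0, TYPE_Q8_0)
    FATTN_VEC_CASE(256, TYPE_F16,  TYPE_Q8_0)
    FATTN_VEC_CASE(256, TYPE_F16,  TYPE_F16)
    printf("no matching instantiation\n");
}

int main() {
    sketch_tensor Q{256, TYPE_F16}, K{256, TYPE_Q8_0}, V{256, TYPE_Q8_0};
    dispatch(Q, K, V); // matches the first (Q8_0/Q8_0) case
}

Note that the Q8_0-K / F16-V line is added commented out in the diff above, so only the Q8_0/Q8_0 and F16/Q8_0 head-size-256 instantiations are actually dispatched here.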

ggml/src/ggml-cuda/ggml-cuda.cu (5 additions, 0 deletions)

@@ -3529,6 +3529,11 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op)
                 // different head sizes of K and V are not supported yet
                 return false;
             }
+            if (op->src[1]->ne[0] == 256 && op->src[2]->ne[0] == 256 &&
+                (op->src[1]->type == GGML_TYPE_F16 || op->src[1]->type == GGML_TYPE_Q8_0) &&
+                (op->src[2]->type == GGML_TYPE_F16 || op->src[2]->type == GGML_TYPE_Q8_0)) {
+                return true;
+            }
             if (op->src[0]->ne[0] == 192) {
                 return false;
             }
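The supports_op change mirrors the new instantiations: for the flash-attention op, K is src[1] and V is src[2], and the op is now accepted whenever both have head size 256 and each is either F16 or Q8_0 (all four combinations, even though the Q8_0-K/F16-V kernel is left commented out in the dispatcher above). Restated as a standalone predicate, with stand-in enums rather than ggml code:

// Illustrative restatement of the new head-size-256 acceptance check.
#include <cstdio>

enum kv_cache_type { KV_F16, KV_Q8_0, KV_OTHER };

static bool fattn_hs256_supported(kv_cache_type type_k, kv_cache_type type_v) {
    const bool k_ok = type_k == KV_F16 || type_k == KV_Q8_0;
    const bool v_ok = type_v == KV_F16 || type_v == KV_Q8_0;
    return k_ok && v_ok; // any F16/Q8_0 mix passes; anything else falls through
}

int main() {
    printf("q8_0/q8_0  -> %d\n", fattn_hs256_supported(KV_Q8_0, KV_Q8_0));  // 1
    printf("q8_0/other -> %d\n", fattn_hs256_supported(KV_Q8_0, KV_OTHER)); // 0
}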
ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-q8_0.cu (new file, 5 additions)

@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(256, GGML_TYPE_F16, GGML_TYPE_Q8_0);
ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-q8_0-f16.cu (new file, 5 additions)

@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(256, GGML_TYPE_Q8_0, GGML_TYPE_F16);
ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-q8_0-q8_0.cu (new file, 5 additions)

@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(256, GGML_TYPE_Q8_0, GGML_TYPE_Q8_0);
ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-q8_0.cu (new file, 5 additions)

@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(256, GGML_TYPE_F16, GGML_TYPE_Q8_0);
ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-q8_0-f16.cu (new file, 5 additions)

@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(256, GGML_TYPE_Q8_0, GGML_TYPE_F16);
ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-q8_0-q8_0.cu (new file, 5 additions)

@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(256, GGML_TYPE_Q8_0, GGML_TYPE_Q8_0);
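The six new files are one-line explicit template instantiations, generated by generate_cu_files.py and split across separate translation units so the expensive kernel compilations can run in parallel and the dispatcher in fattn.cu only needs declarations. A DECL_FATTN_VEC_F16_CASE-style macro is, roughly, an explicit instantiation of the templated launcher; a compilable sketch with stand-in names (not the ggml definitions):

// Sketch only: what a DECL_FATTN_VEC_F16_CASE-style macro boils down to -- an
// explicit template instantiation that forces one concrete launcher to be
// compiled into this translation unit.
#include <cstdio>

enum sketch_type { TYPE_F16, TYPE_Q8_0 };

// In the real code, this template (and the CUDA kernel it launches) lives in
// fattn-vec-f16.cuh / fattn-vec-f32.cuh, which every instance file includes.
template <int D, sketch_type type_K, sketch_type type_V>
void flash_attn_vec_case() {
    printf("instantiated launcher for head size %d\n", D);
}

#define DECL_FATTN_VEC_CASE(D, tK, tV) \
    template void flash_attn_vec_case<D, tK, tV>()

// One line per file, mirroring the six new files above.
DECL_FATTN_VEC_CASE(256, TYPE_Q8_0, TYPE_Q8_0);

int main() {
    flash_attn_vec_case<256, TYPE_Q8_0, TYPE_Q8_0>();
}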
