WIP

am17an · am17an · commit ef28ed24133a · 2025-10-11T15:02:33.000+08:00
diff --git a/ggml/src/ggml-cuda/mmf.cu b/ggml/src/ggml-cuda/mmf.cu
@@ -136,10 +136,11 @@ bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const
     }
 
     if (mul_mat_id) {
-        if (type == GGML_TYPE_F32 && src1_ncols > 512) {
+        if (src0_ne[1] <= 1024 && src1_ncols > 512) {
             return false;
-        }
-        if ((type == GGML_TYPE_F16 || type == GGML_TYPE_BF16) && src1_ncols > 512) {
+        } else if(src0_ne[1] > 1024 && src1_ncols > 128) {
+            return false;
+        } else {
             return false;
         }
     } else {
diff --git a/ggml/src/ggml-cuda/mmf.cuh b/ggml/src/ggml-cuda/mmf.cuh
@@ -495,7 +495,9 @@ static inline void mul_mat_f_switch_ids(
         const mmf_ids_data * ids_data) {
     const bool has_ids_data = ids_data && ids_data->ids_src_compact;
 
-    if (has_ids_data) {
+    // Use the compact-ids kernel only for larger tiles; for small ncols_dst (<= 16)
+    // we prefer the normal mul_mat_f path with has_ids=true.
+    if (has_ids_data && ncols_dst > 16) {
         const int max_tiles = (int) ((ncols_dst + cols_per_block - 1) / cols_per_block);
         if (max_tiles == 0) {
             return;
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
@@ -6819,15 +6819,6 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
         }
     }
 
-    // liquid 1b-8b
-    for (int bs : {1, 4, 8, 32, 64, 128, 256, 512}) {
-        for (ggml_type type_a : {GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_Q4_0, GGML_TYPE_Q8_0, GGML_TYPE_Q4_K, GGML_TYPE_Q6_K, GGML_TYPE_IQ2_XS}) {
-            for (ggml_type type_b : {GGML_TYPE_F32}) {
-                test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, 32, 4, false, 1792, bs, 2048, 1));
-            }
-        }
-    }
-
     // gpt-oss-20b
     for (int bs : {1, 4, 8, 512}) {
         for (ggml_type type_a : {GGML_TYPE_MXFP4}) {

Original file line number	Diff line number	Diff line change
`@@ -136,10 +136,11 @@ bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const`
`136`	`136`	`}`
`137`	`137`
`138`	`138`	`if (mul_mat_id) {`
`139`		`- if (type == GGML_TYPE_F32 && src1_ncols > 512) {`
	`139`	`+ if (src0_ne[1] <= 1024 && src1_ncols > 512) {`
`140`	`140`	`return false;`
`141`		`- }`
`142`		`- if ((type == GGML_TYPE_F16 \|\| type == GGML_TYPE_BF16) && src1_ncols > 512) {`
	`141`	`+ } else if(src0_ne[1] > 1024 && src1_ncols > 128) {`
	`142`	`+ return false;`
	`143`	`+ } else {`
`143`	`144`	`return false;`
`144`	`145`	`}`
`145`	`146`	`} else {`