@@ -2113,7 +2113,7 @@ static bool ggml_cuda_should_fuse_mul_mat_vec_f(const ggml_tensor * tensor) {
         src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
 
     const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
-    use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, is_mul_mat_id ? src1->ne[2] : src1->ne[1]);
+    use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src0->nb, is_mul_mat_id ? src1->ne[2] : src1->ne[1]);
 
     const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft) ||
                        ggml_backend_buft_is_cuda_split(src1->buffer->buft);
@@ -2207,16 +2207,16 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
             const int cc        = ggml_cuda_info().devices[id].cc;
             const int warp_size = ggml_cuda_info().devices[id].warp_size;
             use_mul_mat_q     = use_mul_mat_q     && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
-            use_mul_mat_f     = use_mul_mat_f     && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src1->ne[1], /*mul_mat_id=*/false);
-            use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src1->ne[1]);
+            use_mul_mat_f     = use_mul_mat_f     && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src0->nb, src1->ne[1], /*mul_mat_id=*/false);
+            use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src0->nb, src1->ne[1]);
             any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
         }
     } else {
         const int cc        = ggml_cuda_info().devices[ctx.device].cc;
         const int warp_size = ggml_cuda_info().devices[ctx.device].warp_size;
         use_mul_mat_q     = use_mul_mat_q     && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
-        use_mul_mat_f     = use_mul_mat_f     && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src1->ne[1], /*mul_mat_id=*/false);
-        use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src1->ne[1]);
+        use_mul_mat_f     = use_mul_mat_f     && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src0->nb, src1->ne[1], /*mul_mat_id=*/false);
+        use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src0->nb, src1->ne[1]);
         any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
     }
 
@@ -2287,7 +2287,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
22872287 return ;
22882288 }
22892289
2290- if (ggml_cuda_should_use_mmf (src0->type , cc, WARP_SIZE, src0->ne , src1->ne [2 ], /* mul_mat_id=*/ true )) {
2290+ if (ggml_cuda_should_use_mmf (src0->type , cc, WARP_SIZE, src0->ne , src0-> nb , src1->ne [2 ], /* mul_mat_id=*/ true )) {
22912291 ggml_cuda_mul_mat_f (ctx, src0, src1, ids, dst);
22922292 return ;
22932293 }
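
The common thread in all three hunks is that the dispatch heuristics `ggml_cuda_should_use_mmf` and `ggml_cuda_should_use_mmvf` now also receive `src0->nb`, the per-dimension byte strides of the weight tensor, alongside the element counts in `src0->ne`. A plausible reason is that the kernels assume a dense row layout, so the heuristic can now reject tensors whose strides do not match that layout instead of selecting a kernel that would read them incorrectly. Below is a minimal sketch of such a stride check; the function name and the exact rule are assumptions for illustration, not the actual llama.cpp implementation.

```cpp
#include <cstddef>
#include <cstdint>

// Hypothetical sketch of a stride gate inside the decision function.
// type_size stands in for ggml_type_size(src0->type).
bool ggml_cuda_should_use_mmvf_sketch(size_t type_size,
                                      const int64_t * ne,   // src0->ne: elements per dim
                                      const size_t  * nb,   // src0->nb: byte strides per dim
                                      int64_t ncols_dst) {
    // Assumed check: the kernel reads each row as a dense array, so the
    // innermost stride must equal the element size and the row stride must
    // cover a whole row; a transposed or permuted src0 fails this test and
    // falls back to another matrix-multiplication path.
    if (nb[0] != type_size || nb[1] < ne[0] * type_size) {
        return false;
    }
    // ... the existing heuristics on type, compute capability, and
    // ncols_dst would follow here, unchanged ...
    (void) ncols_dst;
    return true;
}
```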