Skip to content

Commit 91523fb

Browse files
fix CUDA mul mat for V on CPU and bs % 32 != 0
1 parent 086c6d6, commit 91523fb

File tree

1 file changed

+7
-4
lines changed

1 file changed

+7
-4
lines changed

ggml-cuda.cu

Lines changed: 7 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -5609,7 +5609,7 @@ inline void ggml_cuda_op_mul_mat_q(
56095609
// nrows_dst == nrows of the matrix that the dequantize_mul_mat kernel writes into
56105610
const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : i01_diff;
56115611

5612-
const int nchannels = buffers_contiguous ? 1 : ne02;
5612+
const int64_t nchannels = buffers_contiguous ? 1 : ne02;
56135613

56145614
const int64_t padded_row_size = ne10 % MATRIX_ROW_PADDING == 0 ?
56155615
ne10 : ne10 - ne10 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
@@ -5620,9 +5620,11 @@ inline void ggml_cuda_op_mul_mat_q(
56205620
quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne10, ne11, padded_row_size, nchannels,
56215621
src1_row_stride, src1_channel_stride, cudaStream_main);
56225622

5623-
const int row_stride = buffers_contiguous ? ne10 / ggml_blck_size(src0->type) : nb01 / ggml_type_size(src0->type);
5624-
const int channel_stride_x = buffers_contiguous ? ne10*ne11 / ggml_blck_size(src0->type) : nb02 / ggml_type_size(src0->type);
5625-
const int channel_stride_y = padded_row_size*ne11 / QK8_1;
5623+
const int64_t src0_blck_size = ggml_blck_size(src0->type);
5624+
const int64_t ne10_whole_blck = ne10 % src0_blck_size == 0 ? ne10 : ne10 - ne10 % src0_blck_size + src0_blck_size;
5625+
const int64_t row_stride = buffers_contiguous ? ne10_whole_blck / ggml_blck_size(src0->type) : nb01 / ggml_type_size(src0->type);
5626+
const int64_t channel_stride_x = buffers_contiguous ? ne10_whole_blck*ne11 / ggml_blck_size(src0->type) : nb02 / ggml_type_size(src0->type);
5627+
const int64_t channel_stride_y = padded_row_size*ne11 / QK8_1;
56265628

56275629
switch (src0->type) {
56285630
case GGML_TYPE_Q4_0:
@@ -6221,6 +6223,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
62216223
if (src0_is_f32) {
62226224
src0_ddf[id] = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_asf[id]);
62236225
} else {
6226+
GGML_ASSERT(ne00 % ggml_blck_size(src0->type) == 0);
62246227
const int64_t nelements = row_diff*ne00;
62256228
const int64_t nelements_padded = ne00 % MATRIX_ROW_PADDING == 0 ?
62266229
nelements : nelements - ne00 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;

0 commit comments

Comments (0)