@@ -2244,7 +2244,8 @@ static inline void prepare_row_mappigs(ggml_backend_cuda_context& ctx, int64_t n
 
     for (int i = 0; i < (int)n_as; ++i) cum_moe_counts[i] -= moe_counts[i];
 
-    CUDA_CHECK(cudaMemcpyAsync(dev_row_mapping.get(), rmapping.data(), cum_moe_counts[n_as]*sizeof(mmid_row_mapping), cudaMemcpyHostToDevice, stream));
+    CUDA_CHECK(cudaMemcpyAsync(dev_row_mapping.get(), rmapping.data(),
+            cum_moe_counts[n_as]*sizeof(mmid_row_mapping), cudaMemcpyHostToDevice, stream));
     CUDA_CHECK(cudaStreamSynchronize(stream));
 
 }
@@ -2254,6 +2255,8 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
     const ggml_tensor * src1 = dst->src[1];
     const ggml_tensor * ids  = dst->src[2];
 
+    CUDA_CHECK(cudaMemset((char *)dst->data, 0, ggml_nbytes(dst)));
+
     if (src1->ne[1] == 1 && src1->ne[2] == 1 && src1->ne[3] == 1 &&
         ggml_is_quantized(src0->type) &&
         ggml_backend_buffer_is_cuda(src0->buffer) &&
@@ -2519,13 +2522,16 @@ static bool ggml_cuda_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_tensor
         auto local_src0 = *next->src[0];
         local_src0.ne[2] = local_src0.ne[3] = 1;
 
+        CUDA_CHECK(cudaMemset(next->data, 0, ggml_nbytes(next)));
+
         ggml_cuda_op_mul_mat_vec_q_id(ctx, &local_src0, &local_src1, ids, &local_next,
                 (const char *)next->src[0]->data, nullptr, dst_quantized.get(), (float *)next->data,
                 0, next->src[0]->ne[1], 1, dst_padded_col_size, stream);
         CUDA_CHECK(cudaGetLastError());
 
         return true;
     } else {
+        CUDA_CHECK(cudaMemset(dst->data, 0, ggml_nbytes(dst)));
         ggml_fused_mul_unary(ctx, (ggml_unary_op)dst->op_params[0], ggml_nelements(dst),
                 (const float *)dst_gate_contiguous.get(), (const float *)dst_up_contiguous.get(), (float *)dst->data);
         CUDA_CHECK(cudaGetLastError());
@@ -2534,7 +2540,6 @@ static bool ggml_cuda_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_tensor
     }
 }
 
-
     GGML_TENSOR_BINARY_OP_LOCALS
 
     GGML_ASSERT(!ggml_backend_buffer_is_cuda_split(src0_1->buffer) && "mul_mat_id does not support split buffers");
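For context beyond the diff itself: the three added `cudaMemset` calls share one pattern, clearing the whole destination tensor (`ggml_nbytes(dst)` bytes) before the indirect mul_mat_id / fused up-gate paths write into it. A plausible reading, stated here as an assumption rather than taken from the commit message, is that with expert routing some output rows may never be written by any expert, so the buffer must not be left holding stale device memory. A minimal standalone sketch of that pattern follows; the helper name is hypothetical and `CUDA_CHECK` is re-defined locally rather than taken from ggml-cuda:

```cpp
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Local stand-in for the codebase's CUDA_CHECK error-checking macro.
#define CUDA_CHECK(call)                                                     \
    do {                                                                     \
        cudaError_t err_ = (call);                                           \
        if (err_ != cudaSuccess) {                                           \
            fprintf(stderr, "CUDA error %s at %s:%d\n",                      \
                    cudaGetErrorString(err_), __FILE__, __LINE__);           \
            exit(1);                                                         \
        }                                                                    \
    } while (0)

// Hypothetical helper illustrating the zero-init pattern: clear the output
// buffer before per-expert results are scattered into it, so rows no expert
// touches read back as 0 instead of whatever the allocation previously held.
// The patch uses the synchronous cudaMemset variant, mirrored here.
static void zero_init_output(void * dst_data, size_t nbytes) {
    CUDA_CHECK(cudaMemset(dst_data, 0, nbytes));
}
```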