@@ -2244,7 +2244,8 @@ static inline void prepare_row_mappigs(ggml_backend_cuda_context& ctx, int64_t n
 
     for (int i = 0; i < (int)n_as; ++i) cum_moe_counts[i] -= moe_counts[i];
 
-    CUDA_CHECK(cudaMemcpyAsync(dev_row_mapping.get(), rmapping.data(), cum_moe_counts[n_as]*sizeof(mmid_row_mapping), cudaMemcpyHostToDevice, stream));
+    CUDA_CHECK(cudaMemcpyAsync(dev_row_mapping.get(), rmapping.data(),
+            cum_moe_counts[n_as]*sizeof(mmid_row_mapping), cudaMemcpyHostToDevice, stream));
     CUDA_CHECK(cudaStreamSynchronize(stream));
 
 }
@@ -2254,6 +2255,8 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
     const ggml_tensor * src1 = dst->src[1];
     const ggml_tensor * ids  = dst->src[2];
 
+    CUDA_CHECK(cudaMemset((char *)dst->data, 0, ggml_nbytes(dst)));
+
     if (src1->ne[1] == 1 && src1->ne[2] == 1 && src1->ne[3] == 1 &&
         ggml_is_quantized(src0->type) &&
         ggml_backend_buffer_is_cuda(src0->buffer) &&
@@ -2519,13 +2522,16 @@ static bool ggml_cuda_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_tensor
         auto local_src0 = *next->src[0];
         local_src0.ne[2] = local_src0.ne[3] = 1;
 
+        CUDA_CHECK(cudaMemset(next->data, 0, ggml_nbytes(next)));
+
         ggml_cuda_op_mul_mat_vec_q_id(ctx, &local_src0, &local_src1, ids, &local_next,
                 (const char *)next->src[0]->data, nullptr, dst_quantized.get(), (float *)next->data,
                 0, next->src[0]->ne[1], 1, dst_padded_col_size, stream);
         CUDA_CHECK(cudaGetLastError());
 
         return true;
     } else {
+        CUDA_CHECK(cudaMemset(dst->data, 0, ggml_nbytes(dst)));
         ggml_fused_mul_unary(ctx, (ggml_unary_op)dst->op_params[0], ggml_nelements(dst),
                 (const float *)dst_gate_contiguous.get(), (const float *)dst_up_contiguous.get(), (float *)dst->data);
         CUDA_CHECK(cudaGetLastError());
@@ -2534,7 +2540,6 @@ static bool ggml_cuda_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_tensor
     }
 }
 
-
     GGML_TENSOR_BINARY_OP_LOCALS
 
     GGML_ASSERT(!ggml_backend_buffer_is_cuda_split(src0_1->buffer) && "mul_mat_id does not support split buffers");
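For context beyond the diff itself: the three added `cudaMemset` calls share one pattern, clearing the whole destination tensor (`ggml_nbytes(dst)` bytes) before the indirect mul_mat_id / fused up-gate paths write into it. A plausible reading, stated here as an assumption rather than taken from the commit message, is that with expert routing some output rows may never be written by any expert, so the buffer must not be left holding stale device memory. A minimal standalone sketch of that pattern follows; the helper name is hypothetical and `CUDA_CHECK` is re-defined locally rather than taken from ggml-cuda:

```cpp
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Local stand-in for the codebase's CUDA_CHECK error-checking macro.
#define CUDA_CHECK(call)                                                     \
    do {                                                                     \
        cudaError_t err_ = (call);                                           \
        if (err_ != cudaSuccess) {                                           \
            fprintf(stderr, "CUDA error %s at %s:%d\n",                      \
                    cudaGetErrorString(err_), __FILE__, __LINE__);           \
            exit(1);                                                         \
        }                                                                    \
    } while (0)

// Hypothetical helper illustrating the zero-init pattern: clear the output
// buffer before per-expert results are scattered into it, so rows no expert
// touches read back as 0 instead of whatever the allocation previously held.
// The patch uses the synchronous cudaMemset variant, mirrored here.
static void zero_init_output(void * dst_data, size_t nbytes) {
    CUDA_CHECK(cudaMemset(dst_data, 0, nbytes));
}
```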