We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
There was an error while loading. Please reload this page.
1 parent a818f30, commit 3b099bc (Copy full SHA for 3b099bc)
ggml-cuda/mmq.cuh
@@ -2054,15 +2054,13 @@ static __device__ __forceinline__ void mmq_write_back_mma(
2054
static_assert(nwarps*mma_C::I == mmq_y, "nwarps*mma_C::I != mmq_y");
2055
#endif // INT8_MMA_AVAILABLE
2056
2057
- dst += (threadIdx.y % ntx) * mma_C::J*stride;
2058
-
2059
#pragma unroll
2060
for (int j0 = 0; j0 < mmq_x; j0 += ntx*mma_C::J) {
2061
2062
for (int n = 0; n < ntx; ++n) {
2063
2064
for (int l = 0; l < mma_C::ne; ++l) {
2065
- const int j = j0 + mma_C::get_j(l);
+ const int j = j0 + (threadIdx.y % ntx) * mma_C::J + mma_C::get_j(l);
2066
2067
if (j > j_max) {
2068
continue;
0 commit comments