Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ggml : group all experts in a single ggml_mul_mat_id #6505

Merged
merged 22 commits into from
Apr 18, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
cuda : fix bin bcast with non-cont src0
  • Loading branch information
slaren committed Apr 17, 2024
commit d68c935c8d56603d751a6708c1803d5931b66954
56 changes: 42 additions & 14 deletions ggml-cuda/binbcast.cu
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ static __global__ void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst
int ne0, int ne1, int ne2, int ne3,
int ne10, int ne11, int ne12, int ne13,
/*int s0, */ int s1, int s2, int s3,
/*int s01,*/ int s01, int s02, int s03,
/*int s10,*/ int s11, int s12, int s13) {
const int i0s = blockDim.x*blockIdx.x + threadIdx.x;
const int i1 = (blockDim.y*blockIdx.y + threadIdx.y);
Expand All @@ -36,9 +37,9 @@ static __global__ void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst
const int i12 = i2 % ne12;
const int i13 = i3 % ne13;

const size_t i_src0 = i3*s3 + i2*s2 + i1*s1;
const size_t i_src0 = i3*s03 + i2*s02 + i1*s01;
const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
const size_t i_dst = i_src0;
const size_t i_dst = i3*s3 + i2*s2 + i1*s1;

const src0_t * src0_row = src0 + i_src0;
const src1_t * src1_row = src1 + i_src1;
Expand All @@ -55,6 +56,7 @@ static __global__ void k_bin_bcast_unravel(const src0_t * src0, const src1_t * s
int ne0, int ne1, int ne2, int ne3,
int ne10, int ne11, int ne12, int ne13,
/*int s0, */ int s1, int s2, int s3,
/*int s01,*/ int s01, int s02, int s03,
/*int s10,*/ int s11, int s12, int s13) {

const int i = blockDim.x*blockIdx.x + threadIdx.x;
Expand All @@ -72,9 +74,9 @@ static __global__ void k_bin_bcast_unravel(const src0_t * src0, const src1_t * s
const int i12 = i2 % ne12;
const int i13 = i3 % ne13;

const size_t i_src0 = i3*s3 + i2*s2 + i1*s1;
const size_t i_src0 = i3*s03 + i2*s02 + i1*s01;
const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
const size_t i_dst = i_src0;
const size_t i_dst = i3*s3 + i2*s2 + i1*s1;

const src0_t * src0_row = src0 + i_src0;
const src1_t * src1_row = src1 + i_src1;
Expand Down Expand Up @@ -118,17 +120,20 @@ struct bin_bcast_cuda {
cnb[3] *= cne[3];
};

for (int i = 0; i < 4; i++) {
if (nr[i] != 1) {
break;
}
if (i > 0) {
collapse_nb(cnb0, cne0);
collapse_nb(cnb1, cne1);
collapse(cne0);
collapse(cne1);
if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && ggml_is_contiguous(dst)) {
for (int i = 0; i < 4; i++) {
if (nr[i] != 1) {
break;
}
if (i > 0) {
collapse_nb(cnb0, cne0);
collapse_nb(cnb1, cne1);
collapse(cne0);
collapse(cne1);
}
}
}

{
int64_t ne0 = cne0[0];
int64_t ne1 = cne0[1];
Expand Down Expand Up @@ -160,7 +165,28 @@ struct bin_bcast_cuda {
size_t s12 = nb12 / sizeof(src1_t);
size_t s13 = nb13 / sizeof(src1_t);

size_t s00 = nb00 / sizeof(src0_t);
size_t s01 = nb01 / sizeof(src0_t);
size_t s02 = nb02 / sizeof(src0_t);
size_t s03 = nb03 / sizeof(src0_t);

GGML_ASSERT(nb0 % sizeof(dst_t) == 0);
GGML_ASSERT(nb1 % sizeof(dst_t) == 0);
GGML_ASSERT(nb2 % sizeof(dst_t) == 0);
GGML_ASSERT(nb3 % sizeof(dst_t) == 0);

GGML_ASSERT(nb00 % sizeof(src0_t) == 0);
GGML_ASSERT(nb01 % sizeof(src0_t) == 0);
GGML_ASSERT(nb02 % sizeof(src0_t) == 0);
GGML_ASSERT(nb03 % sizeof(src0_t) == 0);

GGML_ASSERT(nb10 % sizeof(src1_t) == 0);
GGML_ASSERT(nb11 % sizeof(src1_t) == 0);
GGML_ASSERT(nb12 % sizeof(src1_t) == 0);
GGML_ASSERT(nb13 % sizeof(src1_t) == 0);

GGML_ASSERT(s0 == 1);
GGML_ASSERT(s00 == 1);
GGML_ASSERT(s10 == 1);

const int block_size = 128;
Expand All @@ -179,20 +205,22 @@ struct bin_bcast_cuda {
);

if (block_nums.z > 65535) {
// this is the maximum number of blocks in z direction, fallback to 1D grid kernel
// this is the maximum number of blocks in z dimension, fallback to 1D grid kernel
int block_num = (ne0*ne1*ne2*ne3 + block_size - 1) / block_size;
k_bin_bcast_unravel<bin_op><<<block_num, block_size, 0, stream>>>(
src0_dd, src1_dd, dst_dd,
ne0, ne1, ne2, ne3,
ne10, ne11, ne12, ne13,
/* s0, */ s1, s2, s3,
/* s00, */ s01, s02, s03,
/* s10, */ s11, s12, s13);
} else {
k_bin_bcast<bin_op><<<block_nums, block_dims, 0, stream>>>(
src0_dd, src1_dd, dst_dd,
ne0, ne1, ne2, ne3,
ne10, ne11, ne12, ne13,
/* s0, */ s1, s2, s3,
/* s00, */ s01, s02, s03,
/* s10, */ s11, s12, s13);
}
}
Expand Down
8 changes: 5 additions & 3 deletions llama.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6171,16 +6171,18 @@ static struct ggml_tensor * llm_build_moe_ffn(
ggml_tensor * cur_expert = ggml_view_2d(ctx, experts, n_embd, n_tokens,
experts->nb[2], i*experts->nb[1]);

// FIXME: non-contiguous add broken in cuda
cur_expert = ggml_cont(ctx, cur_expert);

if (i == 0) {
moe_out = cur_expert;
} else {
moe_out = ggml_add(ctx, moe_out, cur_expert);
}
}

if (n_expert_used == 1) {
// avoid returning a non-contiguous tensor
moe_out = ggml_cont(ctx, moe_out);
}

return moe_out;
}

Expand Down