Skip to content

Commit

Permalink
cuda_op_mul_mat
Browse files Browse the repository at this point in the history
  • Loading branch information
JohannesGaessler committed Sep 8, 2023
1 parent 624f06f commit dbddcd1
Showing 1 changed file with 323 additions and 4 deletions.
327 changes: 323 additions & 4 deletions ggml-cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -5891,6 +5891,325 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
}
}

static void ggml_cuda_op_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
ggml_cuda_op_t op, bool src0_needs_f32) {
const int64_t ne00 = src0->ne[0];
const int64_t ne01 = src0->ne[1];
const int64_t ne02 = src0->ne[2];
const int64_t ne03 = src0->ne[3];
const int64_t nrows0 = ggml_nrows(src0);

const bool use_src1 = src1 != nullptr;
const int64_t ne10 = use_src1 ? src1->ne[0] : 1;
const int64_t ne11 = use_src1 ? src1->ne[1] : 1;
const int64_t ne12 = use_src1 ? src1->ne[2] : 1;
const int64_t ne13 = use_src1 ? src1->ne[3] : 1;
const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;

GGML_ASSERT(ne03 == ne13);

const int64_t ne0 = dst->ne[0];
const int64_t ne1 = dst->ne[1];

const int nb2 = dst->nb[2];
const int nb3 = dst->nb[3];

GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT);
GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);

// strides for iteration over dims 3 and 2
const int64_t num_iters_0 = ne02 >= ne12 ? ne02*ne03 : ne12*ne13;
const int64_t num_iters = num_iters_0;
const int64_t stride_mod = 1;
const int64_t src0_stride = ne00 * ne01 * stride_mod;
const int64_t src1_stride = ne10 * ne11 * stride_mod;
const int64_t dst_stride = ne0 * ne1 * stride_mod;

const int64_t rows_per_iter = ne01;
const int64_t i03_max = ne03;
const int64_t i02_max = (ne02 >= ne12 ? ne02 : ne12);
const int64_t i02_divisor = ne02 >= ne12 ? 1 : ne12 / ne02;

const size_t src0_ts = ggml_type_size(src0->type);
const size_t src0_bs = ggml_blck_size(src0->type);

struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
struct ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;

const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
const bool src0_is_contiguous = ggml_is_contiguous(src0);
const bool src0_is_f32 = src0->type == GGML_TYPE_F32;

const bool src1_is_contiguous = use_src1 && ggml_is_contiguous(src1);
const bool src1_stays_on_host = use_src1 && (
dst->op == GGML_OP_SCALE || dst->op == GGML_OP_DIAG_MASK_INF || dst->op == GGML_OP_ROPE);

const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
GGML_ASSERT(!(split && ne02 < ne12));

const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);

// dd = data device
char * src0_ddq[GGML_CUDA_MAX_DEVICES] = {nullptr}; // quantized
float * src0_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr}; // float
float * src1_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr};
float * dst_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr};

// asq = actual size quantized, asf = actual size float
size_t src0_asq[GGML_CUDA_MAX_DEVICES] = {0};
size_t src0_asf[GGML_CUDA_MAX_DEVICES] = {0};
size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0};
size_t dst_asf[GGML_CUDA_MAX_DEVICES] = {0};

// if multiple devices are used they need to wait for the main device
// here an event is recorded that signifies that the main device has finished calculating the input data
if (split && g_device_count > 1) {
CUDA_CHECK(cudaSetDevice(g_main_device));
CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device], g_cudaStreams_main[g_main_device]));
}

for (int id = 0; id < g_device_count; ++id) {
if (!split && id != g_main_device) {
continue;
}

const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU && id == g_main_device;
const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;

int64_t row_low, row_high;
if (split) {
const int64_t rounding = get_row_rounding(src0->type);

row_low = id == 0 ? 0 : nrows0*g_tensor_split[id];
row_low -= row_low % rounding;

if (id == g_device_count - 1) {
row_high = nrows0;
} else {
row_high = nrows0*g_tensor_split[id + 1];
row_high -= row_high % rounding;
}
} else {
row_low = 0;
row_high = nrows0*i02_divisor;
}
if (row_low == row_high) {
continue;
}

int64_t row_diff = row_high - row_low;

cudaSetDevice(id);
cudaStream_t cudaStream_main = g_cudaStreams_main[id];

// wait for main GPU data if necessary
if (split && id != g_main_device) {
CUDA_CHECK(cudaStreamWaitEvent(cudaStream_main, src0_extra->events[g_main_device]));
}

if (src0_on_device && src0_is_contiguous) {
if (src0_is_f32) {
src0_ddf[id] = (float *) src0_extra->data_device[id];
} else {
src0_ddq[id] = (char *) src0_extra->data_device[id];
}
} else {
if (src0_is_f32) {
src0_ddf[id] = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_asf[id]);
} else {
src0_ddq[id] = (char *) ggml_cuda_pool_malloc(row_diff*ne00 * src0_ts/src0_bs, &src0_asq[id]);
}
}

if (src0_needs_f32 && !src0_is_f32) {
src0_ddf[id] = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_asf[id]);
}

if (use_src1 && !src1_stays_on_host) {
if (src1_on_device && src1_is_contiguous) {
src1_ddf[id] = (float *) src1_extra->data_device[id];
} else {
src1_ddf[id] = (float *) ggml_cuda_pool_malloc(num_iters*src1_stride * sizeof(float), &src1_asf[id]);
}
}
if (dst_on_device) {
dst_ddf[id] = (float *) dst_extra->data_device[id];
} else {
size_t size_dst_ddf = split ? row_diff*ne1 * sizeof(float) : num_iters*dst_stride * sizeof(float);
dst_ddf[id] = (float *) ggml_cuda_pool_malloc(size_dst_ddf, &dst_asf[id]);
}

for (int64_t i03 = 0; i03 < i03_max; i03++) {
const int64_t i13 = i03 % ne13;
for (int64_t i02 = 0; i02 < i02_max; i02++) {
const int64_t i12 = i02 % ne12;

const int64_t i0 = i03*i02_max + i02;

// i0 values that contain the lower/upper rows for a split tensor when using multiple GPUs
const int64_t i0_offset_low = row_low/rows_per_iter;
const int64_t i0_offset_high = row_high/rows_per_iter;

int64_t i01_low = 0;
int64_t i01_high = rows_per_iter;
if (split) {
if (i0 < i0_offset_low || i0 > i0_offset_high) {
continue;
}
if (i0 == i0_offset_low) {
i01_low = row_low % rows_per_iter;
}
if (i0 == i0_offset_high) {
i01_high = row_high % rows_per_iter;
}
}

// There is possibly a bug in the Windows nvcc compiler regarding instruction reordering or optimizing out local variables.
// Removing the first assert or changing the order of the arguments causes the second assert to fail.
// Removing both asserts results in i01_high becoming 0 which in turn results in garbage output.
// The root cause seems to be a problem with i0_offset_high becoming 0 when it should always be >0 (for single GPU).
GGML_ASSERT(i01_low == 0 || g_device_count > 1);
GGML_ASSERT(i01_high == rows_per_iter || g_device_count > 1);

const int64_t i01_diff = i01_high - i01_low;
if (i01_diff == 0) {
continue;
}
const int64_t i11 = i13*ne12 + i12;

// for split tensors the data begins at i0 == i0_offset_low
char * src0_ddq_i = src0_ddq[id] + (i0/i02_divisor - i0_offset_low)*src0_stride*src0_ts/src0_bs;
float * src0_ddf_i = src0_ddf[id] + (i0/i02_divisor - i0_offset_low)*src0_stride;
float * src1_ddf_i = src1_ddf[id] + i11*src1_stride;
float * dst_ddf_i = dst_ddf[id] + (i0 - i0_offset_low)*dst_stride;

// for split tensors the data pointer needs to be rounded down
// to the bin edge for i03, i02 bins beyond the first
if (i0 - i0_offset_low > 0) {
src0_ddq_i -= (row_low % ne01)*ne00 * src0_ts/src0_bs;
src0_ddf_i -= (row_low % ne01)*ne00;
dst_ddf_i -= (row_low % ne0)*ne1;
}

// the main device memory buffer can be on VRAM scratch, with space for all partial results
// in that case an offset on dst_ddf_i is needed
if (dst->backend == GGML_BACKEND_GPU && id == g_main_device) {
dst_ddf_i += i01_low; // offset is 0 if no tensor split
}

// copy src0, src1 to device if necessary
if (use_src1 && !src1_stays_on_host) {
if (src1->backend == GGML_BACKEND_CPU) {
int64_t nrows1 = ne11;
CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, nrows1, cudaStream_main));
} else if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) {
if (id != g_main_device) {
float * src1_ddf_i_source = (float *) src1_extra->data_device[g_main_device];
src1_ddf_i_source += i11*src1_stride;
CUDA_CHECK(cudaMemcpyAsync(src1_ddf_i, src1_ddf_i_source, src1_stride*sizeof(float),
cudaMemcpyDeviceToDevice, cudaStream_main));
}
} else if (src1_on_device && !src1_is_contiguous) {
GGML_ASSERT(!split);
CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, ne11, cudaStream_main));
} else {
GGML_ASSERT(false);
}
}

if ((!src0_on_device || !src0_is_contiguous) && i02 % i02_divisor == 0) {
if (src0_is_f32) {
CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf_i, src0, i03, i02/i02_divisor, i01_low, i01_high, cudaStream_main));
} else {
CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddq_i, src0, i03, i02/i02_divisor, i01_low, i01_high, cudaStream_main));
}
}

// convert src0 to f32 if it is necessary for the ggml_cuda_op
if (src0_needs_f32 && !src0_is_f32) {
to_fp32_cuda(src0_ddq_i, src0_ddf_i, i01_diff*ne00, cudaStream_main);
CUDA_CHECK(cudaGetLastError());
}

// do the computation
op(src0, src1, dst, src0_ddq_i, src0_ddf_i, src1_ddf_i, dst_ddf_i, i02, i01_low, i01_high, i11, cudaStream_main);
CUDA_CHECK(cudaGetLastError());

// copy dst to host or other device if necessary
if (!dst_on_device) {
void * dst_off_device;
cudaMemcpyKind kind;
if (dst->backend == GGML_BACKEND_CPU) {
dst_off_device = dst->data;
kind = cudaMemcpyDeviceToHost;
} else if (dst->backend == GGML_BACKEND_GPU) {
dst_off_device = dst_extra->data_device[g_main_device];
kind = cudaMemcpyDeviceToDevice;
} else {
GGML_ASSERT(false);
}
if (split) {
// src0 = weight matrix is saved as a transposed matrix for better memory layout.
// dst is NOT transposed.
// The outputs of matrix matrix multiplications can therefore NOT simply be concatenated for >1 GPU.
// Instead they need to be copied to the correct slice in ne0 = dst row index.
// If dst is a vector with ne0 == 1 then you don't have to do this but it still produces correct results.
float * dhf_dst_i = (float *) ((char *) dst_off_device + i01_low*sizeof(float) + i02*nb2 + i03*nb3);
CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float), dst_ddf_i, i01_diff*sizeof(float),
i01_diff*sizeof(float), ne1, kind, cudaStream_main));
} else {
float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_ddf_i, dst_stride*sizeof(float), kind, cudaStream_main));
}
}

// signify to main device that other device is done
if (split && g_device_count > 1 && id != g_main_device) {
CUDA_CHECK(cudaEventRecord(src0_extra->events[id], cudaStream_main));
}
}
}
}

// wait until each device is finished, then free their buffers
for (int id = 0; id < g_device_count; ++id) {
if (src0_asq[id] == 0 && src0_asf[id] == 0 && src1_asf[id] == 0 && dst_asf[id] == 0) {
continue;
}

CUDA_CHECK(cudaSetDevice(id));

if (src0_asq[id] > 0) {
ggml_cuda_pool_free(src0_ddq[id], src0_asq[id]);
}
if (src0_asf[id] > 0) {
ggml_cuda_pool_free(src0_ddf[id], src0_asf[id]);
}
if (src1_asf[id] > 0) {
ggml_cuda_pool_free(src1_ddf[id], src1_asf[id]);
}
if (dst_asf[id] > 0) {
ggml_cuda_pool_free(dst_ddf[id], dst_asf[id]);
}
}

// main device waits for all other devices to be finished
if (split && g_device_count > 1) {
CUDA_CHECK(cudaSetDevice(g_main_device));
for (int id = 0; id < g_device_count; ++id) {
if (id != g_main_device && src0_extra->events[id]) {
CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams_main[g_main_device], src0_extra->events[id]));
}
}
}

if (dst->backend == GGML_BACKEND_CPU) {
CUDA_CHECK(cudaSetDevice(g_main_device));
CUDA_CHECK(cudaDeviceSynchronize());
}
}

static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
ggml_cuda_op_t op, bool src0_needs_f32, bool flatten_rows) {
const int64_t ne00 = src0->ne[0];
Expand Down Expand Up @@ -6327,10 +6646,10 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
} else if (all_on_device && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && src1->ne[1] == 1) {
ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
}else if (src0->type == GGML_TYPE_F32) {
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true);
} else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_vec, false, false);
ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec, false);
} else {
int min_compute_capability = INT_MAX;
for (int id = 0; id < g_device_count; ++id) {
Expand All @@ -6341,9 +6660,9 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
}

if (g_mul_mat_q && ggml_is_quantized(src0->type) && min_compute_capability >= MIN_CC_DP4A) {
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_q, false, false);
ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, false);
} else {
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true);
}
}
} else {
Expand Down

0 comments on commit dbddcd1

Please sign in to comment.