Skip to content

Commit

Permalink
cuda : fix disabling device with --tensor-split 1,0 (ggerganov#3951)
Browse files Browse the repository at this point in the history
Co-authored-by: slaren <slarengh@gmail.com>
  • Loading branch information
cebtenzzre and slaren authored Nov 5, 2023
1 parent 3d48f42 commit 132d25b
Showing 1 changed file with 13 additions and 3 deletions.
16 changes: 13 additions & 3 deletions ggml-cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -6893,6 +6893,8 @@ static void ggml_cuda_op_mul_mat(
int64_t row_low[GGML_CUDA_MAX_DEVICES];
int64_t row_high[GGML_CUDA_MAX_DEVICES];

int used_devices = 0;

for (int64_t id = 0; id < g_device_count; ++id) {
// by default, use all rows
row_low[id] = 0;
Expand Down Expand Up @@ -6920,6 +6922,8 @@ static void ggml_cuda_op_mul_mat(
continue;
}

used_devices++;

const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;

Expand Down Expand Up @@ -6958,12 +6962,12 @@ static void ggml_cuda_op_mul_mat(

// if multiple devices are used they need to wait for the main device
// here an event is recorded that signals that the main device has finished calculating the input data
if (split && g_device_count > 1) {
if (split && used_devices > 1) {
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device][0], g_cudaStreams[g_main_device][0]));
}

const int64_t src1_col_stride = split && g_device_count > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
const int64_t src1_col_stride = split && used_devices > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) {
const int64_t is = split ? (src1_col_0/src1_col_stride) % MAX_STREAMS : 0;
const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;
Expand Down Expand Up @@ -7079,6 +7083,9 @@ static void ggml_cuda_op_mul_mat(
}

for (int64_t id = 0; id < g_device_count; ++id) {
if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
continue;
}
CUDA_CHECK(ggml_cuda_set_device(id));

// free buffers again when done
Expand All @@ -7103,6 +7110,9 @@ static void ggml_cuda_op_mul_mat(

CUDA_CHECK(ggml_cuda_set_device(g_main_device));
for (int64_t id = 0; id < g_device_count; ++id) {
if (row_low[id] == row_high[id]) {
continue;
}
for (int64_t is = 0; is < is_max; ++is) {
CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is], 0));
}
Expand Down Expand Up @@ -7400,7 +7410,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const

static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
const bool all_on_device =
(src0->backend == GGML_BACKEND_GPU) &&
(src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
(src1->backend == GGML_BACKEND_GPU) &&
( dst->backend == GGML_BACKEND_GPU);

Expand Down

0 comments on commit 132d25b

Please sign in to comment.