-
Notifications
You must be signed in to change notification settings - Fork 12.1k
ggml-backend : fix async copy from CPU #8897
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2358,33 +2358,35 @@ GGML_CALL static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, | |
} | ||
|
||
GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) { | ||
GGML_ASSERT(ggml_backend_is_cuda(backend_src) || ggml_backend_is_cuda(backend_dst)); | ||
|
||
ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer; | ||
ggml_backend_buffer_t buf_dst = dst->view_src ? dst->view_src->buffer : dst->buffer; | ||
|
||
if (!ggml_backend_buffer_is_cuda(src->buffer)) { | ||
if (!ggml_backend_is_cuda(backend_src) || !ggml_backend_is_cuda(backend_dst)) { | ||
return false; | ||
} | ||
|
||
if (!ggml_backend_buffer_is_cuda(dst->buffer)) { | ||
if (!ggml_backend_buffer_is_cuda(src->buffer) || !ggml_backend_buffer_is_cuda(dst->buffer)) { | ||
return false; | ||
} | ||
|
||
// device -> device | ||
// device -> device copy | ||
ggml_backend_cuda_context * cuda_ctx_src = (ggml_backend_cuda_context *)backend_src->context; | ||
ggml_backend_cuda_context * cuda_ctx_dst = (ggml_backend_cuda_context *)backend_dst->context; | ||
|
||
if (backend_src != backend_dst) { | ||
ggml_backend_cuda_buffer_context * buf_ctx_src = (ggml_backend_cuda_buffer_context *)buf_src->context; | ||
ggml_backend_cuda_buffer_context * buf_ctx_dst = (ggml_backend_cuda_buffer_context *)buf_dst->context; | ||
ggml_backend_cuda_buffer_context * buf_ctx_src = (ggml_backend_cuda_buffer_context *)buf_src->context; | ||
ggml_backend_cuda_buffer_context * buf_ctx_dst = (ggml_backend_cuda_buffer_context *)buf_dst->context; | ||
|
||
GGML_ASSERT(cuda_ctx_src->device == buf_ctx_src->device); | ||
GGML_ASSERT(cuda_ctx_dst->device == buf_ctx_dst->device); | ||
if (cuda_ctx_src->device != buf_ctx_src->device || cuda_ctx_dst->device != buf_ctx_dst->device) { | ||
#ifndef NDEBUG | ||
GGML_CUDA_LOG_WARN("%s: backend and buffer devices do not match\n", __func__); | ||
#endif | ||
return false; | ||
} | ||
|
||
if (backend_src != backend_dst) { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. How is it ensured that there are no race conditions between There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. What race conditions are you thinking about? It uses an event to synchronize the two streams. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think I misinterpreted the code. If my understanding is correct the synchronization happens outside this function. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Part of the synchronization is done in this function, but the most complicated parts happen in ggml_backend_sched. Ultimately, the only responsibility of this function is to implement the semantics of the // asynchronous copy
// the copy is performed after all the currently queued operations in backend_src
// backend_dst will wait for the copy to complete before performing other operations
// automatic fallback to sync copy if async is not supported
GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst); |
||
// copy on src stream | ||
if (cuda_ctx_src->device == cuda_ctx_dst->device) { | ||
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_dst->stream())); | ||
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream())); | ||
} else { | ||
#ifdef GGML_CUDA_NO_PEER_COPY | ||
return false; | ||
|
@@ -2393,7 +2395,7 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_ | |
#endif | ||
} | ||
|
||
// record event on src stream | ||
// record event on src stream after the copy | ||
if (!cuda_ctx_src->copy_event) { | ||
ggml_cuda_set_device(cuda_ctx_src->device); | ||
CUDA_CHECK(cudaEventCreateWithFlags(&cuda_ctx_src->copy_event, cudaEventDisableTiming)); | ||
|
@@ -2405,7 +2407,7 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_ | |
CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx_dst->stream(), cuda_ctx_src->copy_event, 0)); | ||
} else { | ||
// src and dst are on the same backend | ||
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_dst->stream())); | ||
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream())); | ||
} | ||
return true; | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this synchronization call can be optimized out since with a null event the backend has already been synchronized. But if there is no measurable performance difference it may be better to just keep it in to make the code easier to understand.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, I left it there for clarity. For backends that don't support events,
ggml_backend_synchronize
should be a no-op anyway.