Skip to content

Commit d30667b

Browse files
authored
bugfix: do not use non-blocking copy for gpu to cpu transfer (#564)
An asynchronous (non-blocking) GPU-to-CPU copy in the `plan` functions is only safe if we synchronize before using the CPU array. Since synchronization has been removed from the `plan` functions, the GPU-to-CPU copy itself must now be synchronous (blocking). For flashinfer v0.2, callers are encouraged to pass CPU indptr arrays to the `plan` functions, in which case the synchronous GPU-to-CPU copy is a no-op.
1 parent 4800368 commit d30667b

File tree

3 files changed

+6
-6
lines changed

3 files changed

+6
-6
lines changed

python/flashinfer/decode.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -719,7 +719,7 @@ def plan(
719719
)
720720
self._qo_indptr_buf = qo_indptr_host.to(self.device, non_blocking=True)
721721

722-
indptr_host = indptr.to("cpu", non_blocking=True)
722+
indptr_host = indptr.to("cpu")
723723
if data_type is not None:
724724
q_data_type = data_type
725725
kv_data_type = data_type

python/flashinfer/prefill.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1004,8 +1004,8 @@ def plan(
10041004
self._qk_indptr_buf = qk_indptr.to(self.device, non_blocking=True)
10051005

10061006
# NOTE(Zihao): only required if qo_indptr/paged_kv_indptr are device tensors
1007-
qo_indptr_host = qo_indptr.to("cpu", non_blocking=True)
1008-
paged_kv_indptr_host = paged_kv_indptr.to("cpu", non_blocking=True)
1007+
qo_indptr_host = qo_indptr.to("cpu")
1008+
paged_kv_indptr_host = paged_kv_indptr.to("cpu")
10091009

10101010
if packed_custom_mask is not None:
10111011
mask_mode = MaskMode.CUSTOM.value
@@ -1571,8 +1571,8 @@ def plan(
15711571
self._qk_indptr_buf = qk_indptr.to(self.device)
15721572

15731573
# NOTE(Zihao): only required if qo_indptr/paged_kv_indptr are device tensors
1574-
qo_indptr_host = qo_indptr.to("cpu", non_blocking=True)
1575-
kv_indptr_host = kv_indptr.to("cpu", non_blocking=True)
1574+
qo_indptr_host = qo_indptr.to("cpu")
1575+
kv_indptr_host = kv_indptr.to("cpu")
15761576

15771577
if packed_custom_mask is not None:
15781578
mask_mode = MaskMode.CUSTOM.value

python/flashinfer/sparse.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -298,7 +298,7 @@ def plan(
298298
self.R = R
299299
self.C = C
300300

301-
kv_indptr_host = indptr.to("cpu", non_blocking=True)
301+
kv_indptr_host = indptr.to("cpu")
302302

303303
# NOTE(Zihao): we haven't supported mask in cuda-core implementations but it should
304304
# be easy to add support for it if needed, leave it as a future work.

0 commit comments

Comments
 (0)